Commit @f3c76f7551d796c194c18656a2a62b33125645d5 - yjyoon/whisper_streaming

Tijs Zwinkels 2024-02-10

Use OpenAI API word-level timestamps

@f3c76f7551d796c194c18656a2a62b33125645d5

46598ad

f3c76f7

whisper_online.py

--- whisper_online.py

+++ whisper_online.py


 
     def ts_words(self, segments):
         o = []
-        for segment in segments:
-            # If VAD on, skip segments containing no speech. 
-            # TODO: threshold can be set from outside
-            if self.use_vad and segment["no_speech_prob"] > 0.8:
-                continue
+        # If VAD on, skip segments containing no speech. 
+        # TODO: threshold can be set from outside
+        # TODO: Make VAD work again with word-level timestamps
+        #if self.use_vad and segment["no_speech_prob"] > 0.8:
+        #    continue
 
-            # Splitting the text into words and filtering out empty strings
-            words = [word.strip() for word in segment["text"].split() if word.strip()]
-
-            if not words:
-                continue
-
-            # Assign start and end times for each word
-            # We only have timestamps per segment, so interpolating start and end-times
-
-            
-            segment_duration = segment["end"] - segment["start"]
-            total_characters = sum(len(word) for word in words)
-            duration_per_character = segment_duration / total_characters
-            start_time = segment["start"]
-            for word in words:
-                end_time = start_time + duration_per_character * len(word)
-                o.append((start_time, end_time, word))
-                start_time = end_time
+        for word in segments:
+            o.append((word.get("start"), word.get("end"), word.get("word")))
 
         return o
 

             "model": self.modelname,
             "file": buffer,
             "response_format": self.response_format,
-            "temperature": self.temperature
+            "temperature": self.temperature,
+            "timestamp_granularities": ["word"]
         }
         if self.task != "translate" and self.language:
             params["language"] = self.language

             proc = self.client.audio.transcriptions
 
         # Process transcription/translation
-
         transcript = proc.create(**params)
         print(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds",file=self.logfile)
 
-        return transcript.segments
+        return transcript.words
 
     def use_vad(self):
         self.use_vad = True

Add a comment

Open 0
Closed 0

List

...	...	@@ -176,30 +176,14 @@
176	176
177	177	def ts_words(self, segments):
178	178	o = []
179		- for segment in segments:
180		- # If VAD on, skip segments containing no speech.
181		- # TODO: threshold can be set from outside
182		- if self.use_vad and segment["no_speech_prob"] > 0.8:
183		- continue
	179	+ # If VAD on, skip segments containing no speech.
	180	+ # TODO: threshold can be set from outside
	181	+ # TODO: Make VAD work again with word-level timestamps
	182	+ #if self.use_vad and segment["no_speech_prob"] > 0.8:
	183	+ # continue
184	184
185		- # Splitting the text into words and filtering out empty strings
186		- words = [word.strip() for word in segment["text"].split() if word.strip()]
187		-
188		- if not words:
189		- continue
190		-
191		- # Assign start and end times for each word
192		- # We only have timestamps per segment, so interpolating start and end-times
193		-
194		-
195		- segment_duration = segment["end"] - segment["start"]
196		- total_characters = sum(len(word) for word in words)
197		- duration_per_character = segment_duration / total_characters
198		- start_time = segment["start"]
199		- for word in words:
200		- end_time = start_time + duration_per_character * len(word)
201		- o.append((start_time, end_time, word))
202		- start_time = end_time
	185	+ for word in segments:
	186	+ o.append((word.get("start"), word.get("end"), word.get("word")))
203	187
204	188	return o
205	189
...	...	@@ -220,7 +204,8 @@
220	204	"model": self.modelname,
221	205	"file": buffer,
222	206	"response_format": self.response_format,
223		- "temperature": self.temperature
	207	+ "temperature": self.temperature,
	208	+ "timestamp_granularities": ["word"]
224	209	}
225	210	if self.task != "translate" and self.language:
226	211	params["language"] = self.language
...	...	@@ -233,11 +218,10 @@
233	218	proc = self.client.audio.transcriptions
234	219
235	220	# Process transcription/translation
236		-
237	221	transcript = proc.create(**params)
238	222	print(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds",file=self.logfile)
239	223
240		- return transcript.segments
	224	+ return transcript.words
241	225
242	226	def use_vad(self):
243	227	self.use_vad = True

Delete comment