

Use OpenAI API word-level timestamps
@f3c76f7551d796c194c18656a2a62b33125645d5
--- whisper_online.py
+++ whisper_online.py
... | ... | @@ -176,30 +176,14 @@ |
176 | 176 |
|
177 | 177 |
def ts_words(self, segments): |
178 | 178 |
o = [] |
179 |
- for segment in segments: |
|
180 |
- # If VAD on, skip segments containing no speech. |
|
181 |
- # TODO: threshold can be set from outside |
|
182 |
- if self.use_vad and segment["no_speech_prob"] > 0.8: |
|
183 |
- continue |
|
179 |
+ # If VAD on, skip segments containing no speech. |
|
180 |
+ # TODO: threshold can be set from outside |
|
181 |
+ # TODO: Make VAD work again with word-level timestamps |
|
182 |
+ #if self.use_vad and segment["no_speech_prob"] > 0.8: |
|
183 |
+ # continue |
|
184 | 184 |
|
185 |
- # Splitting the text into words and filtering out empty strings |
|
186 |
- words = [word.strip() for word in segment["text"].split() if word.strip()] |
|
187 |
- |
|
188 |
- if not words: |
|
189 |
- continue |
|
190 |
- |
|
191 |
- # Assign start and end times for each word |
|
192 |
- # We only have timestamps per segment, so interpolating start and end-times |
|
193 |
- |
|
194 |
- |
|
195 |
- segment_duration = segment["end"] - segment["start"] |
|
196 |
- total_characters = sum(len(word) for word in words) |
|
197 |
- duration_per_character = segment_duration / total_characters |
|
198 |
- start_time = segment["start"] |
|
199 |
- for word in words: |
|
200 |
- end_time = start_time + duration_per_character * len(word) |
|
201 |
- o.append((start_time, end_time, word)) |
|
202 |
- start_time = end_time |
|
185 |
+ for word in segments: |
|
186 |
+ o.append((word.get("start"), word.get("end"), word.get("word"))) |
|
203 | 187 |
|
204 | 188 |
return o |
205 | 189 |
|
... | ... | @@ -220,7 +204,8 @@ |
220 | 204 |
"model": self.modelname, |
221 | 205 |
"file": buffer, |
222 | 206 |
"response_format": self.response_format, |
223 |
- "temperature": self.temperature |
|
207 |
+ "temperature": self.temperature, |
|
208 |
+ "timestamp_granularities": ["word"] |
|
224 | 209 |
} |
225 | 210 |
if self.task != "translate" and self.language: |
226 | 211 |
params["language"] = self.language |
... | ... | @@ -233,11 +218,10 @@ |
233 | 218 |
proc = self.client.audio.transcriptions |
234 | 219 |
|
235 | 220 |
# Process transcription/translation |
236 |
- |
|
237 | 221 |
transcript = proc.create(**params) |
238 | 222 |
print(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds",file=self.logfile) |
239 | 223 |
|
240 |
- return transcript.segments |
|
224 |
+ return transcript.words |
|
241 | 225 |
|
242 | 226 |
def use_vad(self): |
243 | 227 |
self.use_vad = True |
Add a comment
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?