

Make --vad work with --backend openai-api
@5da3267add56cdd63aaef11cee53b508ec95a4be
--- whisper_online.py
+++ whisper_online.py
... | ... | @@ -162,7 +162,7 @@ |
162 | 162 |
|
163 | 163 |
self.load_model() |
164 | 164 |
|
165 |
- self.use_vad = False |
|
165 |
+ self.use_vad_opt = False |
|
166 | 166 |
|
167 | 167 |
# reset the task in set_translate_task |
168 | 168 |
self.task = "transcribe" |
... | ... | @@ -175,21 +175,27 @@ |
175 | 175 |
|
176 | 176 |
|
177 | 177 |
def ts_words(self, segments): |
178 |
- o = [] |
|
179 |
- # If VAD on, skip segments containing no speech. |
|
180 |
- # TODO: threshold can be set from outside |
|
181 |
- # TODO: Make VAD work again with word-level timestamps |
|
182 |
- #if self.use_vad and segment["no_speech_prob"] > 0.8: |
|
183 |
- # continue |
|
178 |
+ no_speech_segments = [] |
|
179 |
+ if self.use_vad_opt: |
|
180 |
+ for segment in segments.segments: |
|
181 |
+ # TODO: threshold can be set from outside |
|
182 |
+ if segment["no_speech_prob"] > 0.8: |
|
183 |
+ no_speech_segments.append((segment.get("start"), segment.get("end"))) |
|
184 | 184 |
|
185 |
- for word in segments: |
|
186 |
- o.append((word.get("start"), word.get("end"), word.get("word"))) |
|
185 |
+ o = [] |
|
186 |
+ for word in segments.words: |
|
187 |
+ start = word.get("start") |
|
188 |
+ end = word.get("end") |
|
189 |
+ if any(s[0] <= start <= s[1] for s in no_speech_segments): |
|
190 |
+ # print("Skipping word", word.get("word"), "because it's in a no-speech segment") |
|
191 |
+ continue |
|
192 |
+ o.append((start, end, word.get("word"))) |
|
187 | 193 |
|
188 | 194 |
return o |
189 | 195 |
|
190 | 196 |
|
191 | 197 |
def segments_end_ts(self, res): |
192 |
- return [s["end"] for s in res] |
|
198 |
+ return [s["end"] for s in res.words] |
|
193 | 199 |
|
194 | 200 |
def transcribe(self, audio_data, prompt=None, *args, **kwargs): |
195 | 201 |
# Write the audio data to a buffer |
... | ... | @@ -205,7 +211,7 @@ |
205 | 211 |
"file": buffer, |
206 | 212 |
"response_format": self.response_format, |
207 | 213 |
"temperature": self.temperature, |
208 |
- "timestamp_granularities": ["word"] |
|
214 |
+ "timestamp_granularities": ["word", "segment"] |
|
209 | 215 |
} |
210 | 216 |
if self.task != "translate" and self.language: |
211 | 217 |
params["language"] = self.language |
... | ... | @@ -221,10 +227,10 @@ |
221 | 227 |
transcript = proc.create(**params) |
222 | 228 |
print(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds",file=self.logfile) |
223 | 229 |
|
224 |
- return transcript.words |
|
230 |
+ return transcript |
|
225 | 231 |
|
226 | 232 |
def use_vad(self): |
227 |
- self.use_vad = True |
|
233 |
+ self.use_vad_opt = True |
|
228 | 234 |
|
229 | 235 |
def set_translate_task(self): |
230 | 236 |
self.task = "translate" |
... | ... | @@ -592,9 +598,9 @@ |
592 | 598 |
e = time.time() |
593 | 599 |
print(f"done. It took {round(e-t,2)} seconds.",file=logfile) |
594 | 600 |
|
595 |
- if args.vad: |
|
596 |
- print("setting VAD filter",file=logfile) |
|
597 |
- asr.use_vad() |
|
601 |
+ if args.vad: |
|
602 |
+ print("setting VAD filter",file=logfile) |
|
603 |
+ asr.use_vad() |
|
598 | 604 |
|
599 | 605 |
if args.task == "translate": |
600 | 606 |
asr.set_translate_task() |
Add a comment
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?