

missing features in openai-api, PR #52
@997a653f425fae2f4ada5664fffbd61376ae386f
--- whisper_online.py
+++ whisper_online.py
... | ... | @@ -6,8 +6,7 @@ |
6 | 6 |
import time |
7 | 7 |
import io |
8 | 8 |
import soundfile as sf |
9 |
- |
|
10 |
- |
|
9 |
+import math |
|
11 | 10 |
|
12 | 11 |
@lru_cache |
13 | 12 |
def load_audio(fname): |
... | ... | @@ -153,24 +152,34 @@ |
153 | 152 |
class OpenaiApiASR(ASRBase): |
154 | 153 |
"""Uses OpenAI's Whisper API for audio transcription.""" |
155 | 154 |
|
156 |
- def __init__(self, modelsize=None, lan=None, cache_dir=None, model_dir=None, response_format="verbose_json", temperature=0): |
|
157 |
- self.modelname = "whisper-1" # modelsize is not used but kept for interface consistency |
|
155 |
+ def __init__(self, lan=None, response_format="verbose_json", temperature=0, logfile=sys.stderr): |
|
156 |
+ self.logfile = logfile |
|
157 |
+ |
|
158 |
+ self.modelname = "whisper-1" |
|
158 | 159 |
self.language = lan # ISO-639-1 language code |
159 | 160 |
self.response_format = response_format |
160 | 161 |
self.temperature = temperature |
161 |
- self.model = self.load_model(modelsize, cache_dir, model_dir) |
|
162 |
+ |
|
163 |
+ self.load_model() |
|
164 |
+ |
|
165 |
+ self.use_vad = False |
|
166 |
+ |
|
167 |
+ # default task is "transcribe"; set_translate_task() switches it to "translate" |
|
168 |
+ self.task = "transcribe" |
|
162 | 169 |
|
163 | 170 |
def load_model(self, *args, **kwargs): |
164 | 171 |
from openai import OpenAI |
165 | 172 |
self.client = OpenAI() |
166 |
- # Since we're using the OpenAI API, there's no model to load locally. |
|
167 |
- print("Model configuration is set to use the OpenAI Whisper API.") |
|
173 |
+ |
|
174 |
+ self.transcribed_seconds = 0 # for logging how many seconds were processed by API, to know the cost |
|
175 |
+ |
|
168 | 176 |
|
169 | 177 |
def ts_words(self, segments): |
170 | 178 |
o = [] |
171 | 179 |
for segment in segments: |
172 |
- # Skip segments containing no speech |
|
173 |
- if segment["no_speech_prob"] > 0.8: |
|
180 |
+ # If VAD on, skip segments containing no speech. |
|
181 |
+ # TODO: make the no-speech threshold configurable from outside |
|
182 |
+ if self.use_vad and segment["no_speech_prob"] > 0.8: |
|
174 | 183 |
continue |
175 | 184 |
|
176 | 185 |
# Splitting the text into words and filtering out empty strings |
... | ... | @@ -203,22 +212,38 @@ |
203 | 212 |
sf.write(buffer, audio_data, samplerate=16000, format='WAV', subtype='PCM_16') |
204 | 213 |
buffer.seek(0) # Reset buffer's position to the beginning |
205 | 214 |
|
206 |
- # Prepare transcription parameters |
|
207 |
- transcription_params = { |
|
215 |
+ self.transcribed_seconds += math.ceil(len(audio_data)/16000) # it rounds up to the whole seconds |
|
216 |
+ |
|
217 |
+ params = { |
|
208 | 218 |
"model": self.modelname, |
209 | 219 |
"file": buffer, |
210 | 220 |
"response_format": self.response_format, |
211 | 221 |
"temperature": self.temperature |
212 | 222 |
} |
213 |
- if self.language: |
|
223 |
+ if self.task != "translate" and self.language: |
|
214 | 224 |
transcription_params["language"] = self.language |
215 | 225 |
if prompt: |
216 | 226 |
transcription_params["prompt"] = prompt |
217 | 227 |
|
218 |
- # Perform the transcription |
|
219 |
- transcript = self.client.audio.transcriptions.create(**transcription_params) |
|
228 |
+ if self.task == "translate": |
|
229 |
+ proc = self.client.audio.translations |
|
230 |
+ else: |
|
231 |
+ proc = self.client.audio.transcriptions |
|
232 |
+ |
|
233 |
+ # Process transcription/translation |
|
234 |
+ |
|
235 |
+ transcript = proc.create(**params) |
|
236 |
+ print(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds ",file=self.logfile) |
|
220 | 237 |
|
221 | 238 |
return transcript.segments |
239 |
+ |
|
240 |
+ def use_vad(self): |
|
241 |
+ self.use_vad = True |
|
242 |
+ |
|
243 |
+ def set_translate_task(self): |
|
244 |
+ self.task = "translate" |
|
245 |
+ |
|
246 |
+ |
|
222 | 247 |
|
223 | 248 |
|
224 | 249 |
class HypothesisBuffer: |
... | ... | @@ -563,34 +588,33 @@ |
563 | 588 |
duration = len(load_audio(audio_path))/SAMPLING_RATE |
564 | 589 |
print("Audio duration is: %2.2f seconds" % duration, file=logfile) |
565 | 590 |
|
566 |
- size = args.model |
|
567 | 591 |
language = args.lan |
568 | 592 |
|
569 |
- t = time.time() |
|
570 |
- print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True) |
|
571 |
- |
|
572 |
- if args.backend == "faster-whisper": |
|
573 |
- asr_cls = FasterWhisperASR |
|
574 |
- elif args.backend == "openai-api": |
|
575 |
- asr_cls = OpenaiApiASR |
|
593 |
+ if args.backend == "openai-api": |
|
594 |
+ print("Using OpenAI API.",file=logfile) |
|
595 |
+ asr = OpenaiApiASR(lan=language) |
|
576 | 596 |
else: |
577 |
- asr_cls = WhisperTimestampedASR |
|
597 |
+ if args.backend == "faster-whisper": |
|
598 |
+ asr_cls = FasterWhisperASR |
|
599 |
+ else: |
|
600 |
+ asr_cls = WhisperTimestampedASR |
|
578 | 601 |
|
579 |
- asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir) |
|
602 |
+ size = args.model |
|
603 |
+ t = time.time() |
|
604 |
+ print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True) |
|
605 |
+ asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir) |
|
606 |
+ e = time.time() |
|
607 |
+ print(f"done. It took {round(e-t,2)} seconds.",file=logfile) |
|
608 |
+ |
|
609 |
+ if args.vad: |
|
610 |
+ print("setting VAD filter",file=logfile) |
|
611 |
+ asr.use_vad() |
|
580 | 612 |
|
581 | 613 |
if args.task == "translate": |
582 | 614 |
asr.set_translate_task() |
583 | 615 |
tgt_language = "en" # Whisper translates into English |
584 | 616 |
else: |
585 | 617 |
tgt_language = language # Whisper transcribes in this language |
586 |
- |
|
587 |
- |
|
588 |
- e = time.time() |
|
589 |
- print(f"done. It took {round(e-t,2)} seconds.",file=logfile) |
|
590 |
- |
|
591 |
- if args.vad: |
|
592 |
- print("setting VAD filter",file=logfile) |
|
593 |
- asr.use_vad() |
|
594 | 618 |
|
595 | 619 |
|
596 | 620 |
min_chunk = args.min_chunk_size |
Add a comment
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?