

Merge branch 'opeanai-api2' into opeanai-api
@e11a5bae1e44ea0171f3e7a0a0c55f9199076166
--- README.md
+++ README.md
... | ... | @@ -91,7 +91,7 @@ |
91 | 91 |
--model_dir MODEL_DIR |
92 | 92 |
Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter. |
93 | 93 |
--lan LAN, --language LAN |
94 |
- Language code for transcription, e.g. en,de,cs. |
|
94 |
+ Source language code, e.g. en,de,cs, or 'auto' for language detection. |
|
95 | 95 |
--task {transcribe,translate} |
96 | 96 |
Transcribe or translate. |
97 | 97 |
--backend {faster-whisper,whisper_timestamped,openai-api} |
--- whisper_online.py
+++ whisper_online.py
... | ... | @@ -31,7 +31,10 @@ |
31 | 31 |
self.logfile = logfile |
32 | 32 |
|
33 | 33 |
self.transcribe_kargs = {} |
34 |
- self.original_language = lan |
|
34 |
+ if lan == "auto": |
|
35 |
+ self.original_language = None |
|
36 |
+ else: |
|
37 |
+ self.original_language = lan |
|
35 | 38 |
|
36 | 39 |
self.model = self.load_model(modelsize, cache_dir, model_dir) |
37 | 40 |
|
... | ... | @@ -119,8 +122,11 @@ |
119 | 122 |
return model |
120 | 123 |
|
121 | 124 |
def transcribe(self, audio, init_prompt=""): |
125 |
+ |
|
122 | 126 |
# tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01) |
123 | 127 |
segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs) |
128 |
+ #print(info) # info contains language detection result |
|
129 |
+ |
|
124 | 130 |
return list(segments) |
125 | 131 |
|
126 | 132 |
def ts_words(self, segments): |
... | ... | @@ -146,17 +152,17 @@ |
146 | 152 |
class OpenaiApiASR(ASRBase): |
147 | 153 |
"""Uses OpenAI's Whisper API for audio transcription.""" |
148 | 154 |
|
149 |
- def __init__(self, lan=None, response_format="verbose_json", temperature=0, logfile=sys.stderr): |
|
155 |
+ def __init__(self, lan=None, temperature=0, logfile=sys.stderr): |
|
150 | 156 |
self.logfile = logfile |
151 | 157 |
|
152 | 158 |
self.modelname = "whisper-1" |
153 |
- self.language = lan # ISO-639-1 language code |
|
154 |
- self.response_format = response_format |
|
159 |
+ self.original_language = None if lan == "auto" else lan # ISO-639-1 language code |
|
160 |
+ self.response_format = "verbose_json" |
|
155 | 161 |
self.temperature = temperature |
156 | 162 |
|
157 | 163 |
self.load_model() |
158 | 164 |
|
159 |
- self.use_vad = False |
|
165 |
+ self.use_vad_opt = False |
|
160 | 166 |
|
161 | 167 |
# reset the task in set_translate_task |
162 | 168 |
self.task = "transcribe" |
... | ... | @@ -169,35 +175,26 @@ |
169 | 175 |
|
170 | 176 |
|
171 | 177 |
def ts_words(self, segments): |
178 |
+ no_speech_segments = [] |
|
179 |
+ if self.use_vad_opt: |
|
180 |
+ for segment in segments.segments: |
|
181 |
+ # TODO: threshold can be set from outside |
|
182 |
+ if segment["no_speech_prob"] > 0.8: |
|
183 |
+ no_speech_segments.append((segment.get("start"), segment.get("end"))) |
|
184 |
+ |
|
172 | 185 |
o = [] |
173 |
- for segment in segments: |
|
174 |
- # If VAD on, skip segments containing no speech. |
|
175 |
- # TODO: threshold can be set from outside |
|
176 |
- if self.use_vad and segment["no_speech_prob"] > 0.8: |
|
186 |
+ for word in segments.words: |
|
187 |
+ start = word.get("start") |
|
188 |
+ end = word.get("end") |
|
189 |
+ if any(s[0] <= start <= s[1] for s in no_speech_segments): |
|
190 |
+ # print("Skipping word", word.get("word"), "because it's in a no-speech segment") |
|
177 | 191 |
continue |
178 |
- |
|
179 |
- # Splitting the text into words and filtering out empty strings |
|
180 |
- words = [word.strip() for word in segment["text"].split() if word.strip()] |
|
181 |
- |
|
182 |
- if not words: |
|
183 |
- continue |
|
184 |
- |
|
185 |
- # Assign start and end times for each word |
|
186 |
- # We only have timestamps per segment, so interpolating start and end-times |
|
187 |
- # assuming equal duration per word |
|
188 |
- segment_duration = segment["end"] - segment["start"] |
|
189 |
- duration_per_word = segment_duration / len(words) |
|
190 |
- start_time = segment["start"] |
|
191 |
- for word in words: |
|
192 |
- end_time = start_time + duration_per_word |
|
193 |
- o.append((start_time, end_time, word)) |
|
194 |
- start_time = end_time |
|
195 |
- |
|
192 |
+ o.append((start, end, word.get("word"))) |
|
196 | 193 |
return o |
197 | 194 |
|
198 | 195 |
|
199 | 196 |
def segments_end_ts(self, res): |
200 |
- return [s["end"] for s in res] |
|
197 |
+ return [s["end"] for s in res.words] |
|
201 | 198 |
|
202 | 199 |
def transcribe(self, audio_data, prompt=None, *args, **kwargs): |
203 | 200 |
# Write the audio data to a buffer |
... | ... | @@ -212,10 +209,11 @@ |
212 | 209 |
"model": self.modelname, |
213 | 210 |
"file": buffer, |
214 | 211 |
"response_format": self.response_format, |
215 |
- "temperature": self.temperature |
|
212 |
+ "temperature": self.temperature, |
|
213 |
+ "timestamp_granularities": ["word", "segment"] |
|
216 | 214 |
} |
217 |
- if self.task != "translate" and self.language: |
|
218 |
- params["language"] = self.language |
|
215 |
+ if self.task != "translate" and self.original_language: |
|
216 |
+ params["language"] = self.original_language |
|
219 | 217 |
if prompt: |
220 | 218 |
params["prompt"] = prompt |
221 | 219 |
|
... | ... | @@ -225,14 +223,13 @@ |
225 | 223 |
proc = self.client.audio.transcriptions |
226 | 224 |
|
227 | 225 |
# Process transcription/translation |
228 |
- |
|
229 | 226 |
transcript = proc.create(**params) |
230 | 227 |
print(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds",file=self.logfile) |
231 | 228 |
|
232 |
- return transcript.segments |
|
229 |
+ return transcript |
|
233 | 230 |
|
234 | 231 |
def use_vad(self): |
235 |
- self.use_vad = True |
|
232 |
+ self.use_vad_opt = True |
|
236 | 233 |
|
237 | 234 |
def set_translate_task(self): |
238 | 235 |
self.task = "translate" |
... | ... | @@ -548,7 +545,7 @@ |
548 | 545 |
parser.add_argument('--model', type=str, default='large-v2', choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large".split(","),help="Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.") |
549 | 546 |
parser.add_argument('--model_cache_dir', type=str, default=None, help="Overriding the default model cache dir where models downloaded from the hub are saved") |
550 | 547 |
parser.add_argument('--model_dir', type=str, default=None, help="Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.") |
551 |
- parser.add_argument('--lan', '--language', type=str, default='en', help="Language code for transcription, e.g. en,de,cs.") |
|
548 |
+ parser.add_argument('--lan', '--language', type=str, default='auto', help="Source language code, e.g. en,de,cs, or 'auto' for language detection.") |
|
552 | 549 |
parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.") |
553 | 550 |
parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "openai-api"],help='Load only this backend for Whisper processing.') |
554 | 551 |
parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.') |
... | ... | @@ -600,9 +597,9 @@ |
600 | 597 |
e = time.time() |
601 | 598 |
print(f"done. It took {round(e-t,2)} seconds.",file=logfile) |
602 | 599 |
|
603 |
- if args.vad: |
|
604 |
- print("setting VAD filter",file=logfile) |
|
605 |
- asr.use_vad() |
|
600 |
+ if args.vad: |
|
601 |
+ print("setting VAD filter",file=logfile) |
|
602 |
+ asr.use_vad() |
|
606 | 603 |
|
607 | 604 |
if args.task == "translate": |
608 | 605 |
asr.set_translate_task() |
Add a comment
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?