

buffer trimming options + recommended default
evaluated on ESIC dev2, 27 docs
@d6ec999ab8eb3a6ee9b94fc96577366d9f5acf7a
--- whisper_online.py
+++ whisper_online.py
@@ -212,7 +212,7 @@
 
     SAMPLING_RATE = 16000
 
-    def __init__(self, asr, tokenizer, logfile=sys.stderr):
+    def __init__(self, asr, tokenizer=None, logfile=sys.stderr, buffer_trimming=("segment", 15)):
         """asr: WhisperASR object
         tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer.
         logfile: where to store the log.
@@ -222,6 +222,8 @@
         self.logfile = logfile
 
         self.init()
+
+        self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
 
     def init(self):
         """run this when starting or restarting processing"""
@@ -278,36 +280,18 @@
         print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
 
         # there is a newly confirmed text
-        if o:
-            # we trim all the completed sentences from the audio buffer
-            self.chunk_completed_sentence()
 
-            # ...segments could be considered
-            #self.chunk_completed_segment(res)
+        if o and self.buffer_trimming_way == "sentence":  # trim the completed sentences
+            if len(self.audio_buffer)/self.SAMPLING_RATE > self.buffer_trimming_sec:  # longer than this
+                self.chunk_completed_sentence()
 
-        #
-#        self.silence_iters = 0
-
-        # this was an attempt to trim silence/non-linguistic noise detected by the fact that Whisper doesn't transcribe anything for 3-times in a row.
-        # It seemed not working better, or needs to be debugged.
-
-#        elif self.transcript_buffer.complete():
-#            self.silence_iters = 0
-#        elif not self.transcript_buffer.complete():
-#            # print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
-#            self.silence_iters += 1
-#            if self.silence_iters >= 3:
-#                n = self.last_chunked_at
-##                self.chunk_completed_sentence()
-##                if n == self.last_chunked_at:
-#                self.chunk_at(self.last_chunked_at+self.chunk)
-#                print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=self.logfile)
-##                self.silence_iters = 0
-
-
-        # if the audio buffer is longer than 30s, trim it...
-        if len(self.audio_buffer)/self.SAMPLING_RATE > 30:
-            # ...on the last completed segment (labeled by Whisper)
+
+        if self.buffer_trimming_way == "segment":
+            s = self.buffer_trimming_sec  # trim the completed segments longer than s,
+        else:
+            s = 30  # if the audio buffer is longer than 30s, trim it
+
+        if len(self.audio_buffer)/self.SAMPLING_RATE > s:
             self.chunk_completed_segment(res)
 
         # alternative: on any word
@@ -317,7 +301,7 @@
         #while k>0 and self.commited[k][1] > l:
         #    k -= 1
         #t = self.commited[k][1]
-        print(f"chunking because of len",file=self.logfile)
+        print(f"chunking segment",file=self.logfile)
         #self.chunk_at(t)
 
         print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.logfile)
@@ -477,6 +461,8 @@
 parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
 parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
 parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
+parser.add_argument('--buffer_trimming', type=str, default="sentence", choices=["sentence", "segment"], help='Buffer trimming strategy')
+parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If the buffer is longer, sentence/segment trimming is triggered.')
 args = parser.parse_args()
 
 # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
@@ -521,7 +507,7 @@
 
 
 min_chunk = args.min_chunk_size
-online = OnlineASRProcessor(asr,create_tokenizer(tgt_language),logfile=logfile)
+online = OnlineASRProcessor(asr,create_tokenizer(tgt_language),logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 
 # load the audio into the LRU cache before we start the timer
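
A minimal usage sketch of the new option, for orientation only: the names asr, create_tokenizer and tgt_language are taken from the call site shown in the diff above, while their setup is assumed to follow the rest of whisper_online.py and is not part of this commit.

    import sys

    # asr and tgt_language are assumed to be prepared as in the script's startup code.
    online = OnlineASRProcessor(
        asr,                              # loaded Whisper backend object
        create_tokenizer(tgt_language),   # sentence tokenizer; presumably only needed for the "sentence" strategy
        logfile=sys.stderr,
        # (strategy, threshold in seconds): trimming is triggered only once the audio
        # buffer grows past the threshold. The CLI maps --buffer_trimming and
        # --buffer_trimming_sec to this tuple; the CLI default is ("sentence", 15),
        # while the constructor itself defaults to ("segment", 15).
        buffer_trimming=("sentence", 15),
    )

Since tokenizer now defaults to None, the "segment" strategy can presumably run without a sentence tokenizer; the "sentence" strategy still needs one, because chunk_completed_sentence() splits the committed text into sentences before trimming.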