Commit @d6ec999ab8eb3a6ee9b94fc96577366d9f5acf7a - yjyoon/whisper_streaming

Dominik Macháček 2024-01-02

Add buffer trimming options and set the most recommended one as the default

Evaluated on the ESIC dev2 set (27 documents).

@d6ec999ab8eb3a6ee9b94fc96577366d9f5acf7a

c236a99

d6ec999

whisper_online.py

--- whisper_online.py

+++ whisper_online.py


 
     SAMPLING_RATE = 16000
 
-    def __init__(self, asr, tokenizer, logfile=sys.stderr):
+    def __init__(self, asr, tokenizer=None, logfile=sys.stderr, buffer_trimming=("segment", 15)):
         """asr: WhisperASR object
         tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer.
         logfile: where to store the log. 

         self.logfile = logfile
 
         self.init()
+
+        self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
 
     def init(self):
         """run this when starting or restarting processing"""

         print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
 
         # there is a newly confirmed text
-        if o:
-            # we trim all the completed sentences from the audio buffer
-            self.chunk_completed_sentence()
 
-            # ...segments could be considered
-            #self.chunk_completed_segment(res)
+        if o and self.buffer_trimming_way == "sentence":  # trim the completed sentences
+            if len(self.audio_buffer)/self.SAMPLING_RATE > self.buffer_trimming_sec:  # longer than this
+                self.chunk_completed_sentence()
 
-            # 
-#            self.silence_iters = 0
-
-         # this was an attempt to trim silence/non-linguistic noise detected by the fact that Whisper doesn't transcribe anything for 3-times in a row.
-         # It seemed not working better, or needs to be debugged.
-
-#        elif self.transcript_buffer.complete():
-#            self.silence_iters = 0
-#        elif not self.transcript_buffer.complete():
-#        #    print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
-#            self.silence_iters += 1
-#            if self.silence_iters >= 3:
-#                n = self.last_chunked_at
-##                self.chunk_completed_sentence()
-##                if n == self.last_chunked_at:
-#                self.chunk_at(self.last_chunked_at+self.chunk)
-#                print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=self.logfile)
-##                self.silence_iters = 0
-
-
-        # if the audio buffer is longer than 30s, trim it...
-        if len(self.audio_buffer)/self.SAMPLING_RATE > 30:
-            # ...on the last completed segment (labeled by Whisper)
+        
+        if self.buffer_trimming_way == "segment":
+            s = self.buffer_trimming_sec  # trim the completed segments longer than s,
+        else:
+            s = 30 # if the audio buffer is longer than 30s, trim it
+        
+        if len(self.audio_buffer)/self.SAMPLING_RATE > s:
             self.chunk_completed_segment(res)
 
             # alternative: on any word

             #while k>0 and self.commited[k][1] > l:
             #    k -= 1
             #t = self.commited[k][1] 
-            print(f"chunking because of len",file=self.logfile)
+            print(f"chunking segment",file=self.logfile)
             #self.chunk_at(t)
 
         print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.logfile)

     parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
     parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
     parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
+    parser.add_argument('--buffer_trimming', type=str, default="sentence", choices=["sentence", "segment"],help='Buffer trimming strategy')
+    parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming lenght threshold in seconds. If buffer length longer, trimming sentence/segment is triggered.')
     args = parser.parse_args()
 
     # reset to store stderr to different file stream, e.g. open(os.devnull,"w")

 
     
     min_chunk = args.min_chunk_size
-    online = OnlineASRProcessor(asr,create_tokenizer(tgt_language),logfile=logfile)
+    online = OnlineASRProcessor(asr,create_tokenizer(tgt_language),logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 
     # load the audio into the LRU cache before we start the timer

Add a comment

Open 0
Closed 0

List

...	...	@@ -212,7 +212,7 @@
212	212
213	213	SAMPLING_RATE = 16000
214	214
215		- def __init__(self, asr, tokenizer, logfile=sys.stderr):
	215	+ def __init__(self, asr, tokenizer=None, logfile=sys.stderr, buffer_trimming=("segment", 15)):
216	216	"""asr: WhisperASR object
217	217	tokenizer: sentence tokenizer object for the target language. Must have a method split that behaves like the one of MosesTokenizer.
218	218	logfile: where to store the log.
...	...	@@ -222,6 +222,8 @@
222	222	self.logfile = logfile
223	223
224	224	self.init()
	225	+
	226	+ self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
225	227
226	228	def init(self):
227	229	"""run this when starting or restarting processing"""
...	...	@@ -278,36 +280,18 @@
278	280	print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
279	281
280	282	# there is a newly confirmed text
281		- if o:
282		- # we trim all the completed sentences from the audio buffer
283		- self.chunk_completed_sentence()
284	283
285		- # ...segments could be considered
286		- #self.chunk_completed_segment(res)
	284	+ if o and self.buffer_trimming_way == "sentence": # trim the completed sentences
	285	+ if len(self.audio_buffer)/self.SAMPLING_RATE > self.buffer_trimming_sec: # longer than this
	286	+ self.chunk_completed_sentence()
287	287
288		- #
289		-# self.silence_iters = 0
290		-
291		- # this was an attempt to trim silence/non-linguistic noise detected by the fact that Whisper doesn't transcribe anything for 3-times in a row.
292		- # It seemed not working better, or needs to be debugged.
293		-
294		-# elif self.transcript_buffer.complete():
295		-# self.silence_iters = 0
296		-# elif not self.transcript_buffer.complete():
297		-# # print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
298		-# self.silence_iters += 1
299		-# if self.silence_iters >= 3:
300		-# n = self.last_chunked_at
301		-## self.chunk_completed_sentence()
302		-## if n == self.last_chunked_at:
303		-# self.chunk_at(self.last_chunked_at+self.chunk)
304		-# print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=self.logfile)
305		-## self.silence_iters = 0
306		-
307		-
308		- # if the audio buffer is longer than 30s, trim it...
309		- if len(self.audio_buffer)/self.SAMPLING_RATE > 30:
310		- # ...on the last completed segment (labeled by Whisper)
	288	+
	289	+ if self.buffer_trimming_way == "segment":
	290	+ s = self.buffer_trimming_sec # trim the completed segments longer than s,
	291	+ else:
	292	+ s = 30 # if the audio buffer is longer than 30s, trim it
	293	+
	294	+ if len(self.audio_buffer)/self.SAMPLING_RATE > s:
311	295	self.chunk_completed_segment(res)
312	296
313	297	# alternative: on any word
...	...	@@ -317,7 +301,7 @@
317	301	#while k>0 and self.commited[k][1] > l:
318	302	# k -= 1
319	303	#t = self.commited[k][1]
320		- print(f"chunking because of len",file=self.logfile)
	304	+ print(f"chunking segment",file=self.logfile)
321	305	#self.chunk_at(t)
322	306
323	307	print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.logfile)
...	...	@@ -477,6 +461,8 @@
477	461	parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
478	462	parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
479	463	parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
	464	+ parser.add_argument('--buffer_trimming', type=str, default="sentence", choices=["sentence", "segment"],help='Buffer trimming strategy')
	465	+ parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming lenght threshold in seconds. If buffer length longer, trimming sentence/segment is triggered.')
480	466	args = parser.parse_args()
481	467
482	468	# reset to store stderr to different file stream, e.g. open(os.devnull,"w")
...	...	@@ -521,7 +507,7 @@
521	507
522	508
523	509	min_chunk = args.min_chunk_size
524		- online = OnlineASRProcessor(asr,create_tokenizer(tgt_language),logfile=logfile)
	510	+ online = OnlineASRProcessor(asr,create_tokenizer(tgt_language),logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
525	511
526	512
527	513	# load the audio into the LRU cache before we start the timer

Delete comment