Alex Young 2024-04-15
Further tidying of print output, so by default there's little on the console
@32191b5c6c670873b93b2deced3d7d2390b45139
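By default the standard-library root logger only emits WARNING and above to stderr, so the logging.debug()/logging.info() calls introduced in this diff stay quiet unless the caller configures logging. A minimal sketch of how a caller could opt back into the verbose output (the basicConfig placement and format string here are illustrative, not part of this commit; the faster_whisper line mirrors what the whisper_online_server.py hunk below adds):

    import logging

    # Show everything the new logging.debug()/logging.info() calls emit;
    # use logging.INFO instead for a quieter middle ground.
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    # The server script additionally pins the chatty faster_whisper
    # library logger at WARNING:
    logging.getLogger("faster_whisper").setLevel(logging.WARNING)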
whisper_online.py
--- whisper_online.py
+++ whisper_online.py
@@ -4,6 +4,7 @@
 import librosa  
 from functools import lru_cache
 import time
+import logging
 
 
 
@@ -57,7 +58,7 @@
         from whisper_timestamped import transcribe_timestamped
         self.transcribe_timestamped = transcribe_timestamped
         if model_dir is not None:
-            print("ignoring model_dir, not implemented",file=self.logfile)
+            logging.debug("ignoring model_dir, not implemented")
         return whisper.load_model(modelsize, download_root=cache_dir)
 
     def transcribe(self, audio, init_prompt=""):
@@ -97,7 +98,7 @@
     def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
         from faster_whisper import WhisperModel
         if model_dir is not None:
-            print(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.",file=self.logfile)
+            logging.debug(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.")
             model_size_or_path = model_dir
         elif modelsize is not None:
             model_size_or_path = modelsize
@@ -173,9 +174,11 @@
                         c = " ".join([self.commited_in_buffer[-j][2] for j in range(1,i+1)][::-1])
                         tail = " ".join(self.new[j-1][2] for j in range(1,i+1))
                         if c == tail:
-                            print("removing last",i,"words:",file=self.logfile)
+                            words = []
                             for j in range(i):
-                                print("\t",self.new.pop(0),file=self.logfile)
+                                words.append(repr(self.new.pop(0)))
+                            words_msg = "\t".join(words)
+                            logging.debug(f"removing last {i} words: {words_msg}")
                             break
 
     def flush(self):
@@ -267,9 +270,9 @@
         """
 
         prompt, non_prompt = self.prompt()
-        print("PROMPT:", prompt, file=self.logfile)
-        print("CONTEXT:", non_prompt, file=self.logfile)
-        print(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}",file=self.logfile)
+        logging.debug(f"PROMPT: {prompt}")
+        logging.debug(f"CONTEXT: {non_prompt}")
+        logging.debug(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}")
         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
 
         # transform to [(beg,end,"word1"), ...]
@@ -278,8 +281,10 @@
         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
         o = self.transcript_buffer.flush()
         self.commited.extend(o)
-        print(">>>>COMPLETE NOW:",self.to_flush(o),file=self.logfile,flush=True)
-        print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
+        completed = self.to_flush(o)
+        logging.debug(f">>>>COMPLETE NOW: {completed}")
+        the_rest = self.to_flush(self.transcript_buffer.complete())
+        logging.debug(f"INCOMPLETE: {the_rest}")
 
         # there is a newly confirmed text
 
@@ -303,18 +308,18 @@
             #while k>0 and self.commited[k][1] > l:
             #    k -= 1
             #t = self.commited[k][1] 
-            print(f"chunking segment",file=self.logfile)
+            logging.debug(f"chunking segment")
             #self.chunk_at(t)
 
-        print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.logfile)
+        logging.debug(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}")
         return self.to_flush(o)
 
     def chunk_completed_sentence(self):
         if self.commited == []: return
-        print(self.commited,file=self.logfile)
+        logging.debug(self.commited)
         sents = self.words_to_sentences(self.commited)
         for s in sents:
-            print("\t\tSENT:",s,file=self.logfile)
+            logging.debug(f"\t\tSENT: {s}")
         if len(sents) < 2:
             return
         while len(sents) > 2:
@@ -322,7 +327,7 @@
         # we will continue with audio processing at this timestamp
         chunk_at = sents[-2][1]
 
-        print(f"--- sentence chunked at {chunk_at:2.2f}",file=self.logfile)
+        logging.debug(f"--- sentence chunked at {chunk_at:2.2f}")
         self.chunk_at(chunk_at)
 
     def chunk_completed_segment(self, res):
@@ -339,12 +344,12 @@
                 ends.pop(-1)
                 e = ends[-2]+self.buffer_time_offset
             if e <= t:
-                print(f"--- segment chunked at {e:2.2f}",file=self.logfile)
+                logging.debug(f"--- segment chunked at {e:2.2f}")
                 self.chunk_at(e)
             else:
-                print(f"--- last segment not within commited area",file=self.logfile)
+                logging.debug(f"--- last segment not within commited area")
         else:
-            print(f"--- not enough segments to chunk",file=self.logfile)
+            logging.debug(f"--- not enough segments to chunk")
 
 
 
@@ -391,7 +396,7 @@
         """
         o = self.transcript_buffer.complete()
         f = self.to_flush(o)
-        print("last, noncommited:",f,file=self.logfile)
+        logging.debug("last, noncommited: {f}")
         return f
 
 
@@ -431,7 +436,7 @@
 
     # the following languages are in Whisper, but not in wtpsplit:
     if lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split():
-        print(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.", file=sys.stderr)
+        logging.debug(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.")
         lan = None
 
     from wtpsplit import WtP
@@ -476,20 +481,20 @@
     logfile = sys.stderr
 
     if args.offline and args.comp_unaware:
-        print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile)
+        logging.error("No or one option from --offline and --comp_unaware are available, not both. Exiting.")
         sys.exit(1)
 
     audio_path = args.audio_path
 
     SAMPLING_RATE = 16000
     duration = len(load_audio(audio_path))/SAMPLING_RATE
-    print("Audio duration is: %2.2f seconds" % duration, file=logfile)
+    logging.info("Audio duration is: %2.2f seconds" % duration)
 
     size = args.model
     language = args.lan
 
     t = time.time()
-    print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True)
+    logging.info(f"Loading Whisper {size} model for {language}...")
 
     if args.backend == "faster-whisper":
         asr_cls = FasterWhisperASR
@@ -506,10 +511,10 @@
 
 
     e = time.time()
-    print(f"done. It took {round(e-t,2)} seconds.",file=logfile)
+    logging.info(f"done. It took {round(e-t,2)} seconds.")
 
     if args.vad:
-        print("setting VAD filter",file=logfile)
+        logging.info("setting VAD filter")
         asr.use_vad()
 
     
@@ -543,16 +548,15 @@
             print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
             print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
         else:
-            print(o,file=logfile,flush=True)
+            print("here?", o,file=logfile,flush=True)
 
     if args.offline: ## offline mode processing (for testing/debugging)
         a = load_audio(audio_path)
         online.insert_audio_chunk(a)
         try:
             o = online.process_iter()
-        except AssertionError:
-            print("assertion error",file=logfile)
-            pass
+        except AssertionError as e:
+            log.error(f"assertion error: {repr(e)}")
         else:
             output_transcript(o)
         now = None
@@ -563,13 +567,13 @@
             online.insert_audio_chunk(a)
             try:
                 o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
+            except AssertionError as e:
+                logging.error(f"assertion error: {repr(e)}")
                 pass
             else:
                 output_transcript(o, now=end)
 
-            print(f"## last processed {end:.2f}s",file=logfile,flush=True)
+            logging.debug(f"## last processed {end:.2f}s")
 
             if end >= duration:
                 break
@@ -595,13 +599,13 @@
 
             try:
                 o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
+            except AssertionError as e:
+                logging.error(f"assertion error: {e}")
                 pass
             else:
                 output_transcript(o)
             now = time.time() - start
-            print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True)
+            logging.debug(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}")
 
             if end >= duration:
                 break
whisper_online_server.py
--- whisper_online_server.py
+++ whisper_online_server.py
@@ -39,6 +39,7 @@
 if args.backend == "faster-whisper":
     from faster_whisper import WhisperModel
     asr_cls = FasterWhisperASR
+    logging.getLogger("faster_whisper").setLevel(logging.WARNING)
 else:
     import whisper
     import whisper_timestamped
@@ -80,7 +81,7 @@
     # warm up the ASR, because the very first transcribe takes much more time than the other
     asr.transcribe(a)
 else:
-    logging.info("Whisper is not warmed up")
+    logging.debug("Whisper is not warmed up")
 
 
 ######### Server objects
@@ -135,8 +136,6 @@
         out = []
         while sum(len(x) for x in out) < self.min_chunk*SAMPLING_RATE:
             raw_bytes = self.connection.non_blocking_receive_audio()
-            print(raw_bytes[:10])
-            print(len(raw_bytes))
             if not raw_bytes:
                 break
             sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
@@ -167,7 +166,7 @@
             print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr)
             return "%1.0f %1.0f %s" % (beg,end,o[2])
         else:
-            print(o,file=sys.stderr,flush=True)
+            # No text, so no output
             return None
 
     def send_result(self, o):
@@ -181,14 +180,13 @@
         while True:
             a = self.receive_audio_chunk()
             if a is None:
-                print("break here",file=sys.stderr)
                 break
             self.online_asr_proc.insert_audio_chunk(a)
             o = online.process_iter()
             try:
                 self.send_result(o)
             except BrokenPipeError:
-                print("broken pipe -- connection closed?",file=sys.stderr)
+                logging.info("broken pipe -- connection closed?")
                 break
 
 #        o = online.finish()  # this should be working