Alex Young 2024-04-18
Construct an explicit logger rather than using the root logger
@ebdde208f3efb1232f7c5ab2421944d5c3e4ec54
whisper_online.py
--- whisper_online.py
+++ whisper_online.py
@@ -11,6 +11,8 @@
 import soundfile as sf
 import math
 
+logger = logging.getLogger(__name__)
+
 @lru_cache
 def load_audio(fname):
     a, _ = librosa.load(fname, sr=16000, dtype=np.float32)
@@ -65,7 +67,7 @@
         from whisper_timestamped import transcribe_timestamped
         self.transcribe_timestamped = transcribe_timestamped
         if model_dir is not None:
-            logging.debug("ignoring model_dir, not implemented")
+            logger.debug("ignoring model_dir, not implemented")
         return whisper.load_model(modelsize, download_root=cache_dir)
 
     def transcribe(self, audio, init_prompt=""):
@@ -106,7 +108,7 @@
         from faster_whisper import WhisperModel
         logging.getLogger("faster_whisper").setLevel(logging.WARNING)
         if model_dir is not None:
-            logging.debug(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.")
+            logger.debug(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.")
             model_size_or_path = model_dir
         elif modelsize is not None:
             model_size_or_path = modelsize
@@ -229,7 +231,7 @@
 
         # Process transcription/translation
         transcript = proc.create(**params)
-        logging.debug(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds")
+        logger.debug(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds")
 
         return transcript
 
@@ -276,7 +278,7 @@
                             for j in range(i):
                                 words.append(repr(self.new.pop(0)))
                             words_msg = "\t".join(words)
-                            logging.debug(f"removing last {i} words: {words_msg}")
+                            logger.debug(f"removing last {i} words: {words_msg}")
                             break
 
     def flush(self):
@@ -365,9 +367,9 @@
         """
 
         prompt, non_prompt = self.prompt()
-        logging.debug(f"PROMPT: {prompt}")
-        logging.debug(f"CONTEXT: {non_prompt}")
-        logging.debug(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}")
+        logger.debug(f"PROMPT: {prompt}")
+        logger.debug(f"CONTEXT: {non_prompt}")
+        logger.debug(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}")
         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
 
         # transform to [(beg,end,"word1"), ...]
@@ -377,9 +379,9 @@
         o = self.transcript_buffer.flush()
         self.commited.extend(o)
         completed = self.to_flush(o)
-        logging.debug(f">>>>COMPLETE NOW: {completed}")
+        logger.debug(f">>>>COMPLETE NOW: {completed}")
         the_rest = self.to_flush(self.transcript_buffer.complete())
-        logging.debug(f"INCOMPLETE: {the_rest}")
+        logger.debug(f"INCOMPLETE: {the_rest}")
 
         # there is a newly confirmed text
 
@@ -403,18 +405,18 @@
             #while k>0 and self.commited[k][1] > l:
             #    k -= 1
             #t = self.commited[k][1] 
-            logging.debug(f"chunking segment")
+            logger.debug("chunking segment")
             #self.chunk_at(t)
 
-        logging.debug(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}")
+        logger.debug(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}")
         return self.to_flush(o)
 
     def chunk_completed_sentence(self):
         if self.commited == []: return
-        logging.debug(self.commited)
+        logger.debug(self.commited)
         sents = self.words_to_sentences(self.commited)
         for s in sents:
-            logging.debug(f"\t\tSENT: {s}")
+            logger.debug(f"\t\tSENT: {s}")
         if len(sents) < 2:
             return
         while len(sents) > 2:
@@ -422,7 +424,7 @@
         # we will continue with audio processing at this timestamp
         chunk_at = sents[-2][1]
 
-        logging.debug(f"--- sentence chunked at {chunk_at:2.2f}")
+        logger.debug(f"--- sentence chunked at {chunk_at:2.2f}")
         self.chunk_at(chunk_at)
 
     def chunk_completed_segment(self, res):
@@ -439,12 +441,12 @@
                 ends.pop(-1)
                 e = ends[-2]+self.buffer_time_offset
             if e <= t:
-                logging.debug(f"--- segment chunked at {e:2.2f}")
+                logger.debug(f"--- segment chunked at {e:2.2f}")
                 self.chunk_at(e)
             else:
-                logging.debug(f"--- last segment not within commited area")
+                logger.debug("--- last segment not within commited area")
         else:
-            logging.debug(f"--- not enough segments to chunk")
+            logger.debug("--- not enough segments to chunk")
 
 
 
@@ -490,7 +492,7 @@
         """
         o = self.transcript_buffer.complete()
         f = self.to_flush(o)
-        logging.debug("last, noncommited: {f}")
+        logger.debug(f"last, noncommited: {f}")
         return f
 
 
@@ -530,7 +532,7 @@
 
     # the following languages are in Whisper, but not in wtpsplit:
     if lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split():
-        logging.debug(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.")
+        logger.debug(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.")
         lan = None
 
     from wtpsplit import WtP
@@ -563,7 +565,7 @@
     """
     backend = args.backend
     if backend == "openai-api":
-        logging.debug("Using OpenAI API.")
+        logger.debug("Using OpenAI API.")
         asr = OpenaiApiASR(lan=args.lan)
     else:
         if backend == "faster-whisper":
@@ -574,14 +576,14 @@
         # Only for FasterWhisperASR and WhisperTimestampedASR
         size = args.model
         t = time.time()
-        logging.debug(f"Loading Whisper {size} model for {args.lan}...")
+        logger.debug(f"Loading Whisper {size} model for {args.lan}...")
         asr = asr_cls(modelsize=size, lan=args.lan, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
         e = time.time()
-        logging.debug(f"done. It took {round(e-t,2)} seconds.")
+        logger.debug(f"done. It took {round(e-t,2)} seconds.")
 
     # Apply common configurations
     if getattr(args, 'vad', False):  # Checks if VAD argument is present and True
-        logging.info("Setting VAD filter")
+        logger.info("Setting VAD filter")
         asr.use_vad()
 
     language = args.lan
@@ -619,14 +621,14 @@
     logfile = sys.stderr
 
     if args.offline and args.comp_unaware:
-        logging.error("No or one option from --offline and --comp_unaware are available, not both. Exiting.")
+        logger.error("No or one option from --offline and --comp_unaware are available, not both. Exiting.")
         sys.exit(1)
 
     audio_path = args.audio_path
 
     SAMPLING_RATE = 16000
     duration = len(load_audio(audio_path))/SAMPLING_RATE
-    logging.info("Audio duration is: %2.2f seconds" % duration)
+    logger.info("Audio duration is: %2.2f seconds" % duration)
 
     asr, online = asr_factory(args, logfile=logfile)
     min_chunk = args.min_chunk_size
@@ -674,12 +676,12 @@
             try:
                 o = online.process_iter()
             except AssertionError as e:
-                logging.error(f"assertion error: {repr(e)}")
+                logger.error(f"assertion error: {repr(e)}")
                 pass
             else:
                 output_transcript(o, now=end)
 
-            logging.debug(f"## last processed {end:.2f}s")
+            logger.debug(f"## last processed {end:.2f}s")
 
             if end >= duration:
                 break
@@ -706,12 +708,12 @@
             try:
                 o = online.process_iter()
             except AssertionError as e:
-                logging.error(f"assertion error: {e}")
+                logger.error(f"assertion error: {e}")
                 pass
             else:
                 output_transcript(o)
             now = time.time() - start
-            logging.debug(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}")
+            logger.debug(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}")
 
             if end >= duration:
                 break
whisper_online_server.py
--- whisper_online_server.py
+++ whisper_online_server.py
@@ -7,6 +7,7 @@
 import logging
 import numpy as np
 
+logger = logging.getLogger(__name__)
 parser = argparse.ArgumentParser()
 
 # server options
@@ -37,13 +39,6 @@
 language = args.lan
 asr, online = asr_factory(args)
 min_chunk = args.min_chunk_size
-
-
-if args.buffer_trimming == "sentence":
-    tokenizer = create_tokenizer(tgt_language)
-else:
-    tokenizer = None
-online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 # warm up the ASR because the very first transcribe takes more time than the others. 
 # Test results in https://github.com/ufal/whisper_streaming/pull/81
@@ -161,7 +156,7 @@
             try:
                 self.send_result(o)
             except BrokenPipeError:
-                logging.info("broken pipe -- connection closed?")
+                logger.info("broken pipe -- connection closed?")
                 break
 
 #        o = online.finish()  # this should be working
@@ -175,13 +170,13 @@
     s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     s.bind((args.host, args.port))
     s.listen(1)
-    logging.info('Listening on'+str((args.host, args.port)))
+    logger.info('Listening on'+str((args.host, args.port)))
     while True:
         conn, addr = s.accept()
-        logging.info('Connected to client on {}'.format(addr))
+        logger.info('Connected to client on {}'.format(addr))
         connection = Connection(conn)
         proc = ServerProcessor(connection, online, min_chunk)
         proc.process()
         conn.close()
-        logging.info('Connection to client closed')
-logging.info('Connection closed, terminating.')
+        logger.info('Connection to client closed')
+logger.info('Connection closed, terminating.')
Add a comment
List