Dominik Macháček 2024-08-19
removing duplicated code -- whisper_online_vac
commit 0dd7ff9e880cc15763df36430448365b177cb3d3
voice_activity_controller.py
--- voice_activity_controller.py
+++ voice_activity_controller.py
@@ -48,6 +48,7 @@
             silence_in_wav)
 
         """
+        print("applying vad here")
         x = audio
         if not torch.is_tensor(x):
             try:
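
For orientation, this is the contract that detect_speech_iter provides to the rest of the codebase (and that VACOnlineASRProcessor below relies on). A minimal sketch, not part of this commit, assuming 16 kHz mono float32 input and the constructor/method names visible in this diff; the 500 ms silence threshold is the one cited in the VACOnlineASRProcessor docstring:

```python
# Illustrative use of the VoiceActivityController contract (not part of this commit).
# Assumes 16 kHz mono float32 audio.
import numpy as np
from voice_activity_controller import VoiceActivityController

vac = VoiceActivityController(use_vad_result=False)
vac.reset_states()

chunk = np.zeros(640, dtype=np.float32)  # 0.04 s of silence at 16 kHz
speech_audio, is_final = vac.detect_speech_iter(chunk, audio_in_int16=False)
# is_final flips to True once roughly 500 ms of consecutive non-speech has
# been observed, signalling the end of an utterance.
```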
whisper_online.py
--- whisper_online.py
+++ whisper_online.py
@@ -517,6 +517,59 @@
             e = offset + sents[-1][1]
         return (b,e,t)
 
+class VACOnlineASRProcessor(OnlineASRProcessor):
+    '''Wraps OnlineASRProcessor with VAC (Voice Activity Controller).
+
+    It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds),
+    runs VAD on them, and continuously detects whether there is speech or not.
+    When it detects the end of speech (non-voice for 500 ms), it makes OnlineASRProcessor finish the utterance immediately.
+    '''
+
+    def __init__(self, online_chunk_size, *a, **kw):
+        self.online_chunk_size = online_chunk_size
+
+        self.online = OnlineASRProcessor(*a, **kw)
+        from voice_activity_controller import VoiceActivityController
+        self.vac = VoiceActivityController(use_vad_result = False)
+
+        self.logfile = self.online.logfile
+
+        self.init()
+
+    def init(self):
+        self.online.init()
+        self.vac.reset_states()
+        self.current_online_chunk_buffer_size = 0
+        self.is_currently_final = False
+
+
+    def insert_audio_chunk(self, audio):
+        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
+        audio, is_final = r
+        print(is_final, file=self.logfile)
+        self.is_currently_final = is_final
+        self.online.insert_audio_chunk(audio)
+        self.current_online_chunk_buffer_size += len(audio)
+
+    def process_iter(self):
+        if self.is_currently_final:
+            return self.finish()
+        elif self.current_online_chunk_buffer_size > self.SAMPLING_RATE*self.online_chunk_size:
+            self.current_online_chunk_buffer_size = 0
+            ret = self.online.process_iter()
+            return ret
+        else:
+            print("no online update, only VAD", file=self.logfile)
+            return (None, None, "")
+
+    def finish(self):
+        ret = self.online.finish()
+        self.online.init(keep_offset=True)
+        self.current_online_chunk_buffer_size = 0
+        return ret
+
+
+
 WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(",")
 
 def create_tokenizer(lan):
@@ -561,6 +614,8 @@
     parser.add_argument('--lan', '--language', type=str, default='auto', help="Source language code, e.g. en,de,cs, or 'auto' for language detection.")
     parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.")
     parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "openai-api"],help='Load only this backend for Whisper processing.')
+    parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
+    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC chunk size in seconds.')
     parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
     parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.')
     parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.')
@@ -607,7 +662,11 @@
         tokenizer = None
 
     # Create the OnlineASRProcessor
-    online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    if args.vac:
+        online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    else:
+        online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
     return asr, online
 
@@ -652,7 +711,10 @@
     logger.info("Audio duration is: %2.2f seconds" % duration)
 
     asr, online = asr_factory(args, logfile=logfile)
-    min_chunk = args.min_chunk_size
+    if args.vac:
+        min_chunk = args.vac_chunk_size
+    else:
+        min_chunk = args.min_chunk_size
 
     # load the audio into the LRU cache before we start the timer
     a = load_audio_chunk(audio_path,0,1)
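
To make the control flow above concrete, here is a minimal driver for the new class. It is a sketch, not part of the commit: the model size, file name, and the 1.0 s online chunk are placeholder values, and the call signatures are the ones used elsewhere in this diff.

```python
# Minimal simulation driver for VACOnlineASRProcessor (illustrative only).
from whisper_online import FasterWhisperASR, VACOnlineASRProcessor, load_audio

SAMPLING_RATE = 16000
asr = FasterWhisperASR(modelsize="large-v2", lan="en")  # placeholder model/language
online = VACOnlineASRProcessor(1.0, asr, None)          # 1.0 s online chunks, no sentence tokenizer

audio = load_audio("sample.wav")                        # 16 kHz mono float32
vac_chunk = int(0.04 * SAMPLING_RATE)                   # matches the --vac-chunk-size default
for i in range(0, len(audio), vac_chunk):
    online.insert_audio_chunk(audio[i:i + vac_chunk])
    beg, end, text = online.process_iter()              # (None, None, "") while only VAD ran
    if text:
        print("%1.0f %1.0f %s" % (beg * 1000, end * 1000, text))
print(online.finish())
```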
whisper_online_server.py
--- whisper_online_server.py
+++ whisper_online_server.py
@@ -13,8 +13,6 @@
 # server options
 parser.add_argument("--host", type=str, default='localhost')
 parser.add_argument("--port", type=int, default=43007)
-parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
-parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
 parser.add_argument("--warmup-file", type=str, dest="warmup_file", 
         help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
 
@@ -108,7 +106,7 @@
             raw_bytes = self.connection.non_blocking_receive_audio()
             if not raw_bytes:
                 break
-            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
+#            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
             sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
             audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
             out.append(audio)
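
The two options removed here are exactly the ones added to the shared argument set in whisper_online.py above, so the server keeps them as long as it calls add_shared_args, as it did before this commit. A sketch of how an entry point picks them up; the parsed values are placeholders, and asr_factory's logfile parameter is assumed to have a default:

```python
# Sketch: any entry point that calls add_shared_args now gets the VAC options
# (illustrative; not part of this commit).
import argparse
from whisper_online import add_shared_args, asr_factory

parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")  # server-specific option
add_shared_args(parser)  # contributes --vac, --vac-chunk-size, --vad, --model, ...
args = parser.parse_args(["--vac", "--vac-chunk-size", "0.04"])

asr, online = asr_factory(args)  # returns a VACOnlineASRProcessor when --vac is set
```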
 
whisper_online_vac.py (deleted)
--- whisper_online_vac.py
@@ -1,203 +0,0 @@
-from whisper_online import *
-from voice_activity_controller import *
-import soundfile
-import io
-
-SAMPLING_RATE = 16000
-
-class VACOnlineASRProcessor(OnlineASRProcessor):
-
-    def __init__(self, online_chunk_size, *a, **kw):
-        self.online_chunk_size = online_chunk_size
-
-        self.online = OnlineASRProcessor(*a, **kw)
-        self.vac = VoiceActivityController(use_vad_result = False)
-
-        self.logfile = self.online.logfile
-
-        self.init()
-
-    def init(self):
-        self.online.init()
-        self.vac.reset_states()
-        self.current_online_chunk_buffer_size = 0
-        self.is_currently_final = False
-
-
-    def insert_audio_chunk(self, audio):
-        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
-        audio, is_final = r
-        print(is_final)
-        self.is_currently_final = is_final
-        self.online.insert_audio_chunk(audio)
-        self.current_online_chunk_buffer_size += len(audio)
-
-    def process_iter(self):
-        if self.is_currently_final:
-            return self.finish()
-        elif self.current_online_chunk_buffer_size > SAMPLING_RATE*self.online_chunk_size:
-            self.current_online_chunk_buffer_size = 0
-            ret = self.online.process_iter()
-            return ret
-        else:
-            print("no online update, only VAD", file=self.logfile)
-            return (None, None, "")
-
-    def finish(self):
-        ret = self.online.finish()
-        self.online.init(keep_offset=True)
-        self.current_online_chunk_buffer_size = 0
-        return ret
-
-
-
-
-if __name__ == "__main__":
-
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('audio_path', type=str, help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
-    add_shared_args(parser)
-    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
-    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
-    parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
-    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.') 
-    args = parser.parse_args()
-
-    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
-    logfile = sys.stderr
-
-    if args.offline and args.comp_unaware:
-        print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile)
-        sys.exit(1)
-
-    audio_path = args.audio_path
-
-    SAMPLING_RATE = 16000
-    duration = len(load_audio(audio_path))/SAMPLING_RATE
-    print("Audio duration is: %2.2f seconds" % duration, file=logfile)
-
-    size = args.model
-    language = args.lan
-
-    t = time.time()
-    print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True)
-
-    if args.backend == "faster-whisper":
-        asr_cls = FasterWhisperASR
-    else:
-        asr_cls = WhisperTimestampedASR
-
-    asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
-
-    if args.task == "translate":
-        asr.set_translate_task()
-        tgt_language = "en"  # Whisper translates into English
-    else:
-        tgt_language = language  # Whisper transcribes in this language
-
-
-    e = time.time()
-    print(f"done. It took {round(e-t,2)} seconds.",file=logfile)
-
-    if args.vad:
-        print("setting VAD filter",file=logfile)
-        asr.use_vad()
-
-    
-    min_chunk = args.vac_chunk_size
-    if args.buffer_trimming == "sentence":
-        tokenizer = create_tokenizer(tgt_language)
-    else:
-        tokenizer = None
-    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
-
-    # load the audio into the LRU cache before we start the timer
-    a = load_audio_chunk(audio_path,0,1)
-
-    # warm up the ASR, because the very first transcribe takes much more time than the other
-    asr.transcribe(a)
-
-    beg = args.start_at
-    start = time.time()-beg
-
-    def output_transcript(o, now=None):
-        # output format in stdout is like:
-        # 4186.3606 0 1720 Takhle to je
-        # - the first three words are:
-        #    - emission time from beginning of processing, in milliseconds
-        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
-        # - the next words: segment transcript
-        if now is None:
-            now = time.time()-start
-        if o[0] is not None:
-            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
-            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
-        else:
-            print(o,file=logfile,flush=True)
-
-    if args.offline: ## offline mode processing (for testing/debugging)
-        a = load_audio(audio_path)
-        online.insert_audio_chunk(a)
-        try:
-            o = online.process_iter()
-        except AssertionError:
-            print("assertion error",file=logfile)
-            pass
-        else:
-            output_transcript(o)
-        now = None
-    elif args.comp_unaware:  # computational unaware mode 
-        end = beg + min_chunk
-        while True:
-            a = load_audio_chunk(audio_path,beg,end)
-            online.insert_audio_chunk(a)
-            try:
-                o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
-                pass
-            else:
-                output_transcript(o, now=end)
-
-            print(f"## last processed {end:.2f}s",file=logfile,flush=True)
-
-            if end >= duration:
-                break
-
-            beg = end
-
-            if end + min_chunk > duration:
-                end = duration
-            else:
-                end += min_chunk
-        now = duration
-
-    else: # online = simultaneous mode
-        end = 0
-        while True:
-            now = time.time() - start
-            if now < end+min_chunk:
-                time.sleep(min_chunk+end-now)
-            end = time.time() - start
-            a = load_audio_chunk(audio_path,beg,end)
-            beg = end
-            online.insert_audio_chunk(a)
-
-            try:
-                o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
-                pass
-            else:
-                output_transcript(o)
-            now = time.time() - start
-            print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True)
-
-            if end >= duration:
-                break
-        now = None
-
-    o = online.finish()
-    output_transcript(o, now=now)