

VAC controller integrated
It works. Reproducing #39
@dfc862b3bc2341b18b3ce7c78cb2d18e59744e29
--- voice_activity_controller.py
+++ voice_activity_controller.py
@@ -51,8 +51,12 @@
         self.temp_end = 0
         self.current_sample = 0
 
+        self.last_silence_len = 0
+        self.speech_len = 0
+
     def apply_vad(self, audio):
-        x = int2float(audio)
+#        x = int2float(audio)
+        x = audio
         if not torch.is_tensor(x):
             try:
                 x = torch.Tensor(x)
@@ -79,38 +83,42 @@
             return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples
 
 
+    def detect_speech_iter(self, data, audio_in_int16 = False):
+#        audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
+        audio_block = data
+        wav = audio_block
+
+        print(wav, len(wav), type(wav), wav.dtype)
+
+        is_final = False
+        voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
+
+
+        if speech_in_wav > 0:
+            self.last_silence_len = 0
+            self.speech_len += speech_in_wav
+#            if self.activity_detected_callback is not None:
+#                self.activity_detected_callback()
+
+        self.last_silence_len += last_silent_in_wav
+        if self.last_silence_len >= self.final_silence_limit and self.speech_len >= self.final_speech_limit:
+
+            is_final = True
+            self.last_silence_len = 0
+            self.speech_len = 0
+
+#        return voice_audio.tobytes(), is_final
+        return voice_audio, is_final
 
 
 
     def detect_user_speech(self, audio_stream, audio_in_int16 = False):
-        last_silence_len = 0
-        speech_len = 0
+        self.last_silence_len = 0
+        self.speech_len = 0
 
         for data in audio_stream:  # replace with your condition of choice
-
-
-            audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
-            wav = audio_block
-
-            is_final = False
-            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
-
-
-            if speech_in_wav > 0:
-                last_silence_len = 0
-                speech_len += speech_in_wav
-                if self.activity_detected_callback is not None:
-                    self.activity_detected_callback()
-
-            last_silence_len += last_silent_in_wav
-            if last_silence_len >= self.final_silence_limit and speech_len >= self.final_speech_limit:
-
-                is_final = True
-                last_silence_len = 0
-                speech_len = 0
-
-            yield voice_audio.tobytes(), is_final
-
+            yield self.detect_speech_iter(data, audio_in_int16)
+
 
 
 
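Note on the new API: `detect_speech_iter` processes a single audio chunk per call and returns `(voice_audio, is_final)`; `detect_user_speech` is now just a thin wrapper that yields it over a stream. A minimal driving loop might look like the sketch below (the `audio_source()` generator and `handle_utterance()` consumer are hypothetical; `VoiceActivityController`, `reset_states`, `use_vad_result`, and `detect_speech_iter` come from this diff):

```python
from voice_activity_controller import VoiceActivityController

vac = VoiceActivityController(use_vad_result=True)
vac.reset_states()

for chunk in audio_source():  # hypothetical: yields float32 audio chunks at 16 kHz
    # Run VAD on this chunk; is_final becomes True once enough trailing silence
    # (final_silence_limit) has followed enough speech (final_speech_limit).
    voice_audio, is_final = vac.detect_speech_iter(chunk, audio_in_int16=False)
    if is_final:
        handle_utterance(voice_audio)  # hypothetical downstream consumer
```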
--- /dev/null
+++ whisper_online_vac.py
@@ -0,0 +1,209 @@
+from whisper_online import *
+from voice_activity_controller import *
+import soundfile
+import io
+
+SAMPLING_RATE = 16000
+
+class VACOnlineASRProcessor(OnlineASRProcessor):
+
+    def __init__(self, *a, **kw):
+        self.online = OnlineASRProcessor(*a, **kw)
+        self.vac = VoiceActivityController(use_vad_result = True)
+
+        self.is_currently_final = False
+        self.logfile = self.online.logfile
+
+        #self.vac_buffer = io.BytesIO()
+        #self.vac_stream = self.vac.detect_user_speech(self.vac_buffer, audio_in_int16=False)
+
+        self.audio_log = open("audio_log.wav","wb")
+
+    def init(self):
+        self.online.init()
+        self.vac.reset_states()
+
+    def insert_audio_chunk(self, audio):
+        print(audio, len(audio), type(audio), audio.dtype)
+        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
+        raw_bytes, is_final = r
+        print("is_final",is_final)
+        print("raw_bytes", raw_bytes[:10], len(raw_bytes), type(raw_bytes))
+#        self.audio_log.write(raw_bytes)
+        #sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
+        #audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
+        audio = raw_bytes
36 | + print("po překonvertování", audio, len(audio), type(audio), audio.dtype) | |
+        self.is_currently_final = is_final
+        self.online.insert_audio_chunk(audio)
+#        self.audio_log.write(audio)
+        self.audio_log.flush()
+
+        print("inserted",file=self.logfile)
+
+    def process_iter(self):
+        if self.is_currently_final:
+            return self.finish()
+        else:
+            print(self.online.audio_buffer)
+            ret = self.online.process_iter()
50 | + print("tady",file=self.logfile) | |
+            return ret
+
+    def finish(self):
+        ret = self.online.finish()
+        self.online.init()
+        return ret
+
+
+
+
+if __name__ == "__main__":
+
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('audio_path', type=str, help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
+    add_shared_args(parser)
+    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
+    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
+    parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
+
+    args = parser.parse_args()
+
+    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
+    logfile = sys.stderr
+
+    if args.offline and args.comp_unaware:
+        print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile)
+        sys.exit(1)
+
+    audio_path = args.audio_path
+
+    SAMPLING_RATE = 16000
+    duration = len(load_audio(audio_path))/SAMPLING_RATE
+    print("Audio duration is: %2.2f seconds" % duration, file=logfile)
+
+    size = args.model
+    language = args.lan
+
+    t = time.time()
+    print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True)
+
+    if args.backend == "faster-whisper":
+        asr_cls = FasterWhisperASR
+    else:
+        asr_cls = WhisperTimestampedASR
+
+    asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
+
+    if args.task == "translate":
+        asr.set_translate_task()
+        tgt_language = "en"  # Whisper translates into English
+    else:
+        tgt_language = language  # Whisper transcribes in this language
+
+
+    e = time.time()
+    print(f"done. It took {round(e-t,2)} seconds.",file=logfile)
+
+    if args.vad:
+        print("setting VAD filter",file=logfile)
+        asr.use_vad()
+
+
+    min_chunk = args.min_chunk_size
+    if args.buffer_trimming == "sentence":
+        tokenizer = create_tokenizer(tgt_language)
+    else:
+        tokenizer = None
+    online = VACOnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+
+
+    # load the audio into the LRU cache before we start the timer
+    a = load_audio_chunk(audio_path,0,1)
+
+    # warm up the ASR, because the very first transcribe takes much more time than the other
+    asr.transcribe(a)
+
+    beg = args.start_at
+    start = time.time()-beg
+
+    def output_transcript(o, now=None):
+        # output format in stdout is like:
+        # 4186.3606 0 1720 Takhle to je
+        # - the first three words are:
+        #    - emission time from beginning of processing, in milliseconds
+        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
+        # - the next words: segment transcript
+        if now is None:
+            now = time.time()-start
+        if o[0] is not None:
+            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
+            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
+        else:
+            print(o,file=logfile,flush=True)
+
+    if args.offline: ## offline mode processing (for testing/debugging)
+        a = load_audio(audio_path)
+        online.insert_audio_chunk(a)
+        try:
+            o = online.process_iter()
+        except AssertionError:
+            print("assertion error",file=logfile)
+            pass
+        else:
+            output_transcript(o)
+        now = None
+    elif args.comp_unaware:  # computational unaware mode
+        end = beg + min_chunk
+        while True:
+            a = load_audio_chunk(audio_path,beg,end)
+            online.insert_audio_chunk(a)
+            try:
+                o = online.process_iter()
+            except AssertionError:
+                print("assertion error",file=logfile)
+                pass
+            else:
+                output_transcript(o, now=end)
+
+            print(f"## last processed {end:.2f}s",file=logfile,flush=True)
+
+            if end >= duration:
+                break
+
+            beg = end
+
+            if end + min_chunk > duration:
+                end = duration
+            else:
+                end += min_chunk
+        now = duration
+
+    else: # online = simultaneous mode
+        end = 0
+        while True:
+            now = time.time() - start
+            if now < end+min_chunk:
+                time.sleep(min_chunk+end-now)
+            end = time.time() - start
+            a = load_audio_chunk(audio_path,beg,end)
+            beg = end
+            online.insert_audio_chunk(a)
+
+            try:
+                o = online.process_iter()
+            except AssertionError:
+                print("assertion error",file=logfile)
+                pass
+            else:
+                output_transcript(o)
+            now = time.time() - start
+            print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True)
+
+            if end >= duration:
+                break
+        now = None
+
+    o = online.finish()
+    output_transcript(o, now=now)
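Usage note: `VACOnlineASRProcessor` keeps the `OnlineASRProcessor` interface (`init`, `insert_audio_chunk`, `process_iter`, `finish`), so an existing streaming loop can be pointed at it unchanged. A minimal sketch, assuming `asr` and `tokenizer` were built as in the `__main__` block above and that `next_chunk()` is a hypothetical source of 16 kHz float32 chunks (the buffer-trimming values are illustrative):

```python
import sys

online = VACOnlineASRProcessor(asr, tokenizer, logfile=sys.stderr,
                               buffer_trimming=("segment", 15))  # illustrative settings

while True:
    chunk = next_chunk()  # hypothetical: returns None when the stream ends
    if chunk is None:
        break
    online.insert_audio_chunk(chunk)
    o = online.process_iter()  # calls finish() internally once VAC reports is_final
    if o[0] is not None:
        print(o)

print(online.finish())
```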