

Use the Silero VAD model directly instead of the Silero VADIterator
@3cc6220a7e39424e2d55fd34b0d86eede848ab51
--- mic_test_whisper_simple.py
+++ mic_test_whisper_simple.py
@@ -39,7 +39,6 @@
             if chunk is not None:
                 sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
                 audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-                # self.audio_buffer.append(chunk)
                 out = []
                 out.append(audio)
                 a = np.concatenate(out)
@@ -47,15 +46,16 @@

             if is_final and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-                # use custom ts_words
                 tsw = self.ts_words(res)
+
                 self.init_prompt = self.init_prompt + tsw
                 self.init_prompt = self.init_prompt [-100:]
                 self.audio_buffer.resize(0)
                 iter_in_phrase =0
+
                 yield True, tsw
-            # show progress evry 10 chunks
-            elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
+            # show progress evry 50 chunks
+            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                 # use custom ts_words
                 tsw = self.ts_words(res)
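Note on the final-transcription branch above: it follows a rolling-prompt pattern. When the VAD marks the phrase as final, the whole buffered audio is transcribed once more, the recognized text is folded into init_prompt, the prompt is trimmed to its last 100 characters, and the buffer is reset. A minimal sketch of that pattern, where transcribe_phrase_to_text is a hypothetical helper standing in for the asr.transcribe + ts_words pair used in the script:

import numpy as np

def finalize_phrase(audio_buffer: np.ndarray, init_prompt: str,
                    transcribe_phrase_to_text) -> tuple[str, str]:
    # transcribe_phrase_to_text is a hypothetical callable: it runs the ASR
    # model on the buffered phrase (conditioned on init_prompt) and returns
    # the recognized text as a plain string.
    text = transcribe_phrase_to_text(audio_buffer, init_prompt)
    # Fold the new text into the rolling prompt, keeping only the last 100
    # characters so later phrases are conditioned on recent context only.
    init_prompt = (init_prompt + text)[-100:]
    audio_buffer.resize(0)  # start the next phrase with an empty buffer
    return init_prompt, text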
--- mic_test_whisper_streaming.py
+++ mic_test_whisper_streaming.py
@@ -13,7 +13,7 @@
 src_lan = "en" # source language
 tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
-min_sample_length = 1 * SAMPLING_RATE
+min_sample_length = 1.5 * SAMPLING_RATE



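For reference, assuming SAMPLING_RATE = 16000 as elsewhere in the repo, the new minimum corresponds to 1.5 s of audio being buffered before the streaming ASR runs:

SAMPLING_RATE = 16000                    # assumed, matching the repo default
min_sample_length = 1.5 * SAMPLING_RATE  # 1.5 s of audio
print(int(min_sample_length))            # -> 24000 samples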
--- microphone_stream.py
+++ microphone_stream.py
@@ -29,7 +29,7 @@
         self._pyaudio = pyaudio.PyAudio()
         self.sample_rate = sample_rate

-        self._chunk_size = int(self.sample_rate * 0.1)
+        self._chunk_size = int(self.sample_rate * 40 / 1000)
         self._stream = self._pyaudio.open(
             format=pyaudio.paInt16,
             channels=1,
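The capture block shrinks from 100 ms to 40 ms of audio per read, so VAD decisions can react faster. A minimal sketch of the resulting PyAudio setup, assuming 16 kHz mono 16-bit capture as in the rest of the repo (variable names here are illustrative):

import pyaudio

SAMPLE_RATE = 16000                        # assumed, matching the repo
CHUNK_SIZE = int(SAMPLE_RATE * 40 / 1000)  # 40 ms -> 640 samples per read

pa = pyaudio.PyAudio()
stream = pa.open(
    format=pyaudio.paInt16,  # 16-bit PCM, as the VAD/ASR pipeline expects
    channels=1,
    rate=SAMPLE_RATE,
    input=True,
    frames_per_buffer=CHUNK_SIZE,
)

data = stream.read(CHUNK_SIZE)             # one 40 ms block of raw bytes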
--- voice_activity_controller.py
+++ voice_activity_controller.py
@@ -3,16 +3,27 @@
 # import sounddevice as sd
 import torch
 import numpy as np
+import datetime

+
+def int2float(sound):
+    abs_max = np.abs(sound).max()
+    sound = sound.astype('float32')
+    if abs_max > 0:
+        sound *= 1/32768
+    sound = sound.squeeze() # depends on the use case
+    return sound

 class VoiceActivityController:
     def __init__(
         self,
         sampling_rate = 16000,
-        second_ofSilence = 0.5,
-        second_ofSpeech = 0.25,
+        min_silence_to_final_ms = 500,
+        min_speech_to_final_ms = 100,
+        min_silence_duration_ms = 100,
         use_vad_result = True,
         activity_detected_callback=None,
+        threshold =0.3
     ):
         self.activity_detected_callback=activity_detected_callback
         self.model, self.utils = torch.hub.load(
@@ -26,84 +37,77 @@
             collect_chunks) = self.utils

         self.sampling_rate = sampling_rate
-        self.silence_limit = second_ofSilence * self.sampling_rate
-        self.speech_limit = second_ofSpeech *self.sampling_rate
+        self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000
+        self.final_speech_limit = min_speech_to_final_ms *self.sampling_rate / 1000
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000

         self.use_vad_result = use_vad_result
-        self.vad_iterator = VADIterator(
-            model =self.model,
-            threshold = 0.3, # 0.5
-            sampling_rate= self.sampling_rate,
-            min_silence_duration_ms = 500, #100
-            speech_pad_ms = 400 #30
-        )
         self.last_marked_chunk = None
-
-
-    def int2float(self, sound):
-        abs_max = np.abs(sound).max()
-        sound = sound.astype('float32')
-        if abs_max > 0:
-            sound *= 1/32768
-        sound = sound.squeeze() # depends on the use case
-        return sound
+        self.threshold = threshold
+        self.reset_states()
+
+    def reset_states(self):
+        self.model.reset_states()
+        self.temp_end = 0
+        self.current_sample = 0

     def apply_vad(self, audio):
-        audio_float32 = self.int2float(audio)
-        chunk = self.vad_iterator(audio_float32, return_seconds=False)
+        x = int2float(audio)
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except:
+                raise TypeError("Audio cannot be casted to tensor. Cast it manually")

-        if chunk is not None:
-            if "start" in chunk:
-                start = chunk["start"]
-                self.last_marked_chunk = chunk
-                return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0
-
-            if "end" in chunk:
-                #todo: pending get the padding from the next chunk
-                end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
-                self.last_marked_chunk = chunk
-                return audio[:end] if self.use_vad_result else audio, end ,len(audio) - end
+        speech_prob = self.model(x, self.sampling_rate).item()
+
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples

-        if self.last_marked_chunk is not None:
-            if "start" in self.last_marked_chunk:
-                return audio, len(audio) ,0

-            if "end" in self.last_marked_chunk:
-                return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 ,len(audio)
+        if (speech_prob >= self.threshold):
+            self.temp_end = 0
+            return audio, window_size_samples, 0

-        return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 , 0
+        else :
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return audio, 0, window_size_samples
+            else:
+                return np.array([], dtype=np.float16) , 0, window_size_samples
+
+



     def detect_user_speech(self, audio_stream, audio_in_int16 = False):
-        silence_len= 0
+        last_silence_len= 0
         speech_len = 0

         for data in audio_stream: # replace with your condition of choice
-            # if isinstance(data, EndOfTransmission):
-            #     raise EndOfTransmission("End of transmission detected")


             audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
             wav = audio_block

-
             is_final = False
-            voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav)
-            # print(f'----r> speech_in_wav: {speech_in_wav} last_silent_duration_in_wav: {last_silent_duration_in_wav}')
+            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
+

             if speech_in_wav > 0 :
-                silence_len= 0
+                last_silence_len= 0
                 speech_len += speech_in_wav
                 if self.activity_detected_callback is not None:
                     self.activity_detected_callback()

-            silence_len = silence_len + last_silent_duration_in_wav
-            if silence_len>= self.silence_limit and speech_len >= self.speech_limit:
+            last_silence_len += last_silent_in_wav
+            if last_silence_len>= self.final_silence_limit and speech_len >= self.final_speech_limit:
+
                 is_final = True
-                silence_len= 0
-                speech_len = 0
-
+                last_silence_len= 0
+                speech_len = 0

             yield voice_audio.tobytes(), is_final

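This is the heart of the commit: apply_vad no longer delegates to VADIterator, but feeds each chunk to the Silero model directly, thresholds the returned speech probability (new default 0.3), and tracks silence in plain sample counts itself. A minimal standalone sketch of that direct model usage, following the documented silero-vad torch.hub interface and assuming int16 PCM chunks at 16 kHz:

import numpy as np
import torch

SAMPLING_RATE = 16000
THRESHOLD = 0.3  # matches the controller's new default

# Same hub entry point the controller uses in __init__.
model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad",
                              model="silero_vad")

def speech_probability(chunk_int16: np.ndarray) -> float:
    # Convert int16 PCM to float32 in [-1, 1], as int2float does above.
    x = torch.from_numpy(chunk_int16.astype("float32") / 32768.0)
    # The model returns the probability that this chunk contains speech.
    return model(x, SAMPLING_RATE).item()

# Example: classify one microphone block.
# chunk = np.frombuffer(raw_bytes, dtype=np.int16)
# is_speech = speech_probability(chunk) >= THRESHOLD

Silence handling then stays inside the controller: gaps shorter than min_silence_duration_ms are still passed through, and is_final only fires once final_silence_limit and final_speech_limit (both converted from milliseconds to samples in __init__) are both satisfied.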
--- whisper_online.py
+++ whisper_online.py
@@ -4,7 +4,7 @@
 import librosa
 from functools import lru_cache
 import time
-
+import datetime


 @lru_cache
@@ -118,14 +118,21 @@
         return model

     def transcribe(self, audio, init_prompt=""):
+
+        # tiempo_inicio = datetime.datetime.now()
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
+
+        # print(f'({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")})----------r> whisper transcribe take { (datetime.datetime.now() -tiempo_inicio) } ms.')
+
         return list(segments)

     def ts_words(self, segments):
         o = []
         for segment in segments:
             for word in segment.words:
+                if segment.no_speech_prob > 0.9:
+                    continue
                 # not stripping the spaces -- should not be merged with them!
                 w = word.word
                 t = (word.start, word.end, w)
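The ts_words change drops words coming from segments that faster-whisper marks as probably non-speech (no_speech_prob > 0.9), which helps suppress hallucinated words on silent chunks. A minimal sketch of the same filter in isolation, assuming a faster-whisper model with word timestamps enabled (model size and input path are placeholders); checking at the segment level is equivalent to the per-word check in the diff, since no_speech_prob is a segment attribute:

from faster_whisper import WhisperModel

model = WhisperModel("base.en", device="cpu", compute_type="int8")  # placeholder size
segments, info = model.transcribe("audio.wav", beam_size=5, word_timestamps=True)

words = []
for segment in segments:
    if segment.no_speech_prob > 0.9:
        continue  # skip segments that are most likely silence or noise
    for word in segment.words:
        # Keep (start, end, word) tuples, spaces preserved, as ts_words does.
        words.append((word.start, word.end, word.word))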