Dominik Macháček 2024-08-19
clean code with VAC
@c300a03e47eab86e5de54d816f55fb05c3a15d02
 
silero_vad.py (added)
+++ silero_vad.py
@@ -0,0 +1,95 @@
+import torch
+
+# this is copy-pasted from silero-vad's utils_vad.py:
+# https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
+
+class VADIterator:
+    def __init__(self,
+                 model,
+                 threshold: float = 0.5,
+                 sampling_rate: int = 16000,
+                 min_silence_duration_ms: int = 100,
+                 speech_pad_ms: int = 30
+                 ):
+
+        """
+        Class for stream imitation
+
+        Parameters
+        ----------
+        model: preloaded .jit silero VAD model
+
+        threshold: float (default - 0.5)
+            Speech threshold. Silero VAD outputs speech probabilities for each audio chunk; probabilities ABOVE this value are considered SPEECH.
+            It is better to tune this parameter for each dataset separately, but the "lazy" 0.5 is pretty good for most datasets.
+
+        sampling_rate: int (default - 16000)
+            Currently silero VAD models support 8000 and 16000 sample rates
+
+        min_silence_duration_ms: int (default - 100 milliseconds)
+            At the end of each speech chunk, wait for min_silence_duration_ms before separating it
+
+        speech_pad_ms: int (default - 30 milliseconds)
+            Final speech chunks are padded by speech_pad_ms on each side
+        """
+
+        self.model = model
+        self.threshold = threshold
+        self.sampling_rate = sampling_rate
+
+        if sampling_rate not in [8000, 16000]:
+            raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
+
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        self.reset_states()
+
+    def reset_states(self):
+
+        self.model.reset_states()
+        self.triggered = False
+        self.temp_end = 0
+        self.current_sample = 0
+
+    def __call__(self, x, return_seconds=False):
+        """
+        x: torch.Tensor
+            audio chunk (see examples in repo)
+
+        return_seconds: bool (default - False)
+            whether to return timestamps in seconds (default - samples)
+        """
+
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except Exception:
+                raise TypeError("Audio cannot be cast to tensor. Cast it manually")
+
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples
+
+        speech_prob = self.model(x, self.sampling_rate).item()
+
+        if (speech_prob >= self.threshold) and self.temp_end:
+            self.temp_end = 0
+
+        if (speech_prob >= self.threshold) and not self.triggered:
+            self.triggered = True
+            speech_start = self.current_sample - self.speech_pad_samples
+            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
+
+        if (speech_prob < self.threshold - 0.15) and self.triggered:
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return None
+            else:
+                speech_end = self.temp_end + self.speech_pad_samples
+                self.temp_end = 0
+                self.triggered = False
+                return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
+
+        return None
+
+
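For orientation, a minimal usage sketch of VADIterator: the torch.hub load follows the controller below, the 512-sample window is an assumption for illustration, and torch.randn stands in for real 16 kHz audio.

    import torch
    from silero_vad import VADIterator

    model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
    vad = VADIterator(model)

    wav = torch.randn(16000)                      # stand-in for 1 s of real 16 kHz audio
    for i in range(0, len(wav) - 512 + 1, 512):   # feed fixed-size windows
        event = vad(wav[i:i + 512])
        if event is not None:
            print(event)                          # {'start': <sample>} or {'end': <sample>}
    vad.reset_states()                            # reset before processing another stream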
voice_activity_controller.py
--- voice_activity_controller.py
+++ voice_activity_controller.py
@@ -1,111 +1,35 @@
 import torch
-import numpy as np
+from silero_vad import VADIterator
+import time
 
 class VoiceActivityController:
-    def __init__(
-            self, 
-            sampling_rate = 16000,
-            min_silence_to_final_ms = 500,
-            min_speech_to_final_ms = 100,
-            min_silence_duration_ms = 100,
-            use_vad_result = True,
-#            activity_detected_callback=None,
-            threshold =0.3
-        ):
-#        self.activity_detected_callback=activity_detected_callback
-        self.model, self.utils = torch.hub.load(
+    SAMPLING_RATE = 16000
+    def __init__(self):
+        self.model, _ = torch.hub.load(
             repo_or_dir='snakers4/silero-vad',
             model='silero_vad'
         )
-        # (self.get_speech_timestamps,
-        # save_audio,
-        # read_audio,
-        # VADIterator,
-        # collect_chunks) = self.utils
+        # we use the default options: threshold 0.5, 100 ms min silence, 30 ms speech padding
+        self.iterator = VADIterator(self.model)
 
-        self.sampling_rate = sampling_rate  
-        self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000 
-        self.final_speech_limit = min_speech_to_final_ms *self.sampling_rate / 1000
-        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+    def reset(self):
+        self.iterator.reset_states()
 
-        self.use_vad_result = use_vad_result
-        self.threshold = threshold
-        self.reset_states()
-
-    def reset_states(self):
-        self.model.reset_states()
-        self.temp_end = 0
-        self.current_sample = 0
-
-        self.last_silence_len= 0
-        self.speech_len = 0
-
-    def apply_vad(self, audio):
-        """
-        returns: triple
-            (voice_audio,
-            speech_in_wav,
-            silence_in_wav)
-
-        """
-        print("applying vad here")
+    def __call__(self, audio):
+        '''
+        audio: audio chunk as np.ndarray, in the format used throughout this project
+        returns:
+        - { 'start': time_frame } ... when the start of voice is detected; time_frame is a sample index (can be converted to seconds)
+        - { 'end': time_frame }   ... when the end of voice is detected
+        - None                    ... when the current chunk causes no state change
+        '''
         x = audio
-        if not torch.is_tensor(x):
-            try:
-                x = torch.Tensor(x)
-            except:
-                raise TypeError("Audio cannot be casted to tensor. Cast it manually")
-
-        speech_prob = self.model(x, self.sampling_rate).item()
-        print("speech_prob",speech_prob)
-        
-        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
-        self.current_sample += window_size_samples 
-
-        if speech_prob >= self.threshold:  # speech is detected
-            self.temp_end = 0
-            return audio, window_size_samples, 0
-
-        else:  # silence detected, counting w
-            if not self.temp_end:
-                self.temp_end = self.current_sample
-
-            if self.current_sample - self.temp_end < self.min_silence_samples:
-                return audio, 0, window_size_samples
-            else:
-                return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples
-
-
-    def detect_speech_iter(self, data, audio_in_int16 = False):
-        audio_block = data
-        wav = audio_block
-
-        is_final = False
-        voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
-        print("speech, last silence",speech_in_wav, last_silent_in_wav)
-
-
-        if speech_in_wav > 0 :
-            self.last_silence_len= 0                
-            self.speech_len += speech_in_wav
-#            if self.activity_detected_callback is not None:
-#                self.activity_detected_callback()
-
-        self.last_silence_len +=  last_silent_in_wav
-        print("self.last_silence_len",self.last_silence_len, self.final_silence_limit,self.last_silence_len>= self.final_silence_limit)
-        print("self.speech_len, final_speech_limit",self.speech_len , self.final_speech_limit,self.speech_len >= self.final_speech_limit)
-        if self.last_silence_len>= self.final_silence_limit and self.speech_len >= self.final_speech_limit:
-            for i in range(10): print("TADY!!!")
-
-            is_final = True
-            self.last_silence_len= 0
-            self.speech_len = 0                
-
-        return voice_audio, is_final
-
-    def detect_user_speech(self, audio_stream, audio_in_int16 = False):
-        self.last_silence_len= 0
-        self.speech_len = 0
-
-        for data in audio_stream:  # replace with your condition of choice
-            yield self.detect_speech_iter(data, audio_in_int16)
+#        if not torch.is_tensor(x):
+#            try:
+#                x = torch.Tensor(x)
+#            except:
+#                raise TypeError("Audio cannot be casted to tensor. Cast it manually")
+        t = time.time()
+        a = self.iterator(x)
+        print("VAD took ",time.time()-t,"seconds")
+        return a
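A minimal sketch of driving the simplified controller; the 512-sample chunk is an assumption for illustration, and np.zeros stands in for real audio.

    import numpy as np
    from voice_activity_controller import VoiceActivityController

    vac = VoiceActivityController()
    chunk = np.zeros(512, dtype=np.float32)   # stand-in for a real 16 kHz audio chunk
    res = vac(chunk)
    if res is None:
        pass                                  # no state change in this chunk
    elif 'start' in res:
        print("voice starts at sample", res['start'])
    elif 'end' in res:
        print("voice ends at sample", res['end'])
    vac.reset()                               # reset between streams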
whisper_online.py
--- whisper_online.py
+++ whisper_online.py
@@ -331,16 +331,14 @@
 
         self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
 
-    def init(self, keep_offset=False):
+    def init(self, offset=None):
         """run this when starting or restarting processing"""
         self.audio_buffer = np.array([],dtype=np.float32)
         self.transcript_buffer = HypothesisBuffer(logfile=self.logfile)
-        if not keep_offset:
-            self.buffer_time_offset = 0
-            self.transcript_buffer.last_commited_time = 0
-        else:
-            self.transcript_buffer.last_commited_time = self.buffer_time_offset
-
+        self.buffer_time_offset = 0
+        if offset is not None:
+            self.buffer_time_offset = offset
+        self.transcript_buffer.last_commited_time = self.buffer_time_offset
         self.commited = []
 
     def insert_audio_chunk(self, audio):
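The new offset parameter replaces the keep_offset flag: the caller now states the absolute stream time at which processing restarts. A hypothetical illustration (the FasterWhisperASR arguments are placeholders):

    from whisper_online import FasterWhisperASR, OnlineASRProcessor

    asr = FasterWhisperASR("en", "large-v2")   # placeholder language and model
    online = OnlineASRProcessor(asr)
    online.init(offset=12.5)                   # restart processing 12.5 s into the stream
    # timestamps committed from now on start at 12.5 s, because both
    # buffer_time_offset and last_commited_time are initialized to the offset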
@@ -529,27 +527,71 @@
         self.online_chunk_size = online_chunk_size
 
         self.online = OnlineASRProcessor(*a, **kw)
-        from voice_activity_controller import VoiceActivityController
-        self.vac = VoiceActivityController(use_vad_result = False)
+
+        # VAC:
+        import torch
+        model, _ = torch.hub.load(
+            repo_or_dir='snakers4/silero-vad',
+            model='silero_vad'
+        )
+        from silero_vad import VADIterator
+        self.vac = VADIterator(model)  # we use the default options: threshold 0.5, 100 ms min silence, 30 ms speech padding
 
         self.logfile = self.online.logfile
-
         self.init()
 
     def init(self):
         self.online.init()
         self.vac.reset_states()
         self.current_online_chunk_buffer_size = 0
+
         self.is_currently_final = False
+
+        self.status = None  # or "voice" or "nonvoice"
+        self.audio_buffer = np.array([],dtype=np.float32)
+        self.buffer_offset = 0  # in frames
+
+    def clear_buffer(self):
+        self.buffer_offset += len(self.audio_buffer)
+        self.audio_buffer = np.array([],dtype=np.float32)
 
 
     def insert_audio_chunk(self, audio):
-        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
-        audio, is_final = r
-        print(is_final)
-        self.is_currently_final = is_final
-        self.online.insert_audio_chunk(audio)
-        self.current_online_chunk_buffer_size += len(audio)
+        res = self.vac(audio)
+        print(res)
+        self.audio_buffer = np.append(self.audio_buffer, audio)
+
+        if res is not None:
+            frame = list(res.values())[0]
+            if 'start' in res and 'end' not in res:
+                self.status = 'voice'
+                send_audio = self.audio_buffer[frame-self.buffer_offset:]
+                self.online.init(offset=frame/self.SAMPLING_RATE)
+                self.online.insert_audio_chunk(send_audio)
+                self.current_online_chunk_buffer_size += len(send_audio)
+                self.clear_buffer()
+            elif 'end' in res and 'start' not in res:
+                self.status = 'nonvoice'
+                send_audio = self.audio_buffer[:frame-self.buffer_offset]
+                self.online.insert_audio_chunk(send_audio)
+                self.current_online_chunk_buffer_size += len(send_audio)
+                self.is_currently_final = True
+                self.clear_buffer()
+            else:
+                # This does not happen with the current VADIterator, which reports at most one event per chunk.
+                raise NotImplementedError("both start and end of voice in one chunk")
+        else:
+            if self.status == 'voice':
+                self.online.insert_audio_chunk(self.audio_buffer)
+                self.current_online_chunk_buffer_size += len(self.audio_buffer)
+            if self.status is not None:
+                self.clear_buffer()
+            else:  # we are at the beginning of the process and no voice has been detected yet
+                # Keep the last second of audio, because the VAD may later find the start of voice in it,
+                # but trim the rest to bound memory use.
+                self.buffer_offset += max(0,len(self.audio_buffer)-self.SAMPLING_RATE)
+                self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
+
 
     def process_iter(self):
         if self.is_currently_final:
@@ -559,13 +601,13 @@
             ret = self.online.process_iter()
             return ret
         else:
-            print("no online update, only VAD", file=self.logfile)
+            print("no online update, only VAD", self.status, file=self.logfile)
             return (None, None, "")
 
     def finish(self):
         ret = self.online.finish()
-        self.online.init(keep_offset=True)
         self.current_online_chunk_buffer_size = 0
+        self.is_currently_final = False
         return ret
 
 
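Taken together, a sketch of how the refactored VACOnlineASRProcessor is driven; the constructor arguments and the chunking are assumptions based on the surrounding code, and np.zeros stands in for real audio.

    import numpy as np
    from whisper_online import FasterWhisperASR, VACOnlineASRProcessor

    asr = FasterWhisperASR("en", "large-v2")      # placeholder language and model
    proc = VACOnlineASRProcessor(1.0, asr)        # 1.0 s online chunk size
    wav = np.zeros(5 * 16000, dtype=np.float32)   # stand-in for 5 s of real audio
    for i in range(0, len(wav), 512):             # feed the stream chunk by chunk
        proc.insert_audio_chunk(wav[i:i + 512])
        beg, end, text = proc.process_iter()
        if text:
            print(beg, end, text)
    print(proc.finish())                          # flush at the end of the stream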