Dominik Macháček 2024-11-29
fixed silero vad chunk size
issues #141 #121 #142 #136 etc.
@6e80c9dea8fd5887d48240428629876bc79a94b9
silero_vad_iterator.py (Renamed from silero_vad.py)
--- silero_vad.py
+++ silero_vad_iterator.py
@@ -2,6 +2,7 @@
 
 # This is copied from silero-vad's vad_utils.py:
 # https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
+# (except changed defaults)
 
 # Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
 
@@ -10,8 +11,8 @@
                  model,
                  threshold: float = 0.5,
                  sampling_rate: int = 16000,
-                 min_silence_duration_ms: int = 100,
-                 speech_pad_ms: int = 30
+                 min_silence_duration_ms: int = 500,  # makes sense on one recording that I checked
+                 speech_pad_ms: int = 100             # same 
                  ):
 
         """
@@ -95,11 +96,14 @@
         return None
 
 #######################
-# this is our workaround for Silero v5 requiring at least 512-sized audio chunks 
-# (see https://github.com/ufal/whisper_streaming/issues/116 )
+# because Silero VAD now requires audio chunks of exactly 512 samples
 
 import numpy as np
 class FixedVADIterator(VADIterator):
+    '''It fixes VADIterator by allowing it to process audio of any length, not only exactly 512 frames at once.
+    If the audio to be processed at once is long and multiple voiced segments are detected,
+    then __call__ returns the start of the first segment, and the end (or "middle", which means no end) of the last segment.
+    '''
 
     def reset_states(self):
         super().reset_states()
@@ -107,11 +111,19 @@
 
     def __call__(self, x, return_seconds=False):
         self.buffer = np.append(self.buffer, x) 
-        if len(self.buffer) >= 512:
-            ret = super().__call__(self.buffer, return_seconds=return_seconds)
-            self.buffer = np.array([],dtype=np.float32)
-            return ret
-        return None
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # the latter end
+                if 'start' in r and 'end' in ret:  # there is an earlier start.
+                    # Remove end, merging this segment with the previous one.
+                    del ret['end']
+        return ret if ret != {} else None
 
 if __name__ == "__main__":
     # test/demonstrate the need for FixedVADIterator:
whisper_online.py
--- whisper_online.py
+++ whisper_online.py
@@ -534,8 +534,8 @@
             repo_or_dir='snakers4/silero-vad',
             model='silero_vad'
         )
-        from silero_vad import FixedVADIterator
-        self.vac = FixedVADIterator(model)  # we use all the default options: 500ms silence, etc.  
+        from silero_vad_iterator import FixedVADIterator
+        self.vac = FixedVADIterator(model)  # we use the default options there: 500ms silence, 100ms padding, etc.  
 
         self.logfile = self.online.logfile
         self.init()
@@ -561,24 +561,31 @@
         self.audio_buffer = np.append(self.audio_buffer, audio)
 
         if res is not None:
-            frame = list(res.values())[0]
+            frame = list(res.values())[0]-self.buffer_offset
             if 'start' in res and 'end' not in res:
                 self.status = 'voice'
-                send_audio = self.audio_buffer[frame-self.buffer_offset:]
-                self.online.init(offset=frame/self.SAMPLING_RATE)
+                send_audio = self.audio_buffer[frame:]
+                self.online.init(offset=(frame+self.buffer_offset)/self.SAMPLING_RATE)
                 self.online.insert_audio_chunk(send_audio)
                 self.current_online_chunk_buffer_size += len(send_audio)
                 self.clear_buffer()
             elif 'end' in res and 'start' not in res:
                 self.status = 'nonvoice'
-                send_audio = self.audio_buffer[:frame-self.buffer_offset]
+                send_audio = self.audio_buffer[:frame]
                 self.online.insert_audio_chunk(send_audio)
                 self.current_online_chunk_buffer_size += len(send_audio)
                 self.is_currently_final = True
                 self.clear_buffer()
             else:
-                # It doesn't happen in the current code.
-                raise NotImplemented("both start and end of voice in one chunk!!!")
+                beg = res["start"]-self.buffer_offset
+                end = res["end"]-self.buffer_offset
+                self.status = 'nonvoice'
+                send_audio = self.audio_buffer[beg:end]
+                self.online.init(offset=(beg+self.buffer_offset)/self.SAMPLING_RATE)
+                self.online.insert_audio_chunk(send_audio)
+                self.current_online_chunk_buffer_size += len(send_audio)
+                self.is_currently_final = True
+                self.clear_buffer()
         else:
             if self.status == 'voice':
                 self.online.insert_audio_chunk(self.audio_buffer)
Add a comment
List