

fixed silero vad chunk size
issues #141, #121, #142, #136, etc.
@6e80c9dea8fd5887d48240428629876bc79a94b9
--- silero_vad.py
+++ silero_vad_iterator.py
@@ -2,6 +2,7 @@
 
 # This is copied from silero-vad's vad_utils.py:
 # https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
+# (except changed defaults)
 
 # Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
 
@@ -10,8 +11,8 @@
                  model,
                  threshold: float = 0.5,
                  sampling_rate: int = 16000,
-                 min_silence_duration_ms: int = 100,
-                 speech_pad_ms: int = 30
+                 min_silence_duration_ms: int = 500,  # makes sense on one recording that I checked
+                 speech_pad_ms: int = 100  # same
                  ):
 
         """
@@ -95,11 +96,14 @@
         return None
 
 #######################
-# this is our workaround for Silero v5 requiring at least 512-sized audio chunks
-# (see https://github.com/ufal/whisper_streaming/issues/116 )
+# because Silero now requires exactly 512-sized audio chunks
 
 import numpy as np
 class FixedVADIterator(VADIterator):
+    '''It fixes VADIterator by allowing it to process audio of any length, not only exactly 512 frames at once.
+    If the audio to be processed at once is long and multiple voiced segments are detected,
+    then __call__ returns the start of the first segment, and the end (or middle, which means no end) of the last segment.
+    '''
 
     def reset_states(self):
         super().reset_states()
@@ -107,11 +111,19 @@
 
     def __call__(self, x, return_seconds=False):
         self.buffer = np.append(self.buffer, x)
-        if len(self.buffer) >= 512:
-            ret = super().__call__(self.buffer, return_seconds=return_seconds)
-            self.buffer = np.array([],dtype=np.float32)
-            return ret
-        return None
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # the latter end
+                if 'start' in r and 'end' in ret:  # there is an earlier start.
+                    # Remove end, merging this segment with the previous one.
+                    del ret['end']
+        return ret if ret != {} else None
 
 if __name__ == "__main__":
     # test/demonstrate the need for FixedVADIterator:
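
The new __call__ buffers incoming samples and feeds them to Silero in exact 512-sample windows, merging the per-window results into a single start/end dictionary. Below is a minimal usage sketch, not part of the patch: the model is loaded via torch.hub the same way whisper_online.py does it, and the chunk sizes and the silent input are made-up illustration values.

# Usage sketch (assumes torch and numpy are installed and torch.hub can fetch silero-vad).
import numpy as np
import torch

from silero_vad_iterator import FixedVADIterator

model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
vac = FixedVADIterator(model)  # defaults from the patch: 500ms min silence, 100ms padding
vac.reset_states()

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
print(vac(audio[:700]))   # one 512-sample window is processed, 188 samples stay in vac.buffer
print(vac(audio[700:]))   # buffered remainder plus new audio, processed 512 samples at a time

With the plain VADIterator, passing 700 samples at once would fail, since (per the comment in the patch) Silero now accepts only exactly 512-sized audio chunks; FixedVADIterator simply carries the leftover samples over to the next call.
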
--- whisper_online.py
+++ whisper_online.py
@@ -534,8 +534,8 @@
                 repo_or_dir='snakers4/silero-vad',
                 model='silero_vad'
         )
-        from silero_vad import FixedVADIterator
-        self.vac = FixedVADIterator(model)  # we use all the default options: 500ms silence, etc.
+        from silero_vad_iterator import FixedVADIterator
+        self.vac = FixedVADIterator(model)  # we use the default options there: 500ms silence, 100ms padding, etc.
 
         self.logfile = self.online.logfile
         self.init()
@@ -561,24 +561,31 @@
         self.audio_buffer = np.append(self.audio_buffer, audio)
 
         if res is not None:
-            frame = list(res.values())[0]
+            frame = list(res.values())[0]-self.buffer_offset
             if 'start' in res and 'end' not in res:
                 self.status = 'voice'
-                send_audio = self.audio_buffer[frame-self.buffer_offset:]
-                self.online.init(offset=frame/self.SAMPLING_RATE)
+                send_audio = self.audio_buffer[frame:]
+                self.online.init(offset=(frame+self.buffer_offset)/self.SAMPLING_RATE)
                 self.online.insert_audio_chunk(send_audio)
                 self.current_online_chunk_buffer_size += len(send_audio)
                 self.clear_buffer()
             elif 'end' in res and 'start' not in res:
                 self.status = 'nonvoice'
-                send_audio = self.audio_buffer[:frame-self.buffer_offset]
+                send_audio = self.audio_buffer[:frame]
                 self.online.insert_audio_chunk(send_audio)
                 self.current_online_chunk_buffer_size += len(send_audio)
                 self.is_currently_final = True
                 self.clear_buffer()
             else:
-                # It doesn't happen in the current code.
-                raise NotImplemented("both start and end of voice in one chunk!!!")
+                beg = res["start"]-self.buffer_offset
+                end = res["end"]-self.buffer_offset
+                self.status = 'nonvoice'
+                send_audio = self.audio_buffer[beg:end]
+                self.online.init(offset=(beg+self.buffer_offset)/self.SAMPLING_RATE)
+                self.online.insert_audio_chunk(send_audio)
+                self.current_online_chunk_buffer_size += len(send_audio)
+                self.is_currently_final = True
+                self.clear_buffer()
         else:
             if self.status == 'voice':
                 self.online.insert_audio_chunk(self.audio_buffer)
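
The new else branch handles a result that contains both 'start' and 'end', i.e. a complete voiced segment detected within one processed chunk: it cuts that segment out of audio_buffer, re-inits the online processor at the segment's absolute start time, and marks the chunk as final. A small worked example of the index arithmetic, with made-up numbers (buffer_offset and the segment boundaries are illustrative assumptions, not values from the patch):

# Illustrative numbers only: the VAD reports absolute sample indices, while
# audio_buffer is indexed relative to buffer_offset (samples already consumed).
SAMPLING_RATE = 16000
buffer_offset = 160000                          # 10 s of audio already consumed
res = {'start': 160800, 'end': 168000}          # complete voiced segment in one chunk

beg = res['start'] - buffer_offset              # 800: index into the local audio_buffer
end = res['end'] - buffer_offset                # 8000: index into the local audio_buffer
offset = (beg + buffer_offset) / SAMPLING_RATE  # 10.05: absolute start time in seconds
print(beg, end, offset)                         # audio_buffer[beg:end] is sent and marked final
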