

vad
@60c7aa643b1641fc59e37f8fb1dd3305a568f8d1
--- mic_test_whisper_simple.py
+++ mic_test_whisper_simple.py
... | ... | @@ -72,12 +72,12 @@ |
72 | 72 |
model = "large-v2" |
73 | 73 |
src_lan = "en" # source language |
74 | 74 |
tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used |
75 |
-use_vad_result = True |
|
75 |
+use_vad = False |
|
76 | 76 |
min_sample_length = 1 * SAMPLING_RATE |
77 | 77 |
|
78 | 78 |
|
79 | 79 |
|
80 |
-vad = VoiceActivityController(use_vad_result = use_vad_result) |
|
80 |
+vac = VoiceActivityController(use_vad_result = use_vad) |
|
81 | 81 |
asr = FasterWhisperASR(src_lan, "large-v2") # loads and wraps Whisper model |
82 | 82 |
|
83 | 83 |
tokenizer = create_tokenizer(tgt_lan) |
... | ... | @@ -85,7 +85,7 @@ |
85 | 85 |
|
86 | 86 |
|
87 | 87 |
stream = MicrophoneStream() |
88 |
-stream = vad.detect_user_speech(stream, audio_in_int16 = False) |
|
88 |
+stream = vac.detect_user_speech(stream, audio_in_int16 = False) |
|
89 | 89 |
stream = online.stream_process(stream) |
90 | 90 |
|
91 | 91 |
for isFinal, text in stream: |
--- mic_test_whisper_streaming.py
+++ mic_test_whisper_streaming.py
... | ... | @@ -13,7 +13,7 @@ |
13 | 13 |
src_lan = "en" # source language |
14 | 14 |
tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used |
15 | 15 |
use_vad_result = True |
16 |
-min_sample_length = 1.5 * SAMPLING_RATE |
|
16 |
+min_sample_length = 1 * SAMPLING_RATE |
|
17 | 17 |
|
18 | 18 |
|
19 | 19 |
|
... | ... | @@ -54,12 +54,12 @@ |
54 | 54 |
|
55 | 55 |
if is_final: |
56 | 56 |
o = online.finish() |
57 |
- online.init() |
|
58 | 57 |
# final_processing_pending = False |
59 | 58 |
print('-----'*10) |
60 | 59 |
complete_text = complete_text + o[2] |
61 | 60 |
print('FINAL - '+ complete_text) # do something with current partial output |
62 | 61 |
print('-----'*10) |
62 |
+ online.init() |
|
63 | 63 |
out = [] |
64 | 64 |
out_len = 0 |
65 | 65 |
|
--- voice_activity_controller.py
+++ voice_activity_controller.py
... | ... | @@ -76,7 +76,7 @@ |
76 | 76 |
if self.current_sample - self.temp_end < self.min_silence_samples: |
77 | 77 |
return audio, 0, window_size_samples |
78 | 78 |
else: |
79 |
- return np.array([], dtype=np.float16) , 0, window_size_samples |
|
79 |
+ return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples |
|
80 | 80 |
|
81 | 81 |
|
82 | 82 |
|
Add a comment
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?