Rodrigo 2023-12-10
vad
@60c7aa643b1641fc59e37f8fb1dd3305a568f8d1
mic_test_whisper_simple.py
--- mic_test_whisper_simple.py
+++ mic_test_whisper_simple.py
@@ -72,12 +72,12 @@
 model = "large-v2"
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
-use_vad_result = True
+use_vad = False
 min_sample_length = 1 * SAMPLING_RATE
 
 
 
-vad = VoiceActivityController(use_vad_result = use_vad_result)
+vac = VoiceActivityController(use_vad_result = use_vad)
 asr = FasterWhisperASR(src_lan, "large-v2")  # loads and wraps Whisper model
 
 tokenizer = create_tokenizer(tgt_lan)
@@ -85,7 +85,7 @@
 
 
 stream = MicrophoneStream()
-stream = vad.detect_user_speech(stream, audio_in_int16 = False) 
+stream = vac.detect_user_speech(stream, audio_in_int16 = False) 
 stream = online.stream_process(stream)
 
 for isFinal, text in stream:
mic_test_whisper_streaming.py
--- mic_test_whisper_streaming.py
+++ mic_test_whisper_streaming.py
@@ -13,7 +13,7 @@
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
-min_sample_length = 1.5 * SAMPLING_RATE
+min_sample_length = 1 * SAMPLING_RATE
 
 
 
@@ -54,12 +54,12 @@
 
     if is_final:
         o = online.finish()
-        online.init()   
         # final_processing_pending = False         
         print('-----'*10)
         complete_text = complete_text + o[2]
         print('FINAL - '+ complete_text) # do something with current partial output
         print('-----'*10)   
+        online.init()   
         out = []
         out_len = 0    
         
voice_activity_controller.py
--- voice_activity_controller.py
+++ voice_activity_controller.py
@@ -76,7 +76,7 @@
             if self.current_sample - self.temp_end < self.min_silence_samples:
                 return audio, 0, window_size_samples
             else:
-                return np.array([], dtype=np.float16) , 0, window_size_samples
+                return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples
 
 
 
Add a comment
List