

Merge remote-tracking branch 'rodrigo/main' into vad-streaming
@ecee2fbef721857dd270d51988135390c5291fcb
+++ mic_test_whisper_simple.py
@@ -0,0 +1,95 @@
from microphone_stream import MicrophoneStream
from voice_activity_controller import VoiceActivityController
from whisper_online import *
import numpy as np
import librosa
import io
import soundfile


class SimpleASRProcessor:
    """Buffers VAD-gated audio and transcribes a whole phrase once the VAD marks it final."""

    def __init__(self, asr, sampling_rate=16000):
        """Run this when starting or restarting processing."""
        self.audio_buffer = np.array([], dtype=np.float32)
        self.prompt_buffer = ""
        self.asr = asr
        self.sampling_rate = sampling_rate
        self.init_prompt = ''

    def ts_words(self, segments):
        # Concatenate the words of all segments, skipping segments that are
        # almost certainly silence (no_speech_prob > 0.9).
        result = ""
        for segment in segments:
            if segment.no_speech_prob > 0.9:
                continue
            for word in segment.words:
                result += word.word
        return result

    def stream_process(self, vad_result):
        # vad_result yields (chunk_bytes, is_final) pairs; yield (True, text) for a
        # finished phrase and (False, text) for periodic partial results.
        iter_in_phrase = 0
        for chunk, is_final in vad_result:
            iter_in_phrase += 1

            if chunk is not None:
                sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1, endian="LITTLE",
                                         samplerate=self.sampling_rate, subtype="PCM_16", format="RAW")
                audio, _ = librosa.load(sf, sr=self.sampling_rate)
                self.audio_buffer = np.append(self.audio_buffer, audio)

            if is_final and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                tsw = self.ts_words(res)

                # keep the last 100 characters of the transcript as the prompt for the next phrase
                self.init_prompt = (self.init_prompt + tsw)[-100:]
                self.audio_buffer = np.array([], dtype=np.float32)
                iter_in_phrase = 0

                yield True, tsw
            # show progress every 50 chunks
            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                # use custom ts_words
                tsw = self.ts_words(res)
                yield False, tsw


SAMPLING_RATE = 16000

model = "large-v2"
src_lan = "en"  # source language
tgt_lan = "en"  # target language -- same as source for ASR, "en" if translate task is used
use_vad = False
min_sample_length = 1 * SAMPLING_RATE


vac = VoiceActivityController(use_vad_result=use_vad)
asr = FasterWhisperASR(src_lan, model)  # loads and wraps the Whisper model

tokenizer = create_tokenizer(tgt_lan)
online = SimpleASRProcessor(asr)


# compose the pipeline: microphone chunks -> VAD-gated chunks -> transcription results
stream = MicrophoneStream()
stream = vac.detect_user_speech(stream, audio_in_int16=False)
stream = online.stream_process(stream)

for is_final, text in stream:
    if is_final:
        print(text, end="\r\n")
    else:
        print(text, end="\r")
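
The same generator pipeline can also be fed from a file, which is handy for testing without audio hardware. The helper below is a hypothetical sketch, not part of this diff: it mimics MicrophoneStream by yielding 40 ms chunks of 16-bit little-endian PCM, the format detect_user_speech expects when audio_in_int16=False; the file name is an assumption.

import numpy as np
import librosa

def file_chunk_stream(path, sampling_rate=16000, chunk_ms=40):
    """Hypothetical helper: yield raw 16-bit PCM chunks from an audio file,
    mimicking MicrophoneStream so the VAD/ASR pipeline can be tested offline."""
    audio, _ = librosa.load(path, sr=sampling_rate, mono=True)
    pcm = (audio * 32767).astype(np.int16)        # float32 in [-1, 1] -> int16
    chunk = int(sampling_rate * chunk_ms / 1000)  # samples per chunk
    for start in range(0, len(pcm), chunk):
        yield pcm[start:start + chunk].tobytes()

# usage sketch (assumed file name):
# stream = vac.detect_user_speech(file_chunk_stream("sample.wav"), audio_in_int16=False)
# for is_final, text in online.stream_process(stream):
#     print(text)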
+++ mic_test_whisper_streaming.py
@@ -0,0 +1,71 @@
from microphone_stream import MicrophoneStream
from voice_activity_controller import VoiceActivityController
from whisper_online import *
import numpy as np
import librosa
import io
import soundfile


SAMPLING_RATE = 16000
model = "large-v2"
src_lan = "en"  # source language
tgt_lan = "en"  # target language -- same as source for ASR, "en" if translate task is used
use_vad_result = True
min_sample_length = 1 * SAMPLING_RATE


asr = FasterWhisperASR(src_lan, model)       # loads and wraps the Whisper model
tokenizer = create_tokenizer(tgt_lan)        # sentence segmenter for the target language
online = OnlineASRProcessor(asr, tokenizer)  # create processing object

microphone_stream = MicrophoneStream()
vad = VoiceActivityController(use_vad_result=use_vad_result)

complete_text = ''
out = []     # audio chunks collected since the last call to the online processor
out_len = 0  # number of samples currently in `out`

# processing loop: the VAD yields (raw_bytes, is_final) pairs
for raw_bytes, is_final in vad.detect_user_speech(microphone_stream):
    if raw_bytes:
        sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1, endian="LITTLE",
                                 samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW")
        audio, _ = librosa.load(sf, sr=SAMPLING_RATE)
        out.append(audio)
        out_len += len(audio)

    if (is_final or out_len >= min_sample_length) and out_len > 0:
        a = np.concatenate(out)
        online.insert_audio_chunk(a)

        # >= so the collected chunks are always cleared after being inserted,
        # otherwise the same audio would be inserted again on the next iteration
        if out_len >= min_sample_length:
            o = online.process_iter()
            print('-----' * 10)
            complete_text = complete_text + o[2]
            print('PARTIAL - ' + complete_text)  # do something with the current partial output
            print('-----' * 10)
            out = []
            out_len = 0

        if is_final:
            o = online.finish()
            print('-----' * 10)
            complete_text = complete_text + o[2]
            print('FINAL - ' + complete_text)  # do something with the final output
            print('-----' * 10)
            online.init()
            out = []
            out_len = 0
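
The script above only uses o[2], the text field of what the online processor flushes; process_iter() and finish() return a (start, end, text) tuple with timestamps in seconds, where start can be empty when there is nothing to flush. The formatter below is a hypothetical sketch under that assumption about the whisper_online API, not code from this diff.

def format_output(o):
    """Hypothetical helper: pretty-print one OnlineASRProcessor result.
    Assumes o == (start_seconds, end_seconds, text); start may be None when
    nothing was flushed."""
    start, end, text = o
    if start is None or not text:
        return ''
    return f'{start:7.2f} {end:7.2f} {text.strip()}'

# e.g. print(format_output(online.process_iter()))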
+++ microphone_stream.py
@@ -0,0 +1,82 @@
### mic stream

try:
    import pyaudio
except ImportError:
    raise ImportError("pyaudio is not installed")


class MicrophoneStream:
    def __init__(
        self,
        sample_rate: int = 16000,
    ):
        """
        Creates a stream of audio from the microphone.

        Audio is captured as mono, 16-bit PCM in 40 ms chunks.

        Args:
            sample_rate: The sample rate to record audio at.
        """
        self._pyaudio = pyaudio.PyAudio()
        self.sample_rate = sample_rate

        # 40 ms of audio per chunk (640 samples at 16 kHz)
        self._chunk_size = int(self.sample_rate * 40 / 1000)
        self._stream = self._pyaudio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=sample_rate,
            input=True,
            frames_per_buffer=self._chunk_size,
        )

        self._open = True

    def __iter__(self):
        """
        Returns the iterator object.
        """
        return self

    def __next__(self):
        """
        Reads a chunk of audio from the microphone.
        """
        if not self._open:
            raise StopIteration

        try:
            return self._stream.read(self._chunk_size)
        except KeyboardInterrupt:
            # Ctrl+C while blocked on the microphone ends the iteration gracefully
            raise StopIteration

    def close(self):
        """
        Closes the stream.
        """
        self._open = False

        if self._stream.is_active():
            self._stream.stop_stream()

        self._stream.close()
        self._pyaudio.terminate()
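
A minimal usage sketch (assumed, not part of this diff): each iteration yields one 40 ms chunk of raw 16-bit PCM bytes, so at 16 kHz a chunk is 1280 bytes and roughly five seconds of audio is 125 chunks. The output file name is hypothetical.

# record about five seconds of audio and save it as raw PCM
mic = MicrophoneStream(sample_rate=16000)
chunks = []
for i, chunk in enumerate(mic):
    chunks.append(chunk)  # each chunk is 1280 bytes: 640 int16 samples (40 ms)
    if i >= 124:          # ~5 seconds at 25 chunks per second
        break
mic.close()

with open("capture.raw", "wb") as f:
    f.write(b"".join(chunks))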
+++ voice_activity_controller.py
@@ -0,0 +1,119 @@
import torch
import numpy as np


def int2float(sound):
    # convert int16 PCM samples to float32 in [-1, 1], as expected by the Silero VAD model
    abs_max = np.abs(sound).max()
    sound = sound.astype('float32')
    if abs_max > 0:
        sound *= 1 / 32768
    sound = sound.squeeze()  # depends on the use case
    return sound


class VoiceActivityController:
    def __init__(
        self,
        sampling_rate=16000,
        min_silence_to_final_ms=500,
        min_speech_to_final_ms=100,
        min_silence_duration_ms=100,
        use_vad_result=True,
        activity_detected_callback=None,
        threshold=0.3
    ):
        self.activity_detected_callback = activity_detected_callback
        self.model, self.utils = torch.hub.load(
            repo_or_dir='snakers4/silero-vad',
            model='silero_vad'
        )
        # (self.get_speech_timestamps,
        #  save_audio,
        #  read_audio,
        #  VADIterator,
        #  collect_chunks) = self.utils

        self.sampling_rate = sampling_rate
        # thresholds converted from milliseconds to samples
        self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000
        self.final_speech_limit = min_speech_to_final_ms * self.sampling_rate / 1000
        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000

        self.use_vad_result = use_vad_result
        self.last_marked_chunk = None
        self.threshold = threshold
        self.reset_states()

    def reset_states(self):
        self.model.reset_states()
        self.temp_end = 0
        self.current_sample = 0

    def apply_vad(self, audio):
        """Run one chunk through Silero VAD.

        Returns (audio, speech_samples, silence_samples): the chunk to forward
        (emptied when use_vad_result is set and the silence has lasted long enough),
        and how many of its samples were classified as speech / silence.
        """
        x = int2float(audio)
        if not torch.is_tensor(x):
            try:
                x = torch.Tensor(x)
            except Exception:
                raise TypeError("Audio cannot be cast to tensor. Cast it manually")

        speech_prob = self.model(x, self.sampling_rate).item()

        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
        self.current_sample += window_size_samples

        if speech_prob >= self.threshold:
            self.temp_end = 0
            return audio, window_size_samples, 0
        else:
            if not self.temp_end:
                self.temp_end = self.current_sample

            if self.current_sample - self.temp_end < self.min_silence_samples:
                return audio, 0, window_size_samples
            else:
                # long silence: optionally drop the audio so silence is not forwarded to the ASR
                return (np.array([], dtype=audio.dtype) if self.use_vad_result else audio), 0, window_size_samples

    def detect_user_speech(self, audio_stream, audio_in_int16=False):
        """Yield (audio_bytes, is_final) pairs; is_final marks the end of a phrase
        once enough trailing silence follows enough accumulated speech."""
        last_silence_len = 0
        speech_len = 0

        for data in audio_stream:
            audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
            wav = audio_block

            is_final = False
            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)

            if speech_in_wav > 0:
                last_silence_len = 0
                speech_len += speech_in_wav
                if self.activity_detected_callback is not None:
                    self.activity_detected_callback()

            last_silence_len += last_silent_in_wav
            if last_silence_len >= self.final_silence_limit and speech_len >= self.final_speech_limit:
                is_final = True
                last_silence_len = 0
                speech_len = 0

            yield voice_audio.tobytes(), is_final
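
With the defaults above (min_silence_to_final_ms=500, min_speech_to_final_ms=100) and the 40 ms chunks produced by MicrophoneStream, the finalization thresholds work out as below; the arithmetic is a sketch assuming 16 kHz input and one VAD call per chunk.

SAMPLING_RATE = 16000
CHUNK_MS = 40
chunk_samples = SAMPLING_RATE * CHUNK_MS // 1000    # 640 samples per chunk

final_silence_limit = 500 * SAMPLING_RATE // 1000   # 8000 samples of trailing silence
final_speech_limit = 100 * SAMPLING_RATE // 1000    # 1600 samples of speech

print(final_silence_limit / chunk_samples)  # 12.5 -> ~13 silent chunks (~520 ms) close a phrase
print(final_speech_limit / chunk_samples)   # 2.5  -> ~3 speech chunks (~120 ms) must come first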
--- whisper_online.py
+++ whisper_online.py
@@ -4,7 +4,7 @@
 import librosa
 from functools import lru_cache
 import time
-
+import datetime


 @lru_cache
@@ -118,14 +118,21 @@
         return model

     def transcribe(self, audio, init_prompt=""):
+
+        # tiempo_inicio = datetime.datetime.now()
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
+
+        # print(f'({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")})----------r> whisper transcribe take { (datetime.datetime.now() -tiempo_inicio) } ms.')
+
         return list(segments)

     def ts_words(self, segments):
         o = []
         for segment in segments:
             for word in segment.words:
+                if segment.no_speech_prob > 0.9:
+                    continue
                 # not stripping the spaces -- should not be merged with them!
                 w = word.word
                 t = (word.start, word.end, w)
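
The new no_speech_prob filter in ts_words is evaluated once per word even though it only depends on the segment. A behaviour-equivalent sketch with the check hoisted to segment level is shown below; it assumes the truncated hunk continues by appending each (start, end, word) tuple to o and returning it.

    def ts_words(self, segments):
        # same filtering as the patched version above, but the segment-level
        # no_speech_prob check runs once per segment instead of once per word
        o = []
        for segment in segments:
            if segment.no_speech_prob > 0.9:
                continue  # skip segments that are almost certainly silence
            for word in segment.words:
                # not stripping the spaces -- should not be merged with them!
                w = word.word
                t = (word.start, word.end, w)
                o.append(t)
        return o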