

vad @ 5b1edda944f525cf51a567a35d666c97872f2c4c
+++ mic_test_whisper_simple.py
@@ -0,0 +1,95 @@
from microphone_stream import MicrophoneStream
from voice_activity_controller import VoiceActivityController
from whisper_online import *
import numpy as np
import librosa
import io
import soundfile


class SimpleASRProcessor:

    def __init__(self, asr, sampling_rate=16000):
        """Run this when starting or restarting processing."""
        self.audio_buffer = np.array([], dtype=np.float32)
        self.prompt_buffer = ""
        self.asr = asr
        self.sampling_rate = sampling_rate
        self.init_prompt = ''

    def ts_words(self, segments):
        """Concatenate the words of all segments, skipping segments that are
        most likely non-speech (no_speech_prob above 0.9)."""
        result = ""
        for segment in segments:
            if segment.no_speech_prob > 0.9:
                continue
            for word in segment.words:
                result += word.word
        return result

    def stream_process(self, vad_result):
        """Consume (chunk, is_final) pairs from the VAD stream and yield
        (is_final, text) pairs."""
        iter_in_phrase = 0
        for chunk, is_final in vad_result:
            iter_in_phrase += 1

            if chunk:
                # decode the raw little-endian 16-bit PCM bytes into float32 samples
                sf = soundfile.SoundFile(
                    io.BytesIO(chunk), channels=1, endian="LITTLE",
                    samplerate=self.sampling_rate, subtype="PCM_16", format="RAW",
                )
                audio, _ = librosa.load(sf, sr=self.sampling_rate)
                self.audio_buffer = np.append(self.audio_buffer, audio)

            if is_final and len(self.audio_buffer) > 0:
                # end of phrase: transcribe the whole buffered phrase
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                tsw = self.ts_words(res)  # use custom ts_words
                # keep the last 100 characters as the prompt for the next phrase
                self.init_prompt = (self.init_prompt + tsw)[-100:]
                self.audio_buffer = np.array([], dtype=np.float32)
                iter_in_phrase = 0
                yield True, tsw
            # show progress every 20 chunks
            elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                tsw = self.ts_words(res)  # use custom ts_words
                yield False, tsw


SAMPLING_RATE = 16000

model = "large-v2"
src_lan = "en"  # source language
tgt_lan = "en"  # target language -- same as source for ASR, "en" if the translate task is used
use_vad_result = True
min_sample_length = 1 * SAMPLING_RATE

vad = VoiceActivityController(use_vad_result=use_vad_result)
asr = FasterWhisperASR(src_lan, model)  # loads and wraps the Whisper model

tokenizer = create_tokenizer(tgt_lan)
online = SimpleASRProcessor(asr)

# chain the generators: microphone -> VAD -> ASR
stream = MicrophoneStream()
stream = vad.detect_user_speech(stream, audio_in_int16=False)
stream = online.stream_process(stream)

for is_final, text in stream:
    if is_final:
        print(text, end="\r\n")
    else:
        print(text, end="\r")
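The three `stream = ...` assignments above build a lazy generator pipeline: each stage wraps the previous iterator, so microphone chunks are pulled through VAD and ASR one item at a time. A minimal, audio-free sketch of the same pattern (the stage names here are made up for illustration):

def source():
    yield from range(5)

def double(stream):
    for x in stream:
        yield x * 2

def label(stream):
    for x in stream:
        yield f"value={x}"

stream = source()
stream = double(stream)
stream = label(stream)

for item in stream:
    print(item)  # value=0, value=2, value=4, value=6, value=8

Nothing runs until the final for loop iterates; the same holds for the audio pipeline, which is why the script streams with low latency instead of buffering the whole input.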
+++ mic_test_whisper_streaming.py
@@ -0,0 +1,71 @@
from microphone_stream import MicrophoneStream
from voice_activity_controller import VoiceActivityController
from whisper_online import *
import numpy as np
import librosa
import io
import soundfile


SAMPLING_RATE = 16000
model = "large-v2"
src_lan = "en"  # source language
tgt_lan = "en"  # target language -- same as source for ASR, "en" if the translate task is used
use_vad_result = True
min_sample_length = 1 * SAMPLING_RATE


asr = FasterWhisperASR(src_lan, model)  # loads and wraps the Whisper model
tokenizer = create_tokenizer(tgt_lan)  # sentence segmenter for the target language
online = OnlineASRProcessor(asr, tokenizer)  # create the processing object

microphone_stream = MicrophoneStream()
vad = VoiceActivityController(use_vad_result=use_vad_result)

complete_text = ''
out = []
out_len = 0
for raw_bytes, is_final in vad.detect_user_speech(microphone_stream):  # processing loop
    if raw_bytes:
        # decode the raw little-endian 16-bit PCM bytes into float32 samples
        sf = soundfile.SoundFile(
            io.BytesIO(raw_bytes), channels=1, endian="LITTLE",
            samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW",
        )
        audio, _ = librosa.load(sf, sr=SAMPLING_RATE)
        out.append(audio)
        out_len += len(audio)

    if (is_final or out_len >= min_sample_length) and out_len > 0:
        a = np.concatenate(out)
        online.insert_audio_chunk(a)

        if out_len > min_sample_length:
            o = online.process_iter()
            print('-----' * 10)
            complete_text = complete_text + o[2]
            print('PARTIAL - ' + complete_text)  # do something with the current partial output
            print('-----' * 10)
            out = []
            out_len = 0

        if is_final:
            o = online.finish()
            online.init()
            print('-----' * 10)
            complete_text = complete_text + o[2]
            print('FINAL - ' + complete_text)  # do something with the final output
            print('-----' * 10)
            out = []
            out_len = 0
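Since `detect_user_speech` yields little-endian 16-bit PCM at the pipeline's own sampling rate (that is exactly what the `SoundFile` arguments above assert), the soundfile + librosa round trip could in principle be replaced by a direct NumPy conversion. A hedged sketch, not part of this PR:

import numpy as np

def pcm16_bytes_to_float32(raw_bytes: bytes) -> np.ndarray:
    # decode little-endian 16-bit PCM bytes into float32 samples in [-1, 1]
    samples = np.frombuffer(raw_bytes, dtype=np.int16)
    return samples.astype(np.float32) / 32768.0

librosa.load remains the safer choice if the byte format or sample rate might ever differ from the configured values.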
+++ microphone_stream.py
@@ -0,0 +1,82 @@
### mic stream


class MicrophoneStream:
    def __init__(
        self,
        sample_rate: int = 16000,
    ):
        """
        Creates a stream of audio from the microphone.

        Args:
            sample_rate: The sample rate to record audio at.
        """
        try:
            import pyaudio
        except ImportError:
            raise ImportError('pyaudio is not installed')

        self._pyaudio = pyaudio.PyAudio()
        self.sample_rate = sample_rate

        # read 100 ms of audio per chunk
        self._chunk_size = int(self.sample_rate * 0.1)
        self._stream = self._pyaudio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=sample_rate,
            input=True,
            frames_per_buffer=self._chunk_size,
        )

        self._open = True

    def __iter__(self):
        """
        Returns the iterator object.
        """
        return self

    def __next__(self):
        """
        Reads one chunk of raw int16 audio bytes from the microphone.
        """
        if not self._open:
            raise StopIteration

        try:
            return self._stream.read(self._chunk_size)
        except KeyboardInterrupt:
            raise StopIteration

    def close(self):
        """
        Closes the stream.
        """
        self._open = False

        if self._stream.is_active():
            self._stream.stop_stream()

        self._stream.close()
        self._pyaudio.terminate()
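A short usage sketch for `MicrophoneStream` on its own (assumes a working default input device; the chunk arithmetic follows the 100 ms `_chunk_size` above):

import numpy as np
from microphone_stream import MicrophoneStream

stream = MicrophoneStream(sample_rate=16000)
try:
    for i, chunk in enumerate(stream):
        samples = np.frombuffer(chunk, dtype=np.int16)  # 1600 samples per 100 ms chunk
        print(f"chunk {i}: {len(samples)} samples, peak {np.abs(samples).max()}")
        if i >= 9:  # stop after roughly one second
            break
finally:
    stream.close()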
+++ voice_activity_controller.py
@@ -0,0 +1,117 @@
import torch
import numpy as np


class VoiceActivityController:
    def __init__(
        self,
        sampling_rate=16000,
        second_ofSilence=0.5,
        second_ofSpeech=0.25,
        second_ofMinRecording=10,
        use_vad_result=True,
        activity_detected_callback=None,
    ):
        self.activity_detected_callback = activity_detected_callback
        self.model, self.utils = torch.hub.load(
            repo_or_dir='snakers4/silero-vad',
            model='silero_vad'
        )
        (self.get_speech_timestamps,
         save_audio,
         read_audio,
         VADIterator,
         collect_chunks) = self.utils

        self.sampling_rate = sampling_rate
        self.silence_limit = second_ofSilence * self.sampling_rate
        self.speech_limit = second_ofSpeech * self.sampling_rate
        self.MIN_RECORDING_LENGTH = second_ofMinRecording * self.sampling_rate

        self.use_vad_result = use_vad_result
        self.vad_iterator = VADIterator(
            model=self.model,
            threshold=0.3,
            sampling_rate=self.sampling_rate,
            min_silence_duration_ms=500,  # 100
            speech_pad_ms=400,  # 30
        )
        self.last_marked_chunk = None

    def int2float(self, sound):
        """Convert int16 PCM samples to float32 in [-1, 1]."""
        abs_max = np.abs(sound).max()
        sound = sound.astype('float32')
        if abs_max > 0:
            sound *= 1 / 32768
        sound = sound.squeeze()  # depends on the use case
        return sound

    def apply_vad(self, audio):
        """Run one chunk through the VAD iterator.

        Returns a tuple (voice_audio, speech_samples, silence_samples): the
        (possibly trimmed) audio, the number of speech samples in the chunk,
        and the number of trailing silent samples in the chunk.
        """
        audio_float32 = self.int2float(audio)
        chunk = self.vad_iterator(audio_float32, return_seconds=False)

        if chunk is not None:
            if "start" in chunk:
                # speech begins somewhere inside this chunk
                start = chunk["start"]
                self.last_marked_chunk = chunk
                return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0

            if "end" in chunk:
                # TODO: pending: get the padding from the next chunk
                end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
                self.last_marked_chunk = chunk
                return audio[:end] if self.use_vad_result else audio, end, len(audio) - end

        if self.last_marked_chunk is not None:
            if "start" in self.last_marked_chunk:
                # still inside a speech segment
                return audio, len(audio), 0

            if "end" in self.last_marked_chunk:
                # still inside a silent segment
                return np.array([], dtype=audio.dtype) if self.use_vad_result else audio, 0, len(audio)

        # no speech has been detected yet
        return np.array([], dtype=audio.dtype) if self.use_vad_result else audio, 0, 0

    def detect_user_speech(self, audio_stream, audio_in_int16=False):
        """Iterate over an audio stream and yield (voice_audio_bytes, is_final)
        pairs; is_final becomes True once enough speech has been followed by
        enough silence to close the phrase."""
        silence_len = 0
        speech_len = 0

        for data in audio_stream:
            # raw bytes are interpreted as int16 PCM unless the stream already yields int16 arrays
            audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
            wav = audio_block

            is_final = False
            voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav)

            if speech_in_wav > 0:
                silence_len = 0
                speech_len += speech_in_wav
                if self.activity_detected_callback is not None:
                    self.activity_detected_callback()

            silence_len = silence_len + last_silent_duration_in_wav
            if silence_len >= self.silence_limit and speech_len >= self.speech_limit:
                is_final = True
                silence_len = 0
                speech_len = 0

            yield voice_audio.tobytes(), is_final
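And a hedged usage sketch for `VoiceActivityController` by itself, printing phrase boundaries instead of transcribing (the first run downloads the Silero VAD model via torch.hub):

from microphone_stream import MicrophoneStream
from voice_activity_controller import VoiceActivityController

vad = VoiceActivityController(use_vad_result=True)
mic = MicrophoneStream()

phrase = b''
for voice_bytes, is_final in vad.detect_user_speech(mic, audio_in_int16=False):
    phrase += voice_bytes
    if is_final:
        seconds = len(phrase) / 2 / vad.sampling_rate  # int16 samples are 2 bytes each
        print(f'phrase finished: ~{seconds:.2f}s of detected speech')
        phrase = b''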