Rodrigo 2023-12-02
vad
@5b1edda944f525cf51a567a35d666c97872f2c4c
 
mic_test_whisper_simple.py (added)
+++ mic_test_whisper_simple.py
@@ -0,0 +1,95 @@
+from microphone_stream import MicrophoneStream
+from voice_activity_controller import VoiceActivityController
+from whisper_online import *  # provides FasterWhisperASR and create_tokenizer
+import numpy as np
+import librosa
+import io
+import soundfile
+
+
+
+
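+# Demo: microphone -> Silero VAD -> faster-whisper. The whole phrase buffer is
+# re-transcribed on every update, and the final text is emitted at phrase end.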
+class SimpleASRProcessor:
+    """Feeds VAD-segmented audio chunks to Whisper and yields incremental transcripts."""
+
+    def __init__(self, asr, sampling_rate=16000):
+        """Initialize (or re-initialize) the processing state."""
+        self.audio_buffer = np.array([], dtype=np.float32)
+        self.prompt_buffer = ""
+        self.asr = asr
+        self.sampling_rate = sampling_rate
+        self.init_prompt = ''
+    def ts_words(self, segments):
+        """Concatenate the words of all segments, skipping probable non-speech."""
+        result = ""
+        for segment in segments:
+            if segment.no_speech_prob > 0.9:
+                continue
+            for word in segment.words:
+                result += word.word
+        return result
+
+    def stream_process(self, vad_result):
+        """Consume (chunk, is_final) pairs from the VAD; yield (is_final, text)."""
+        iter_in_phrase = 0
+        for chunk, is_final in vad_result:
+            iter_in_phrase += 1
+
+            if chunk is not None:
+                sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1, endian="LITTLE",
+                                         samplerate=self.sampling_rate, subtype="PCM_16", format="RAW")
+                audio, _ = librosa.load(sf, sr=self.sampling_rate)
+                self.audio_buffer = np.append(self.audio_buffer, audio)
+
+            if is_final and len(self.audio_buffer) > 0:
+                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
+                # use custom ts_words
+                tsw = self.ts_words(res)
+                # keep the last 100 characters as the prompt for the next phrase
+                self.init_prompt = (self.init_prompt + tsw)[-100:]
+                self.audio_buffer = np.array([], dtype=np.float32)
+                iter_in_phrase = 0
+                yield True, tsw
+            # show progress every 20 chunks
+            elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
+                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
+                # use custom ts_words
+                tsw = self.ts_words(res)
+                yield False, tsw
+            
+        
+
+
+
+
+
+SAMPLING_RATE = 16000
+
+model = "large-v2"
+src_lan = "en"  # source language
+tgt_lan = "en"  # target language -- same as source for ASR, "en" if the translate task is used
+use_vad_result = True
+
+
+vad = VoiceActivityController(use_vad_result=use_vad_result)
+asr = FasterWhisperASR(src_lan, model)  # loads and wraps the Whisper model
+
+online = SimpleASRProcessor(asr)
+
+
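+# Pipeline: microphone frames -> VAD-segmented chunks -> incremental transcripts.
+# Each stage is a generator, so audio is pulled through lazily, chunk by chunk.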
+stream = MicrophoneStream()
+stream = vad.detect_user_speech(stream, audio_in_int16=False)
+stream = online.stream_process(stream)
+
+for is_final, text in stream:
+    if is_final:
+        print(text, end="\r\n")
+    else:
+        print(text, end="\r")
 
mic_test_whisper_streaming.py (added)
+++ mic_test_whisper_streaming.py
@@ -0,0 +1,71 @@
+from microphone_stream import MicrophoneStream
+from voice_activity_controller import VoiceActivityController
+from whisper_online import *  # provides FasterWhisperASR, create_tokenizer, OnlineASRProcessor
+import numpy as np
+import librosa
+import io
+import soundfile
+
+
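+# Demo: microphone -> Silero VAD -> whisper_streaming's OnlineASRProcessor,
+# which commits text incrementally once consecutive updates agree (LocalAgreement).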
+SAMPLING_RATE = 16000
+model = "large-v2"
+src_lan = "en"  # source language
+tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
+use_vad_result = True
+min_sample_length = 1 * SAMPLING_RATE
+
+
+
+asr = FasterWhisperASR(src_lan, model)  # loads and wraps Whisper model
+tokenizer = create_tokenizer(tgt_lan)  # sentence segmenter for the target language
+online = OnlineASRProcessor(asr, tokenizer)  # create processing object
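+# OnlineASRProcessor keeps a growing audio buffer: insert_audio_chunk() feeds it,
+# process_iter() returns newly committed text as a (start, end, text) tuple, and
+# finish() flushes whatever is still unconfirmed at the end of an utterance.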
+
+microphone_stream = MicrophoneStream()
+vad = VoiceActivityController(use_vad_result=use_vad_result)
+
+complete_text = ''
+out = []        # audio chunks buffered since the last call into the ASR
+out_len = 0     # number of buffered samples
+for raw_bytes, is_final in vad.detect_user_speech(microphone_stream):   # processing loop
+
+    if raw_bytes:
+        sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1, endian="LITTLE",
+                                 samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW")
+        audio, _ = librosa.load(sf, sr=SAMPLING_RATE)
+        out.append(audio)
+        out_len += len(audio)
+
+    # hand buffered audio to the ASR once a phrase ends or enough has accumulated
+    if (is_final or out_len >= min_sample_length) and out_len > 0:
+        a = np.concatenate(out)
+        online.insert_audio_chunk(a)
+
+    # >= keeps this branch in sync with the insert above, so the buffer is
+    # always cleared once its audio has been handed to the ASR
+    if out_len >= min_sample_length:
+        o = online.process_iter()
+        print('-----' * 10)
+        complete_text = complete_text + o[2]
+        print('PARTIAL - ' + complete_text)  # do something with the current partial output
+        print('-----' * 10)
+        out = []
+        out_len = 0
+
+    if is_final:
+        o = online.finish()
+        online.init()    # reset for the next phrase
+        print('-----' * 10)
+        complete_text = complete_text + o[2]
+        print('FINAL - ' + complete_text)  # do something with the final output
+        print('-----' * 10)
+        out = []
+        out_len = 0
+        
+
+
+
+
+
+
 
microphone_stream.py (added)
+++ microphone_stream.py
@@ -0,0 +1,82 @@
+
+
+### mic stream
+
+
+
+class MicrophoneStream:
+    def __init__(
+        self,
+        sample_rate: int = 16000,
+    ):
+        """
+        Creates a stream of audio from the microphone.
+
+        Args:
+            sample_rate: The sample rate to record audio at, in Hz.
+        """
+        try:
+            import pyaudio
+        except ImportError:
+            raise ImportError('pyaudio is not installed; install it with "pip install pyaudio"')
+
+        self._pyaudio = pyaudio.PyAudio()
+        self.sample_rate = sample_rate
+
+        # read 100 ms of audio per chunk
+        self._chunk_size = int(self.sample_rate * 0.1)
+        self._stream = self._pyaudio.open(
+            format=pyaudio.paInt16,
+            channels=1,
+            rate=sample_rate,
+            input=True,
+            frames_per_buffer=self._chunk_size,
+        )
+
+        self._open = True
+
+    def __iter__(self):
+        """
+        Returns the iterator object.
+        """
+
+        return self
+
+    def __next__(self):
+        """
+        Reads one chunk of raw PCM-16 audio from the microphone.
+        """
+        if not self._open:
+            raise StopIteration
+
+        try:
+            return self._stream.read(self._chunk_size)
+        except KeyboardInterrupt:
+            raise StopIteration
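+
+    # Note: pyaudio's Stream.read raises on buffer overflow by default; if the
+    # consumer cannot keep up, exception_on_overflow=False is a common workaround.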
+
+    def close(self):
+        """
+        Closes the stream.
+        """
+
+        self._open = False
+
+        if self._stream.is_active():
+            self._stream.stop_stream()
+
+        self._stream.close()
+        self._pyaudio.terminate()
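+
+
+# Example usage (a minimal sketch): read raw 100 ms PCM-16 chunks until
+# interrupted, then release the device. `handle` is a placeholder consumer.
+#
+#   stream = MicrophoneStream(sample_rate=16000)
+#   try:
+#       for chunk in stream:      # chunk: bytes, 1600 int16 samples at 16 kHz
+#           handle(chunk)
+#   finally:
+#       stream.close()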
+
+
+
+
+
+
+
+
+
 
voice_activity_controller.py (added)
+++ voice_activity_controller.py
@@ -0,0 +1,117 @@
+import torch
+import numpy as np
+
+
+class VoiceActivityController:
+    def __init__(
+            self,
+            sampling_rate = 16000,
+            seconds_of_silence = 0.5,
+            seconds_of_speech = 0.25,
+            seconds_of_min_recording = 10,
+            use_vad_result = True,
+            activity_detected_callback = None,
+        ):
+        self.activity_detected_callback = activity_detected_callback
+        # Silero VAD, fetched from torch.hub on first use
+        self.model, self.utils = torch.hub.load(
+            repo_or_dir='snakers4/silero-vad',
+            model='silero_vad'
+        )
+        (self.get_speech_timestamps,
+         save_audio,
+         read_audio,
+         VADIterator,
+         collect_chunks) = self.utils
+
+        self.sampling_rate = sampling_rate
+        self.silence_limit = seconds_of_silence * self.sampling_rate
+        self.speech_limit = seconds_of_speech * self.sampling_rate
+        self.min_recording_length = seconds_of_min_recording * self.sampling_rate
+
+        self.use_vad_result = use_vad_result
+        self.vad_iterator = VADIterator(
+            model = self.model,
+            threshold = 0.3,
+            sampling_rate = 16000,
+            min_silence_duration_ms = 500,  # 100
+            speech_pad_ms = 400  # 30
+        )
+        self.last_marked_chunk = None
+
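+    # Silero VAD expects float32 waveforms in [-1, 1]; the microphone delivers
+    # int16 PCM, hence the conversion below.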
+    def int2float(self, sound):
+        """Convert int16 PCM samples to float32 in [-1, 1] (e.g. 16384 -> 0.5)."""
+        abs_max = np.abs(sound).max()
+        sound = sound.astype('float32')
+        if abs_max > 0:
+            sound *= 1 / 32768
+        sound = sound.squeeze()  # depends on the use case
+        return sound
+
+    def apply_vad(self, audio):
+        """Run one chunk through the VAD iterator.
+
+        Returns (voice_audio, speech_samples, trailing_silence_samples).
+        """
+        audio_float32 = self.int2float(audio)
+        chunk = self.vad_iterator(audio_float32, return_seconds=False)
+
+        if chunk is not None:
+            if "start" in chunk:
+                start = chunk["start"]
+                self.last_marked_chunk = chunk
+                return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0
+
+            if "end" in chunk:
+                # todo: pending: take the padding from the next chunk
+                end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
+                self.last_marked_chunk = chunk
+                return audio[:end] if self.use_vad_result else audio, end, len(audio) - end
+
+        # no boundary in this chunk: classify it by the last boundary seen
+        if self.last_marked_chunk is not None:
+            if "start" in self.last_marked_chunk:
+                return audio, len(audio), 0
+
+            if "end" in self.last_marked_chunk:
+                return np.array([], dtype=audio.dtype) if self.use_vad_result else audio, 0, len(audio)
+
+        return np.array([], dtype=audio.dtype) if self.use_vad_result else audio, 0, 0
+
+
+
+    def detect_user_speech(self, audio_stream, audio_in_int16 = False):
+        """Yield (pcm16_bytes, is_final) pairs; is_final marks the end of a phrase."""
+        silence_len = 0
+        speech_len = 0
+
+        for data in audio_stream:
+            audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
+
+            is_final = False
+            voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(audio_block)
+
+            if speech_in_wav > 0:
+                silence_len = 0
+                speech_len += speech_in_wav
+                if self.activity_detected_callback is not None:
+                    self.activity_detected_callback()
+
+            silence_len = silence_len + last_silent_duration_in_wav
+            # a phrase is final after enough silence, provided enough speech was heard
+            if silence_len >= self.silence_limit and speech_len >= self.speech_limit:
+                is_final = True
+                silence_len = 0
+                speech_len = 0
+
+            yield voice_audio.tobytes(), is_final
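+
+
+# Example usage (a minimal sketch; `consume` is a placeholder for your handler):
+#
+#   vad = VoiceActivityController(use_vad_result=True)
+#   for chunk, is_final in vad.detect_user_speech(MicrophoneStream()):
+#       consume(chunk)            # PCM-16 bytes with non-speech trimmed out
+#       if is_final:
+#           print("end of phrase")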
+
+
+
+
+
+
+