

Merge remote-tracking branch 'rodrigo/main' into vad-streaming
@ecee2fbef721857dd270d51988135390c5291fcb
+++ mic_test_whisper_simple.py
@@ -0,0 +1,95 @@
from microphone_stream import MicrophoneStream
from voice_activity_controller import VoiceActivityController
from whisper_online import *
import numpy as np
import librosa
import io
import soundfile


class SimpleASRProcessor:
    """Buffers VAD-gated audio and transcribes a whole phrase once the VAD marks it final."""

    def __init__(self, asr, sampling_rate=16000):
        """Run this when starting or restarting processing."""
        self.audio_buffer = np.array([], dtype=np.float32)
        self.prompt_buffer = ""
        self.asr = asr
        self.sampling_rate = sampling_rate
        self.init_prompt = ''

    def ts_words(self, segments):
        # Concatenate the words of all segments, skipping segments that are
        # almost certainly silence (no_speech_prob > 0.9).
        result = ""
        for segment in segments:
            if segment.no_speech_prob > 0.9:
                continue
            for word in segment.words:
                result += word.word
        return result

    def stream_process(self, vad_result):
        # vad_result yields (chunk_bytes, is_final) pairs; yield (True, text) for a
        # finished phrase and (False, text) for periodic partial results.
        iter_in_phrase = 0
        for chunk, is_final in vad_result:
            iter_in_phrase += 1

            if chunk is not None:
                sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1, endian="LITTLE",
                                         samplerate=self.sampling_rate, subtype="PCM_16", format="RAW")
                audio, _ = librosa.load(sf, sr=self.sampling_rate)
                self.audio_buffer = np.append(self.audio_buffer, audio)

            if is_final and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                tsw = self.ts_words(res)

                # keep the last 100 characters of the transcript as the prompt for the next phrase
                self.init_prompt = (self.init_prompt + tsw)[-100:]
                self.audio_buffer = np.array([], dtype=np.float32)
                iter_in_phrase = 0

                yield True, tsw
            # show progress every 50 chunks
            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                # use custom ts_words
                tsw = self.ts_words(res)
                yield False, tsw


SAMPLING_RATE = 16000

model = "large-v2"
src_lan = "en"  # source language
tgt_lan = "en"  # target language -- same as source for ASR, "en" if translate task is used
use_vad = False
min_sample_length = 1 * SAMPLING_RATE


vac = VoiceActivityController(use_vad_result=use_vad)
asr = FasterWhisperASR(src_lan, model)  # loads and wraps the Whisper model

tokenizer = create_tokenizer(tgt_lan)
online = SimpleASRProcessor(asr)


# compose the pipeline: microphone chunks -> VAD-gated chunks -> transcription results
stream = MicrophoneStream()
stream = vac.detect_user_speech(stream, audio_in_int16=False)
stream = online.stream_process(stream)

for is_final, text in stream:
    if is_final:
        print(text, end="\r\n")
    else:
        print(text, end="\r")
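
The same generator pipeline can also be fed from a file, which is handy for testing without audio hardware. The helper below is a hypothetical sketch, not part of this diff: it mimics MicrophoneStream by yielding 40 ms chunks of 16-bit little-endian PCM, the format detect_user_speech expects when audio_in_int16=False; the file name is an assumption.

import numpy as np
import librosa

def file_chunk_stream(path, sampling_rate=16000, chunk_ms=40):
    """Hypothetical helper: yield raw 16-bit PCM chunks from an audio file,
    mimicking MicrophoneStream so the VAD/ASR pipeline can be tested offline."""
    audio, _ = librosa.load(path, sr=sampling_rate, mono=True)
    pcm = (audio * 32767).astype(np.int16)        # float32 in [-1, 1] -> int16
    chunk = int(sampling_rate * chunk_ms / 1000)  # samples per chunk
    for start in range(0, len(pcm), chunk):
        yield pcm[start:start + chunk].tobytes()

# usage sketch (assumed file name):
# stream = vac.detect_user_speech(file_chunk_stream("sample.wav"), audio_in_int16=False)
# for is_final, text in online.stream_process(stream):
#     print(text)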
+++ mic_test_whisper_streaming.py
@@ -0,0 +1,71 @@
from microphone_stream import MicrophoneStream
from voice_activity_controller import VoiceActivityController
from whisper_online import *
import numpy as np
import librosa
import io
import soundfile


SAMPLING_RATE = 16000
model = "large-v2"
src_lan = "en"  # source language
tgt_lan = "en"  # target language -- same as source for ASR, "en" if translate task is used
use_vad_result = True
min_sample_length = 1 * SAMPLING_RATE


asr = FasterWhisperASR(src_lan, model)       # loads and wraps the Whisper model
tokenizer = create_tokenizer(tgt_lan)        # sentence segmenter for the target language
online = OnlineASRProcessor(asr, tokenizer)  # create processing object

microphone_stream = MicrophoneStream()
vad = VoiceActivityController(use_vad_result=use_vad_result)

complete_text = ''
out = []     # audio chunks collected since the last call to the online processor
out_len = 0  # number of samples currently in `out`

# processing loop: the VAD yields (raw_bytes, is_final) pairs
for raw_bytes, is_final in vad.detect_user_speech(microphone_stream):
    if raw_bytes:
        sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1, endian="LITTLE",
                                 samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW")
        audio, _ = librosa.load(sf, sr=SAMPLING_RATE)
        out.append(audio)
        out_len += len(audio)

    if (is_final or out_len >= min_sample_length) and out_len > 0:
        a = np.concatenate(out)
        online.insert_audio_chunk(a)

        # >= so the collected chunks are always cleared after being inserted,
        # otherwise the same audio would be inserted again on the next iteration
        if out_len >= min_sample_length:
            o = online.process_iter()
            print('-----' * 10)
            complete_text = complete_text + o[2]
            print('PARTIAL - ' + complete_text)  # do something with the current partial output
            print('-----' * 10)
            out = []
            out_len = 0

        if is_final:
            o = online.finish()
            print('-----' * 10)
            complete_text = complete_text + o[2]
            print('FINAL - ' + complete_text)  # do something with the final output
            print('-----' * 10)
            online.init()
            out = []
            out_len = 0
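
The script above only uses o[2], the text field of what the online processor flushes; process_iter() and finish() return a (start, end, text) tuple with timestamps in seconds, where start can be empty when there is nothing to flush. The formatter below is a hypothetical sketch under that assumption about the whisper_online API, not code from this diff.

def format_output(o):
    """Hypothetical helper: pretty-print one OnlineASRProcessor result.
    Assumes o == (start_seconds, end_seconds, text); start may be None when
    nothing was flushed."""
    start, end, text = o
    if start is None or not text:
        return ''
    return f'{start:7.2f} {end:7.2f} {text.strip()}'

# e.g. print(format_output(online.process_iter()))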
+++ microphone_stream.py
@@ -0,0 +1,82 @@
### mic stream

try:
    import pyaudio
except ImportError:
    raise ImportError("pyaudio is not installed")


class MicrophoneStream:
    def __init__(
        self,
        sample_rate: int = 16000,
    ):
        """
        Creates a stream of audio from the microphone.

        Audio is captured as mono, 16-bit PCM in 40 ms chunks.

        Args:
            sample_rate: The sample rate to record audio at.
        """
        self._pyaudio = pyaudio.PyAudio()
        self.sample_rate = sample_rate

        # 40 ms of audio per chunk (640 samples at 16 kHz)
        self._chunk_size = int(self.sample_rate * 40 / 1000)
        self._stream = self._pyaudio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=sample_rate,
            input=True,
            frames_per_buffer=self._chunk_size,
        )

        self._open = True

    def __iter__(self):
        """
        Returns the iterator object.
        """
        return self

    def __next__(self):
        """
        Reads a chunk of audio from the microphone.
        """
        if not self._open:
            raise StopIteration

        try:
            return self._stream.read(self._chunk_size)
        except KeyboardInterrupt:
            # Ctrl+C while blocked on the microphone ends the iteration gracefully
            raise StopIteration

    def close(self):
        """
        Closes the stream.
        """
        self._open = False

        if self._stream.is_active():
            self._stream.stop_stream()

        self._stream.close()
        self._pyaudio.terminate()
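
A minimal usage sketch (assumed, not part of this diff): each iteration yields one 40 ms chunk of raw 16-bit PCM bytes, so at 16 kHz a chunk is 1280 bytes and roughly five seconds of audio is 125 chunks. The output file name is hypothetical.

# record about five seconds of audio and save it as raw PCM
mic = MicrophoneStream(sample_rate=16000)
chunks = []
for i, chunk in enumerate(mic):
    chunks.append(chunk)  # each chunk is 1280 bytes: 640 int16 samples (40 ms)
    if i >= 124:          # ~5 seconds at 25 chunks per second
        break
mic.close()

with open("capture.raw", "wb") as f:
    f.write(b"".join(chunks))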
+++ voice_activity_controller.py
@@ -0,0 +1,119 @@
import torch
import numpy as np


def int2float(sound):
    # convert int16 PCM samples to float32 in [-1, 1], as expected by the Silero VAD model
    abs_max = np.abs(sound).max()
    sound = sound.astype('float32')
    if abs_max > 0:
        sound *= 1 / 32768
    sound = sound.squeeze()  # depends on the use case
    return sound


class VoiceActivityController:
    def __init__(
        self,
        sampling_rate=16000,
        min_silence_to_final_ms=500,
        min_speech_to_final_ms=100,
        min_silence_duration_ms=100,
        use_vad_result=True,
        activity_detected_callback=None,
        threshold=0.3
    ):
        self.activity_detected_callback = activity_detected_callback
        self.model, self.utils = torch.hub.load(
            repo_or_dir='snakers4/silero-vad',
            model='silero_vad'
        )
        # (self.get_speech_timestamps,
        #  save_audio,
        #  read_audio,
        #  VADIterator,
        #  collect_chunks) = self.utils

        self.sampling_rate = sampling_rate
        # thresholds converted from milliseconds to samples
        self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000
        self.final_speech_limit = min_speech_to_final_ms * self.sampling_rate / 1000
        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000

        self.use_vad_result = use_vad_result
        self.last_marked_chunk = None
        self.threshold = threshold
        self.reset_states()

    def reset_states(self):
        self.model.reset_states()
        self.temp_end = 0
        self.current_sample = 0

    def apply_vad(self, audio):
        """Run one chunk through Silero VAD.

        Returns (audio, speech_samples, silence_samples): the chunk to forward
        (emptied when use_vad_result is set and the silence has lasted long enough),
        and how many of its samples were classified as speech / silence.
        """
        x = int2float(audio)
        if not torch.is_tensor(x):
            try:
                x = torch.Tensor(x)
            except Exception:
                raise TypeError("Audio cannot be cast to tensor. Cast it manually")

        speech_prob = self.model(x, self.sampling_rate).item()

        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
        self.current_sample += window_size_samples

        if speech_prob >= self.threshold:
            self.temp_end = 0
            return audio, window_size_samples, 0
        else:
            if not self.temp_end:
                self.temp_end = self.current_sample

            if self.current_sample - self.temp_end < self.min_silence_samples:
                return audio, 0, window_size_samples
            else:
                # long silence: optionally drop the audio so silence is not forwarded to the ASR
                return (np.array([], dtype=audio.dtype) if self.use_vad_result else audio), 0, window_size_samples

    def detect_user_speech(self, audio_stream, audio_in_int16=False):
        """Yield (audio_bytes, is_final) pairs; is_final marks the end of a phrase
        once enough trailing silence follows enough accumulated speech."""
        last_silence_len = 0
        speech_len = 0

        for data in audio_stream:
            audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
            wav = audio_block

            is_final = False
            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)

            if speech_in_wav > 0:
                last_silence_len = 0
                speech_len += speech_in_wav
                if self.activity_detected_callback is not None:
                    self.activity_detected_callback()

            last_silence_len += last_silent_in_wav
            if last_silence_len >= self.final_silence_limit and speech_len >= self.final_speech_limit:
                is_final = True
                last_silence_len = 0
                speech_len = 0

            yield voice_audio.tobytes(), is_final
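
With the defaults above (min_silence_to_final_ms=500, min_speech_to_final_ms=100) and the 40 ms chunks produced by MicrophoneStream, the finalization thresholds work out as below; the arithmetic is a sketch assuming 16 kHz input and one VAD call per chunk.

SAMPLING_RATE = 16000
CHUNK_MS = 40
chunk_samples = SAMPLING_RATE * CHUNK_MS // 1000    # 640 samples per chunk

final_silence_limit = 500 * SAMPLING_RATE // 1000   # 8000 samples of trailing silence
final_speech_limit = 100 * SAMPLING_RATE // 1000    # 1600 samples of speech

print(final_silence_limit / chunk_samples)  # 12.5 -> ~13 silent chunks (~520 ms) close a phrase
print(final_speech_limit / chunk_samples)   # 2.5  -> ~3 speech chunks (~120 ms) must come first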
--- whisper_online.py
+++ whisper_online.py
@@ -4,7 +4,7 @@
 import librosa
 from functools import lru_cache
 import time
-
+import datetime


 @lru_cache
@@ -118,14 +118,21 @@
         return model

     def transcribe(self, audio, init_prompt=""):
+
+        # tiempo_inicio = datetime.datetime.now()
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
+
+        # print(f'({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")})----------r> whisper transcribe take { (datetime.datetime.now() -tiempo_inicio) } ms.')
+
         return list(segments)

     def ts_words(self, segments):
         o = []
         for segment in segments:
             for word in segment.words:
+                if segment.no_speech_prob > 0.9:
+                    continue
                 # not stripping the spaces -- should not be merged with them!
                 w = word.word
                 t = (word.start, word.end, w)
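
The new no_speech_prob filter in ts_words is evaluated once per word even though it only depends on the segment. A behaviour-equivalent sketch with the check hoisted to segment level is shown below; it assumes the truncated hunk continues by appending each (start, end, word) tuple to o and returning it.

    def ts_words(self, segments):
        # same filtering as the patched version above, but the segment-level
        # no_speech_prob check runs once per segment instead of once per word
        o = []
        for segment in segments:
            if segment.no_speech_prob > 0.9:
                continue  # skip segments that are almost certainly silence
            for word in segment.words:
                # not stripping the spaces -- should not be merged with them!
                w = word.word
                t = (word.start, word.end, w)
                o.append(t)
        return o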