Dominik Macháček 2024-08-19
remove mic test and streams
@74b80e376624506dc4625dc413458d2d1abaea4a
 
mic_test_whisper_simple.py (deleted)
--- mic_test_whisper_simple.py
@@ -1,95 +0,0 @@
-from microphone_stream import MicrophoneStream
-from voice_activity_controller import VoiceActivityController
-from whisper_online import *
-import numpy as np
-import librosa  
-import io
-import soundfile
-import sys
-
-
-
-
-class SimpleASRProcessor:
-
-    def __init__(self, asr, sampling_rate = 16000):
-        """run this when starting or restarting processing"""
-        self.audio_buffer = np.array([],dtype=np.float32)
-        self.prompt_buffer = ""
-        self.asr = asr
-        self.sampling_rate = sampling_rate
-        self.init_prompt = ''
-
-    def ts_words(self, segments):
-        result = ""
-        for segment in segments:
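-            # skip segments that Whisper marks as likely non-speech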
-            if segment.no_speech_prob > 0.9:
-                continue
-            for word in segment.words:
-                w = word.word
-                t = (word.start, word.end, w)
-                result +=w
-        return result 
-
-    def stream_process(self, vad_result):
-        iter_in_phrase = 0
-        for chunk, is_final in vad_result:
-            iter_in_phrase += 1
-
-            if chunk is not None:
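-                # decode raw 16-bit PCM bytes into a float32 waveform at SAMPLING_RATE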
-                sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
-                audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-                out = []
-                out.append(audio)
-                a = np.concatenate(out)
-                self.audio_buffer = np.append(self.audio_buffer, a)
-
-            if is_final and len(self.audio_buffer) > 0:
-                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-                tsw = self.ts_words(res)
-                
-                self.init_prompt = self.init_prompt + tsw
-                self.init_prompt  = self.init_prompt [-100:]
-                self.audio_buffer.resize(0)
-                iter_in_phrase =0
-                
-                yield True, tsw
-            # show progress every 50 chunks
-            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
-                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-                # use custom ts_words
-                tsw = self.ts_words(res)
-                yield False, tsw
-            
-        
-
-
-
-
-
-SAMPLING_RATE = 16000
-
-model = "large-v2"
-src_lan = "en"  # source language
-tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
-use_vad = False
-min_sample_length = 1 * SAMPLING_RATE
-
-
-
-vac = VoiceActivityController(use_vad_result = use_vad)
-asr = FasterWhisperASR(src_lan, model)  # loads and wraps Whisper model
-
-tokenizer = create_tokenizer(tgt_lan)
-online = SimpleASRProcessor(asr)
-
-
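-# pipeline: microphone chunks -> VAD-segmented speech -> incremental transcription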
-stream = MicrophoneStream()
-stream = vac.detect_user_speech(stream, audio_in_int16 = False) 
-stream = online.stream_process(stream)
-
-for isFinal, text in stream:
-    if isFinal:
-        print( text,  end="\r\n")
-    else:
-        print( text,  end="\r")
 
mic_test_whisper_streaming.py (deleted)
--- mic_test_whisper_streaming.py
@@ -1,71 +0,0 @@
-from microphone_stream import MicrophoneStream
-from voice_activity_controller import VoiceActivityController
-from whisper_online import *
-import numpy as np
-import librosa  
-import io
-import soundfile
-import sys
-
-
-SAMPLING_RATE = 16000
-model = "large-v2"
-src_lan = "en"  # source language
-tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
-use_vad_result = True
-min_sample_length = 1 * SAMPLING_RATE
-
-
-
-asr = FasterWhisperASR(src_lan, model)  # loads and wraps Whisper model
-tokenizer = create_tokenizer(tgt_lan)  # sentence segmenter for the target language
-online = OnlineASRProcessor(asr, tokenizer)  # create processing object
-
-microphone_stream = MicrophoneStream() 
-vad = VoiceActivityController(use_vad_result = use_vad_result)
-
-complete_text = ''
-final_processing_pending = False
-out = []
-out_len = 0
-for iter in vad.detect_user_speech(microphone_stream):   # processing loop:
-    raw_bytes=  iter[0]
-    is_final =  iter[1]
-
-    if  raw_bytes:
-        sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
-        audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-        out.append(audio)
-        out_len += len(audio)
-
-    
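-    # hand buffered audio to the online processor once the utterance ends or enough samples accumulated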
-    if (is_final or out_len >= min_sample_length) and out_len>0:
-        a = np.concatenate(out)
-        online.insert_audio_chunk(a)    
-        
-    if out_len > min_sample_length:
-        o = online.process_iter()
-        print('-----'*10)
-        complete_text = complete_text + o[2]
-        print('PARTIAL - '+ complete_text) # do something with current partial output
-        print('-----'*10)     
-        out = []
-        out_len = 0   
-
-    if is_final:
-        o = online.finish()
-        # final_processing_pending = False         
-        print('-----'*10)
-        complete_text = complete_text + o[2]
-        print('FINAL - '+ complete_text) # do something with the final output
-        print('-----'*10)   
-        online.init()   
-        out = []
-        out_len = 0    
-        
-
-
-
-
-
-
 
microphone_stream.py (deleted)
--- microphone_stream.py
@@ -1,82 +0,0 @@
-
-
-### mic stream
-
-import queue
-import re
-import sys
-import pyaudio
-
-
-class MicrophoneStream:
-    def __init__(
-        self,
-        sample_rate: int = 16000,
-    ):
-        """
-        Creates a stream of audio from the microphone.
-
-        Args:
-            sample_rate: The sample rate to record audio at.
-        """
-        try:
-            import pyaudio
-        except ImportError:
-            raise Exception('pyaudio is not installed')
-
-        self._pyaudio = pyaudio.PyAudio()
-        self.sample_rate = sample_rate
-
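-        # read 40 ms of audio per chunk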
-        self._chunk_size = int(self.sample_rate * 40  / 1000)
-        self._stream = self._pyaudio.open(
-            format=pyaudio.paInt16,
-            channels=1,
-            rate=sample_rate,
-            input=True,
-            frames_per_buffer=self._chunk_size,
-        )
-
-        self._open = True
-
-    def __iter__(self):
-        """
-        Returns the iterator object.
-        """
-
-        return self
-
-    def __next__(self):
-        """
-        Reads a chunk of audio from the microphone.
-        """
-        if not self._open:
-            raise StopIteration
-
-        try:
-            return self._stream.read(self._chunk_size)
-        except KeyboardInterrupt:
-            raise StopIteration
-
-    def close(self):
-        """
-        Closes the stream.
-        """
-
-        self._open = False
-
-        if self._stream.is_active():
-            self._stream.stop_stream()
-
-        self._stream.close()
-        self._pyaudio.terminate()
-
-
-
-
-
-
-
-
-