

VAC controller integrated
It works. Reproducing #39
@dfc862b3bc2341b18b3ce7c78cb2d18e59744e29
--- voice_activity_controller.py
+++ voice_activity_controller.py
@@ -51,8 +51,12 @@
         self.temp_end = 0
         self.current_sample = 0
 
+        self.last_silence_len = 0
+        self.speech_len = 0
+
     def apply_vad(self, audio):
-        x = int2float(audio)
+#        x = int2float(audio)
+        x = audio
         if not torch.is_tensor(x):
             try:
                 x = torch.Tensor(x)
@@ -79,38 +83,42 @@
             return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples
 
 
+    def detect_speech_iter(self, data, audio_in_int16 = False):
+#        audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
+        audio_block = data
+        wav = audio_block
+
+        print(wav, len(wav), type(wav), wav.dtype)
+
+        is_final = False
+        voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
+
+
+        if speech_in_wav > 0:
+            self.last_silence_len = 0
+            self.speech_len += speech_in_wav
+#            if self.activity_detected_callback is not None:
+#                self.activity_detected_callback()
+
+        self.last_silence_len += last_silent_in_wav
+        if self.last_silence_len >= self.final_silence_limit and self.speech_len >= self.final_speech_limit:
+
+            is_final = True
+            self.last_silence_len = 0
+            self.speech_len = 0
+
+#        return voice_audio.tobytes(), is_final
+        return voice_audio, is_final
 
 
 
     def detect_user_speech(self, audio_stream, audio_in_int16 = False):
-        last_silence_len = 0
-        speech_len = 0
+        self.last_silence_len = 0
+        self.speech_len = 0
 
         for data in audio_stream:  # replace with your condition of choice
-
-
-            audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
-            wav = audio_block
-
-            is_final = False
-            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
-
-
-            if speech_in_wav > 0:
-                last_silence_len = 0
-                speech_len += speech_in_wav
-                if self.activity_detected_callback is not None:
-                    self.activity_detected_callback()
-
-            last_silence_len += last_silent_in_wav
-            if last_silence_len >= self.final_silence_limit and speech_len >= self.final_speech_limit:
-
-                is_final = True
-                last_silence_len = 0
-                speech_len = 0
-
-            yield voice_audio.tobytes(), is_final
-
+            yield self.detect_speech_iter(data, audio_in_int16)
+
 
 
 
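Note on the new API: `detect_speech_iter` processes a single audio chunk per call and returns `(voice_audio, is_final)`; `detect_user_speech` is now just a thin wrapper that yields it over a stream. A minimal driving loop might look like the sketch below (the `audio_source()` generator and `handle_utterance()` consumer are hypothetical; `VoiceActivityController`, `reset_states`, `use_vad_result`, and `detect_speech_iter` come from this diff):

```python
from voice_activity_controller import VoiceActivityController

vac = VoiceActivityController(use_vad_result=True)
vac.reset_states()

for chunk in audio_source():  # hypothetical: yields float32 audio chunks at 16 kHz
    # Run VAD on this chunk; is_final becomes True once enough trailing silence
    # (final_silence_limit) has followed enough speech (final_speech_limit).
    voice_audio, is_final = vac.detect_speech_iter(chunk, audio_in_int16=False)
    if is_final:
        handle_utterance(voice_audio)  # hypothetical downstream consumer
```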
--- /dev/null
+++ whisper_online_vac.py
@@ -0,0 +1,209 @@
+from whisper_online import *
+from voice_activity_controller import *
+import soundfile
+import io
+
+SAMPLING_RATE = 16000
+
+class VACOnlineASRProcessor(OnlineASRProcessor):
+
+    def __init__(self, *a, **kw):
+        self.online = OnlineASRProcessor(*a, **kw)
+        self.vac = VoiceActivityController(use_vad_result = True)
+
+        self.is_currently_final = False
+        self.logfile = self.online.logfile
+
+        #self.vac_buffer = io.BytesIO()
+        #self.vac_stream = self.vac.detect_user_speech(self.vac_buffer, audio_in_int16=False)
+
+        self.audio_log = open("audio_log.wav","wb")
+
+    def init(self):
+        self.online.init()
+        self.vac.reset_states()
+
+    def insert_audio_chunk(self, audio):
+        print(audio, len(audio), type(audio), audio.dtype)
+        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
+        raw_bytes, is_final = r
+        print("is_final",is_final)
+        print("raw_bytes", raw_bytes[:10], len(raw_bytes), type(raw_bytes))
+#        self.audio_log.write(raw_bytes)
+        #sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
+        #audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
+        audio = raw_bytes
36 | + print("po překonvertování", audio, len(audio), type(audio), audio.dtype) | |
+        self.is_currently_final = is_final
+        self.online.insert_audio_chunk(audio)
+#        self.audio_log.write(audio)
+        self.audio_log.flush()
+
+        print("inserted",file=self.logfile)
+
+    def process_iter(self):
+        if self.is_currently_final:
+            return self.finish()
+        else:
+            print(self.online.audio_buffer)
+            ret = self.online.process_iter()
50 | + print("tady",file=self.logfile) | |
+            return ret
+
+    def finish(self):
+        ret = self.online.finish()
+        self.online.init()
+        return ret
+
+
+
+
+if __name__ == "__main__":
+
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('audio_path', type=str, help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
+    add_shared_args(parser)
+    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
+    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
+    parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
+
+    args = parser.parse_args()
+
+    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
+    logfile = sys.stderr
+
+    if args.offline and args.comp_unaware:
+        print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile)
+        sys.exit(1)
+
+    audio_path = args.audio_path
+
+    SAMPLING_RATE = 16000
+    duration = len(load_audio(audio_path))/SAMPLING_RATE
+    print("Audio duration is: %2.2f seconds" % duration, file=logfile)
+
+    size = args.model
+    language = args.lan
+
+    t = time.time()
+    print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True)
+
+    if args.backend == "faster-whisper":
+        asr_cls = FasterWhisperASR
+    else:
+        asr_cls = WhisperTimestampedASR
+
+    asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
+
+    if args.task == "translate":
+        asr.set_translate_task()
+        tgt_language = "en"  # Whisper translates into English
+    else:
+        tgt_language = language  # Whisper transcribes in this language
+
+
+    e = time.time()
+    print(f"done. It took {round(e-t,2)} seconds.",file=logfile)
+
+    if args.vad:
+        print("setting VAD filter",file=logfile)
+        asr.use_vad()
+
+
+    min_chunk = args.min_chunk_size
+    if args.buffer_trimming == "sentence":
+        tokenizer = create_tokenizer(tgt_language)
+    else:
+        tokenizer = None
+    online = VACOnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+
+
+    # load the audio into the LRU cache before we start the timer
+    a = load_audio_chunk(audio_path,0,1)
+
+    # warm up the ASR, because the very first transcribe takes much more time than the other
+    asr.transcribe(a)
+
+    beg = args.start_at
+    start = time.time()-beg
+
+    def output_transcript(o, now=None):
+        # output format in stdout is like:
+        # 4186.3606 0 1720 Takhle to je
+        # - the first three words are:
+        #    - emission time from beginning of processing, in milliseconds
+        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
+        # - the next words: segment transcript
+        if now is None:
+            now = time.time()-start
+        if o[0] is not None:
+            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
+            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
+        else:
+            print(o,file=logfile,flush=True)
+
+    if args.offline: ## offline mode processing (for testing/debugging)
+        a = load_audio(audio_path)
+        online.insert_audio_chunk(a)
+        try:
+            o = online.process_iter()
+        except AssertionError:
+            print("assertion error",file=logfile)
+            pass
+        else:
+            output_transcript(o)
+        now = None
+    elif args.comp_unaware:  # computational unaware mode
+        end = beg + min_chunk
+        while True:
+            a = load_audio_chunk(audio_path,beg,end)
+            online.insert_audio_chunk(a)
+            try:
+                o = online.process_iter()
+            except AssertionError:
+                print("assertion error",file=logfile)
+                pass
+            else:
+                output_transcript(o, now=end)
+
+            print(f"## last processed {end:.2f}s",file=logfile,flush=True)
+
+            if end >= duration:
+                break
+
+            beg = end
+
+            if end + min_chunk > duration:
+                end = duration
+            else:
+                end += min_chunk
+        now = duration
+
+    else: # online = simultaneous mode
+        end = 0
+        while True:
+            now = time.time() - start
+            if now < end+min_chunk:
+                time.sleep(min_chunk+end-now)
+            end = time.time() - start
+            a = load_audio_chunk(audio_path,beg,end)
+            beg = end
+            online.insert_audio_chunk(a)
+
+            try:
+                o = online.process_iter()
+            except AssertionError:
+                print("assertion error",file=logfile)
+                pass
+            else:
+                output_transcript(o)
+            now = time.time() - start
+            print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True)
+
+            if end >= duration:
+                break
+        now = None
+
+    o = online.finish()
+    output_transcript(o, now=now)
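Usage note: `VACOnlineASRProcessor` keeps the `OnlineASRProcessor` interface (`init`, `insert_audio_chunk`, `process_iter`, `finish`), so an existing streaming loop can be pointed at it unchanged. A minimal sketch, assuming `asr` and `tokenizer` were built as in the `__main__` block above and that `next_chunk()` is a hypothetical source of 16 kHz float32 chunks (the buffer-trimming values are illustrative):

```python
import sys

online = VACOnlineASRProcessor(asr, tokenizer, logfile=sys.stderr,
                               buffer_trimming=("segment", 15))  # illustrative settings

while True:
    chunk = next_chunk()  # hypothetical: returns None when the stream ends
    if chunk is None:
        break
    online.insert_audio_chunk(chunk)
    o = online.process_iter()  # calls finish() internally once VAC reports is_final
    if o[0] is not None:
        print(o)

print(online.finish())
```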