

removing duplicated code -- whisper_online_vac
commit 0dd7ff9e880cc15763df36430448365b177cb3d3
--- voice_activity_controller.py
+++ voice_activity_controller.py
@@ -48,6 +48,7 @@
             silence_in_wav)
 
         """
+        print("applying vad here")
         x = audio
         if not torch.is_tensor(x):
             try:
--- whisper_online.py
+++ whisper_online.py
@@ -517,6 +517,59 @@
         e = offset + sents[-1][1]
         return (b,e,t)
 
+class VACOnlineASRProcessor(OnlineASRProcessor):
+    '''Wraps OnlineASRProcessor with VAC (Voice Activity Controller).
+
+    It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds long),
+    runs VAD on them, and continuously detects whether there is speech or not.
+    When it detects the end of speech (non-voice for 500 ms), it makes OnlineASRProcessor end the utterance immediately.
+    '''
+
+    def __init__(self, online_chunk_size, *a, **kw):
+        self.online_chunk_size = online_chunk_size
+
+        self.online = OnlineASRProcessor(*a, **kw)
+        from voice_activity_controller import VoiceActivityController
+        self.vac = VoiceActivityController(use_vad_result = False)
+
+        self.logfile = self.online.logfile
+
+        self.init()
+
+    def init(self):
+        self.online.init()
+        self.vac.reset_states()
+        self.current_online_chunk_buffer_size = 0
+        self.is_currently_final = False
+
+
+    def insert_audio_chunk(self, audio):
+        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
+        audio, is_final = r
+        print(is_final)
+        self.is_currently_final = is_final
+        self.online.insert_audio_chunk(audio)
+        self.current_online_chunk_buffer_size += len(audio)
+
+    def process_iter(self):
+        if self.is_currently_final:
+            return self.finish()
+        elif self.current_online_chunk_buffer_size > self.SAMPLING_RATE*self.online_chunk_size:
+            self.current_online_chunk_buffer_size = 0
+            ret = self.online.process_iter()
+            return ret
+        else:
+            print("no online update, only VAD", file=self.logfile)
+            return (None, None, "")
+
+    def finish(self):
+        ret = self.online.finish()
+        self.online.init(keep_offset=True)
+        self.current_online_chunk_buffer_size = 0
+        return ret
+
+
+
 WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(",")
 
 def create_tokenizer(lan):
@@ -561,6 +614,8 @@
     parser.add_argument('--lan', '--language', type=str, default='auto', help="Source language code, e.g. en,de,cs, or 'auto' for language detection.")
     parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.")
     parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "openai-api"],help='Load only this backend for Whisper processing.')
+    parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
+    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
     parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
     parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.')
     parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.')
@@ -607,7 +662,11 @@
         tokenizer = None
 
     # Create the OnlineASRProcessor
-    online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    if args.vac:
+
+        online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    else:
+        online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
     return asr, online
 
@@ -652,7 +711,10 @@
     logger.info("Audio duration is: %2.2f seconds" % duration)
 
     asr, online = asr_factory(args, logfile=logfile)
-    min_chunk = args.min_chunk_size
+    if args.vac:
+        min_chunk = args.vac_chunk_size
+    else:
+        min_chunk = args.min_chunk_size
 
     # load the audio into the LRU cache before we start the timer
     a = load_audio_chunk(audio_path,0,1)
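
(For orientation, and not part of the commit: a minimal driver sketch for the relocated class, mirroring the simulation loop of the removed whisper_online_vac.py. The file name sample.wav and the model/language values are placeholders, and it assumes faster-whisper plus the torch/Silero dependencies of voice_activity_controller.py are installed.)

# Sketch: feed 0.04 s chunks (the --vac-chunk-size default) through VACOnlineASRProcessor.
from whisper_online import FasterWhisperASR, VACOnlineASRProcessor, load_audio

SAMPLING_RATE = 16000

asr = FasterWhisperASR(modelsize="large-v2", lan="en")  # placeholder model/language
online = VACOnlineASRProcessor(1.0, asr, None)  # 1.0 s chunk size for the wrapped processor, no tokenizer

audio = load_audio("sample.wav")  # 16 kHz mono float32, placeholder file
step = int(0.04 * SAMPLING_RATE)
for i in range(0, len(audio), step):
    online.insert_audio_chunk(audio[i:i + step])
    try:
        o = online.process_iter()  # (beg, end, text) or (None, None, "")
    except AssertionError:
        pass
    else:
        if o[0] is not None:
            print("%1.0f %1.0f %s" % (o[0] * 1000, o[1] * 1000, o[2]))

o = online.finish()  # flush whatever remains in the buffer
if o[0] is not None:
    print("%1.0f %1.0f %s" % (o[0] * 1000, o[1] * 1000, o[2]))
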
--- whisper_online_server.py
+++ whisper_online_server.py
@@ -13,8 +13,6 @@
 # server options
 parser.add_argument("--host", type=str, default='localhost')
 parser.add_argument("--port", type=int, default=43007)
-parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
-parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
 parser.add_argument("--warmup-file", type=str, dest="warmup_file",
         help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
 
@@ -108,7 +106,7 @@
             raw_bytes = self.connection.non_blocking_receive_audio()
             if not raw_bytes:
                 break
-            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
+#            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
             sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
             audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
             out.append(audio)
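
(The receive loop above still decodes raw little-endian 16 kHz mono PCM_16 from the socket, so any client that pushes such bytes will do. A hypothetical client sketch, not part of this diff; "sample.wav" is a placeholder and host/port are the argparse defaults above.)

# Sketch: stream a 16 kHz mono wav to the server as raw PCM_16 bytes.
import socket
import soundfile

data, sr = soundfile.read("sample.wav", dtype="int16")  # placeholder input file
assert sr == 16000, "the server decodes RAW PCM_16 at 16 kHz mono"

with socket.create_connection(("localhost", 43007)) as s:
    step = sr  # one second of samples per send
    for i in range(0, len(data), step):
        s.sendall(data[i:i + step].astype("<i2").tobytes())  # force little-endian int16

A full client would also read back the timestamped transcript lines that the server writes to the same connection.
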
--- whisper_online_vac.py
+++ /dev/null
@@ -1,203 +0,0 @@
-from whisper_online import *
-from voice_activity_controller import *
-import soundfile
-import io
-
-SAMPLING_RATE = 16000
-
-class VACOnlineASRProcessor(OnlineASRProcessor):
-
-    def __init__(self, online_chunk_size, *a, **kw):
-        self.online_chunk_size = online_chunk_size
-
-        self.online = OnlineASRProcessor(*a, **kw)
-        self.vac = VoiceActivityController(use_vad_result = False)
-
-        self.logfile = self.online.logfile
-
-        self.init()
-
-    def init(self):
-        self.online.init()
-        self.vac.reset_states()
-        self.current_online_chunk_buffer_size = 0
-        self.is_currently_final = False
-
-
-    def insert_audio_chunk(self, audio):
-        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
-        audio, is_final = r
-        print(is_final)
-        self.is_currently_final = is_final
-        self.online.insert_audio_chunk(audio)
-        self.current_online_chunk_buffer_size += len(audio)
-
-    def process_iter(self):
-        if self.is_currently_final:
-            return self.finish()
-        elif self.current_online_chunk_buffer_size > SAMPLING_RATE*self.online_chunk_size:
-            self.current_online_chunk_buffer_size = 0
-            ret = self.online.process_iter()
-            return ret
-        else:
-            print("no online update, only VAD", file=self.logfile)
-            return (None, None, "")
-
-    def finish(self):
-        ret = self.online.finish()
-        self.online.init(keep_offset=True)
-        self.current_online_chunk_buffer_size = 0
-        return ret
-
-
-
-
-if __name__ == "__main__":
-
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('audio_path', type=str, help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
-    add_shared_args(parser)
-    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
-    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
-    parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
-    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
-    args = parser.parse_args()
-
-    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
-    logfile = sys.stderr
-
-    if args.offline and args.comp_unaware:
-        print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile)
-        sys.exit(1)
-
-    audio_path = args.audio_path
-
-    SAMPLING_RATE = 16000
-    duration = len(load_audio(audio_path))/SAMPLING_RATE
-    print("Audio duration is: %2.2f seconds" % duration, file=logfile)
-
-    size = args.model
-    language = args.lan
-
-    t = time.time()
-    print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True)
-
-    if args.backend == "faster-whisper":
-        asr_cls = FasterWhisperASR
-    else:
-        asr_cls = WhisperTimestampedASR
-
-    asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
-
-    if args.task == "translate":
-        asr.set_translate_task()
-        tgt_language = "en"  # Whisper translates into English
-    else:
-        tgt_language = language  # Whisper transcribes in this language
-
-
-    e = time.time()
-    print(f"done. It took {round(e-t,2)} seconds.",file=logfile)
-
-    if args.vad:
-        print("setting VAD filter",file=logfile)
-        asr.use_vad()
-
-
-    min_chunk = args.vac_chunk_size
-    if args.buffer_trimming == "sentence":
-        tokenizer = create_tokenizer(tgt_language)
-    else:
-        tokenizer = None
-    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
-
-    # load the audio into the LRU cache before we start the timer
-    a = load_audio_chunk(audio_path,0,1)
-
-    # warm up the ASR, because the very first transcribe takes much more time than the other
-    asr.transcribe(a)
-
-    beg = args.start_at
-    start = time.time()-beg
-
-    def output_transcript(o, now=None):
-        # output format in stdout is like:
-        # 4186.3606 0 1720 Takhle to je
-        # - the first three words are:
-        #    - emission time from beginning of processing, in milliseconds
-        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
-        # - the next words: segment transcript
-        if now is None:
-            now = time.time()-start
-        if o[0] is not None:
-            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
-            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
-        else:
-            print(o,file=logfile,flush=True)
-
-    if args.offline: ## offline mode processing (for testing/debugging)
-        a = load_audio(audio_path)
-        online.insert_audio_chunk(a)
-        try:
-            o = online.process_iter()
-        except AssertionError:
-            print("assertion error",file=logfile)
-            pass
-        else:
-            output_transcript(o)
-        now = None
-    elif args.comp_unaware: # computational unaware mode
-        end = beg + min_chunk
-        while True:
-            a = load_audio_chunk(audio_path,beg,end)
-            online.insert_audio_chunk(a)
-            try:
-                o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
-                pass
-            else:
-                output_transcript(o, now=end)
-
-            print(f"## last processed {end:.2f}s",file=logfile,flush=True)
-
-            if end >= duration:
-                break
-
-            beg = end
-
-            if end + min_chunk > duration:
-                end = duration
-            else:
-                end += min_chunk
-        now = duration
-
-    else: # online = simultaneous mode
-        end = 0
-        while True:
-            now = time.time() - start
-            if now < end+min_chunk:
-                time.sleep(min_chunk+end-now)
-            end = time.time() - start
-            a = load_audio_chunk(audio_path,beg,end)
-            beg = end
-            online.insert_audio_chunk(a)
-
-            try:
-                o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
-                pass
-            else:
-                output_transcript(o)
-            now = time.time() - start
-            print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True)
-
-            if end >= duration:
-                break
-        now = None
-
-    o = online.finish()
-    output_transcript(o, now=now)
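
With whisper_online_vac.py gone, the same VAC simulation now runs through whisper_online.py using the flags added above, e.g. (audio file, language, and backend values are illustrative):

python3 whisper_online.py sample.wav --backend faster-whisper --language en --vac --vac-chunk-size 0.04

--vac makes asr_factory build a VACOnlineASRProcessor, and --vac-chunk-size replaces min_chunk in the simulation loop.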