

VAC
- performance tests pending - TODO: timestamps after refresh are decreasing
@891cb12efe45b97e5e9daa9cafaaca0e2bb81802
--- voice_activity_controller.py
+++ voice_activity_controller.py
@@ -1,18 +1,5 @@
 import torch
 import numpy as np
-# import sounddevice as sd
-import torch
-import numpy as np
-import datetime
-
-
-def int2float(sound):
-    abs_max = np.abs(sound).max()
-    sound = sound.astype('float32')
-    if abs_max > 0:
-        sound *= 1/32768
-    sound = sound.squeeze()  # depends on the use case
-    return sound
 
 class VoiceActivityController:
     def __init__(
@@ -22,10 +9,10 @@
         min_speech_to_final_ms = 100,
         min_silence_duration_ms = 100,
         use_vad_result = True,
-        activity_detected_callback=None,
+#        activity_detected_callback=None,
         threshold =0.3
     ):
-        self.activity_detected_callback=activity_detected_callback
+#        self.activity_detected_callback=activity_detected_callback
         self.model, self.utils = torch.hub.load(
             repo_or_dir='snakers4/silero-vad',
             model='silero_vad'
@@ -42,7 +29,6 @@
         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
 
         self.use_vad_result = use_vad_result
-        self.last_marked_chunk = None
         self.threshold = threshold
         self.reset_states()
 
@@ -55,7 +41,13 @@
         self.speech_len = 0
 
     def apply_vad(self, audio):
-#        x = int2float(audio)
+        """
+        returns: triple
+            (voice_audio,
+            speech_in_wav,
+            silence_in_wav)
+
+        """
         x = audio
         if not torch.is_tensor(x):
             try:
@@ -64,16 +56,16 @@
                 raise TypeError("Audio cannot be casted to tensor. Cast it manually")
 
         speech_prob = self.model(x, self.sampling_rate).item()
+        print("speech_prob",speech_prob)
 
         window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
         self.current_sample += window_size_samples
 
-
-        if (speech_prob >= self.threshold):
+        if speech_prob >= self.threshold:  # speech is detected
             self.temp_end = 0
             return audio, window_size_samples, 0
 
-        else :
+        else:  # silence detected, counting w
             if not self.temp_end:
                 self.temp_end = self.current_sample
 
@@ -84,14 +76,12 @@
 
 
     def detect_speech_iter(self, data, audio_in_int16 = False):
-#        audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
         audio_block = data
         wav = audio_block
 
-        print(wav, len(wav), type(wav), wav.dtype)
-
         is_final = False
         voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
+        print("speech, last silence",speech_in_wav, last_silent_in_wav)
 
 
        if speech_in_wav > 0 :
@@ -101,16 +91,16 @@
             #     self.activity_detected_callback()
 
         self.last_silence_len += last_silent_in_wav
+        print("self.last_silence_len",self.last_silence_len, self.final_silence_limit,self.last_silence_len>= self.final_silence_limit)
+        print("self.speech_len, final_speech_limit",self.speech_len , self.final_speech_limit,self.speech_len >= self.final_speech_limit)
         if self.last_silence_len>= self.final_silence_limit and self.speech_len >= self.final_speech_limit:
+            for i in range(10): print("TADY!!!")
 
             is_final = True
             self.last_silence_len= 0
             self.speech_len = 0
 
-#        return voice_audio.tobytes(), is_final
         return voice_audio, is_final
-
-
 
     def detect_user_speech(self, audio_stream, audio_in_int16 = False):
         self.last_silence_len= 0
@@ -118,10 +108,3 @@
 
         for data in audio_stream:  # replace with your condition of choice
             yield self.detect_speech_iter(data, audio_in_int16)
-
-
-
-
-
-
-
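For orientation, a minimal sketch of driving VoiceActivityController on its own, outside the whisper_online pipeline. The constructor call, the audio_in_int16 keyword, and the (voice_audio, is_final) return value of detect_speech_iter come from the diff above; the 16 kHz rate, the 0.04 s chunk size (matching the --vac-chunk-size default), and the silent placeholder input are assumptions for illustration. (The repeated "TADY!!!" debug print is Czech for "HERE!!!"; it and the other prints look like temporary instrumentation.)

import numpy as np
from voice_activity_controller import VoiceActivityController

SAMPLING_RATE = 16000                      # assumed; Silero VAD works at 8/16 kHz
CHUNK = int(0.04 * SAMPLING_RATE)          # 0.04 s chunks, matching the --vac-chunk-size default

vac = VoiceActivityController(use_vad_result=False)

# audio: a 1-D float32 array in [-1, 1]; silence used here as a placeholder
audio = np.zeros(SAMPLING_RATE * 5, dtype=np.float32)

for start in range(0, len(audio) - CHUNK + 1, CHUNK):
    chunk = audio[start:start + CHUNK]
    voice_audio, is_final = vac.detect_speech_iter(chunk, audio_in_int16=False)
    if is_final:
        # enough accumulated speech followed by enough silence: an utterance ended
        print("utterance boundary at", (start + CHUNK) / SAMPLING_RATE, "s")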
--- whisper_online_server.py
+++ whisper_online_server.py
@@ -9,7 +9,8 @@
 # server options
 parser.add_argument("--host", type=str, default='localhost')
 parser.add_argument("--port", type=int, default=43007)
-
+parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
+parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
 
 # options from whisper_online
 add_shared_args(parser)
@@ -57,8 +58,11 @@
     tokenizer = create_tokenizer(tgt_language)
 else:
     tokenizer = None
-online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
+if not args.vac:
+    online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+else:
+    from whisper_online_vac import *
+    online = VACOnlineASRProcessor(min_chunk, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 
 demo_audio_path = "cs-maji-2.16k.wav"
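With these options in place, the VAC code path would be selected by a server invocation along the following lines. Only --host, --port, --vac and --vac-chunk-size (with their defaults) come from the diff; the remaining whisper_online options are left as a placeholder:

python3 whisper_online_server.py --host localhost --port 43007 --vac --vac-chunk-size 0.04 [shared whisper_online options, e.g. model and language]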
--- whisper_online_vac.py
+++ whisper_online_vac.py
@@ -7,52 +7,46 @@
 
 class VACOnlineASRProcessor(OnlineASRProcessor):
 
-    def __init__(self, *a, **kw):
-        self.online = OnlineASRProcessor(*a, **kw)
-        self.vac = VoiceActivityController(use_vad_result = True)
+    def __init__(self, online_chunk_size, *a, **kw):
+        self.online_chunk_size = online_chunk_size
 
-        self.is_currently_final = False
+        self.online = OnlineASRProcessor(*a, **kw)
+        self.vac = VoiceActivityController(use_vad_result = False)
+
         self.logfile = self.online.logfile
 
-        #self.vac_buffer = io.BytesIO()
-        #self.vac_stream = self.vac.detect_user_speech(self.vac_buffer, audio_in_int16=False)
-
-        self.audio_log = open("audio_log.wav","wb")
+        self.init()
 
     def init(self):
         self.online.init()
         self.vac.reset_states()
+        self.current_online_chunk_buffer_size = 0
+        self.is_currently_final = False
+
 
     def insert_audio_chunk(self, audio):
-        print(audio, len(audio), type(audio), audio.dtype)
         r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
-        raw_bytes, is_final = r
-        print("is_final",is_final)
-        print("raw_bytes", raw_bytes[:10], len(raw_bytes), type(raw_bytes))
-#        self.audio_log.write(raw_bytes)
-        #sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
-        #audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-        audio = raw_bytes
-        print("po překonvertování", audio, len(audio), type(audio), audio.dtype)
+        audio, is_final = r
+        print(is_final)
         self.is_currently_final = is_final
         self.online.insert_audio_chunk(audio)
-#        self.audio_log.write(audio)
-        self.audio_log.flush()
-
-        print("inserted",file=self.logfile)
+        self.current_online_chunk_buffer_size += len(audio)
 
     def process_iter(self):
         if self.is_currently_final:
             return self.finish()
-        else:
-            print(self.online.audio_buffer)
+        elif self.current_online_chunk_buffer_size > SAMPLING_RATE*self.online_chunk_size:
+            self.current_online_chunk_buffer_size = 0
             ret = self.online.process_iter()
-            print("tady",file=self.logfile)
             return ret
+        else:
+            print("no online update, only VAD", file=self.logfile)
+            return (None, None, "")
 
     def finish(self):
         ret = self.online.finish()
         self.online.init()
+        self.current_online_chunk_buffer_size = 0
         return ret
 
 
@@ -67,7 +61,7 @@
     parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
     parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
     parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
-
+    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
     args = parser.parse_args()
 
     # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
@@ -111,12 +105,12 @@
     asr.use_vad()
 
 
-    min_chunk = args.min_chunk_size
+    min_chunk = args.vac_chunk_size
     if args.buffer_trimming == "sentence":
         tokenizer = create_tokenizer(tgt_language)
     else:
         tokenizer = None
-    online = VACOnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 
     # load the audio into the LRU cache before we start the timer
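(In the removed debug prints above, "po překonvertování" is Czech for "after conversion" and "tady" for "here".) Below is a minimal sketch of driving the new wrapper the way the simulation loop would: feed small VAC-sized chunks, let process_iter decide whether to run an online update, and flush at the end. The constructor signature, insert_audio_chunk, process_iter, and finish come from the diff; the 16 kHz rate, the 1.0 s online chunk size, the buffer_trimming values, the silent placeholder input, and the pre-built asr and tokenizer objects are illustrative assumptions.

import numpy as np

# Assumes `asr` and `tokenizer` were built as in the __main__ block of whisper_online_vac.py.
SAMPLING_RATE = 16000
VAC_CHUNK = int(0.04 * SAMPLING_RATE)        # one VAC iteration (= --vac-chunk-size default)

online = VACOnlineASRProcessor(1.0, asr, tokenizer,
                               buffer_trimming=("segment", 15))

audio = np.zeros(SAMPLING_RATE * 5, dtype=np.float32)    # placeholder input
for start in range(0, len(audio) - VAC_CHUNK + 1, VAC_CHUNK):
    online.insert_audio_chunk(audio[start:start + VAC_CHUNK])
    beg, end, text = online.process_iter()   # (None, None, "") when only VAD ran
    if text:
        print(f"{beg:.2f}-{end:.2f} {text}")
beg, end, text = online.finish()             # flush whatever is left in the buffer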