

Polish code and add a note about installing dependencies for VAD
@42dfe54c8c49681d5cc21f6d289a268a1d6d29ec
--- README.md
+++ README.md
... | ... | @@ -33,6 +33,8 @@ |
33 | 33 |
|
34 | 34 |
1) ``pip install librosa`` -- audio processing library |
35 | 35 |
|
36 |
+Note: VAD requires additional dependencies — install them with `pip install torch torchaudio`. |
|
37 |
+ |
|
36 | 38 |
2) Whisper backend. |
37 | 39 |
|
38 | 40 |
Two alternative backends are integrated. The most recommended one is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`. |
--- whisper_online.py
+++ whisper_online.py
... | ... | @@ -1,7 +1,7 @@ |
1 | 1 |
#!/usr/bin/env python3 |
2 | 2 |
import sys |
3 | 3 |
import numpy as np |
4 |
-import librosa |
|
4 |
+import librosa |
|
5 | 5 |
from functools import lru_cache |
6 | 6 |
import time |
7 | 7 |
import datetime |
--- whisper_online_server.py
+++ whisper_online_server.py
... | ... | @@ -30,11 +30,12 @@ |
30 | 30 |
if args.backend == "faster-whisper": |
31 | 31 |
from faster_whisper import WhisperModel |
32 | 32 |
asr_cls = FasterWhisperASR |
33 |
-else: |
|
33 |
+elif args.backend == "whisper_timestamped": |
|
34 | 34 |
import whisper |
35 |
- import whisper_timestamped |
|
36 |
-# from whisper_timestamped_model import WhisperTimestampedASR |
|
35 |
+ from whisper_online import WhisperTimestampedASR |
|
37 | 36 |
asr_cls = WhisperTimestampedASR |
37 |
+else: |
|
38 |
+ raise ValueError(f"Unknown {args.backend=}") |
|
38 | 39 |
|
39 | 40 |
asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir) |
40 | 41 |
|
... | ... | @@ -44,25 +45,23 @@ |
44 | 45 |
else: |
45 | 46 |
tgt_language = language |
46 | 47 |
|
47 |
-e = time.time() |
|
48 |
-print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr) |
|
48 |
+print(f"done. It took {round(time.time()-t,2)} seconds.",file=sys.stderr) |
|
49 | 49 |
|
50 | 50 |
if args.vad: |
51 | 51 |
print("setting VAD filter",file=sys.stderr) |
52 | 52 |
asr.use_vad() |
53 | 53 |
|
54 | 54 |
|
55 |
-min_chunk = args.min_chunk_size |
|
56 |
- |
|
57 | 55 |
if args.buffer_trimming == "sentence": |
58 | 56 |
tokenizer = create_tokenizer(tgt_language) |
59 | 57 |
else: |
60 | 58 |
tokenizer = None |
61 | 59 |
if not args.vac: |
60 |
+ from whisper_online import OnlineASRProcessor |
|
62 | 61 |
online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec)) |
63 | 62 |
else: |
64 |
- from whisper_online_vac import * |
|
65 |
- online = VACOnlineASRProcessor(min_chunk, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec)) |
|
63 |
+ from whisper_online_vac import VACOnlineASRProcessor |
|
64 |
+ online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec)) |
|
66 | 65 |
|
67 | 66 |
|
68 | 67 |
demo_audio_path = "cs-maji-2.16k.wav" |
... | ... | @@ -219,7 +218,7 @@ |
219 | 218 |
conn, addr = s.accept() |
220 | 219 |
logging.info('INFO: Connected to client on {}'.format(addr)) |
221 | 220 |
connection = Connection(conn) |
222 |
- proc = ServerProcessor(connection, online, min_chunk) |
|
221 |
+ proc = ServerProcessor(connection, online, args.min_chunk_size) |
|
223 | 222 |
proc.process() |
224 | 223 |
conn.close() |
225 | 224 |
logging.info('INFO: Connection to client closed') |
--- whisper_online_vac.py
+++ whisper_online_vac.py
... | ... | @@ -165,9 +165,9 @@ |
165 | 165 |
|
166 | 166 |
if end >= duration: |
167 | 167 |
break |
168 |
- |
|
168 |
+ |
|
169 | 169 |
beg = end |
170 |
- |
|
170 |
+ |
|
171 | 171 |
if end + min_chunk > duration: |
172 | 172 |
end = duration |
173 | 173 |
else: |
Add a comment
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?