

faster-whisper support
@3605f32ffc97618d1bfb34062571462b4acc607b
--- README.md
+++ README.md
@@ -3,19 +3,24 @@
 
 ## Installation
 
+This code works with two kinds of backends. Both require:
+
 ```
-pip install git+https://github.com/linto-ai/whisper-timestamped
-XDG_CACHE_HOME=$(pwd)/pip-cache pip install git+https://github.com/linto-ai/whisper-timestamped
 pip install librosa
 pip install opus-fast-mosestokenizer
-pip install torch
 ```
+
+The recommended backend is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for the NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install it with `pip install faster-whisper`.
+
+An alternative, less restrictive, but slower backend is [whisper-timestamped](https://github.com/linto-ai/whisper-timestamped): `pip install git+https://github.com/linto-ai/whisper-timestamped`
+
+The backend is loaded only when it is selected, so the unused one does not have to be installed.
 
 ## Usage
 
 ```
 (p3) $ python3 whisper_online.py -h
-usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model MODEL] [--model_dir MODEL_DIR] [--lan LAN] [--start_at START_AT] audio_path
+usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model MODEL] [--model_dir MODEL_DIR] [--lan LAN] [--start_at START_AT] [--backend {faster-whisper,whisper_timestamped}] audio_path
 
 positional arguments:
   audio_path
@@ -30,6 +35,8 @@
   --lan LAN, --language LAN
                         Language code for transcription, e.g. en,de,cs.
   --start_at START_AT   Start processing audio at this time.
+  --backend {faster-whisper,whisper_timestamped}
+                        Load only this backend for Whisper processing.
 ```
 
 Example:
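
For a quick sanity check that the faster-whisper backend is installed and sees the GPU, a minimal standalone call can look like the sketch below. The model size, device, and audio file name are placeholders, and `word_timestamps=True` is the option that whisper_online.py relies on.

```
# Minimal faster-whisper check; "small" and "audio.wav" are placeholders.
from faster_whisper import WhisperModel

model = WhisperModel("small", device="cuda", compute_type="float16")
segments, info = model.transcribe("audio.wav", language="en", word_timestamps=True)
for segment in segments:
    for word in segment.words:
        # word.word keeps a leading space; strip it as whisper_online.py does
        print(round(word.start, 2), round(word.end, 2), word.word.strip())
```
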
--- whisper_online.py
+++ whisper_online.py
@@ -1,15 +1,10 @@
 #!/usr/bin/env python3
 import sys
 import numpy as np
-import whisper
-import whisper_timestamped
-import librosa
+import librosa
 from functools import lru_cache
-import torch
 import time
 from mosestokenizer import MosesTokenizer
-import json
-
 
 @lru_cache
 def load_audio(fname):
@@ -22,10 +17,38 @@
     end_s = int(end*16000)
     return audio[beg_s:end_s]
 
-class WhisperASR:
-    def __init__(self, modelsize="small", lan="en", cache_dir="disk-cache-dir"):
+
+# Whisper backend
+
+class ASRBase:
+
+    def __init__(self, modelsize, lan, cache_dir):
         self.original_language = lan
-        self.model = whisper.load_model(modelsize, download_root=cache_dir)
+
+        self.model = self.load_model(modelsize, cache_dir)
+
+    def load_model(self, modelsize, cache_dir):
+        raise NotImplementedError("must be implemented in the child class")
+
+    def transcribe(self, audio, init_prompt=""):
+        raise NotImplementedError("must be implemented in the child class")
+
+
+## requires imports:
+# import whisper
+# import whisper_timestamped
+
+class WhisperTimestampedASR(ASRBase):
+    """Uses the whisper_timestamped library as the backend. We tested the code on this backend first. It works, but it is slower than faster-whisper.
+    On the other hand, the installation for GPU could be easier.
+
+    If used, requires imports:
+        import whisper
+        import whisper_timestamped
+    """
+
+    def load_model(self, modelsize, cache_dir):
+        return whisper.load_model(modelsize, download_root=cache_dir)
 
     def transcribe(self, audio, init_prompt=""):
         result = whisper_timestamped.transcribe_timestamped(self.model, audio, language=self.original_language, initial_prompt=init_prompt, verbose=None, condition_on_previous_text=True)
@@ -39,6 +62,52 @@
             t = (w["start"],w["end"],w["text"])
             o.append(t)
         return o
+
+    def segments_end_ts(self, res):
+        return [s["end"] for s in res["segments"]]
+
+
+class FasterWhisperASR(ASRBase):
+    """Uses the faster-whisper library as the backend. Works approximately 4 times faster (in offline mode). For GPU, it requires installation with a specific CUDNN version.
+
+    If used, requires imports:
+        import faster_whisper
+    """
+
+    def load_model(self, modelsize, cache_dir):
+        # cache_dir is not used -- it did not seem to work. The default ~/.cache/huggingface/hub is used instead.
+
+        # this worked fast and reliably on NVIDIA L40
+        model = WhisperModel(modelsize, device="cuda", compute_type="float16")
+
+        # or run on GPU with INT8
+        # tested: the transcripts were different, probably worse than with FP16, and it was slightly (appx 20%) slower
+        #model = WhisperModel(modelsize, device="cuda", compute_type="int8_float16")
+
+        # or run on CPU with INT8
+        # tested: works, but slow, appx 10 times slower than CUDA FP16
+        #model = WhisperModel(modelsize, device="cpu", compute_type="int8") #, download_root="faster-disk-cache-dir/")
+        return model
+
+    def transcribe(self, audio, init_prompt=""):
+        segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True)
+        return list(segments)
+
+    def ts_words(self, segments):
+        o = []
+        for segment in segments:
+            for word in segment.words:
+                # strip the spaces around each word
+                w = word.word.strip()
+                t = (word.start, word.end, w)
+                o.append(t)
+        return o
+
+    def segments_end_ts(self, res):
+        return [s.end for s in res]
+
+
 
 def to_flush(sents, offset=0):
     # concatenates the timestamped words or sentences into one sequence that is flushed in one line
@@ -253,7 +322,7 @@
     def chunk_completed_segment(self, res):
         if self.commited == []: return
 
-        ends = [s["end"] for s in res["segments"]]
+        ends = self.asr.segments_end_ts(res)
 
         t = self.commited[-1][1]
 
@@ -320,6 +389,7 @@
 
 
 
+
 ## main:
 
 import argparse
@@ -330,6 +400,7 @@
 parser.add_argument('--model_dir', type=str, default='disk-cache-dir', help="the path where Whisper models are saved (or downloaded to). Default: ./disk-cache-dir")
 parser.add_argument('--lan', '--language', type=str, default='en', help="Language code for transcription, e.g. en,de,cs.")
 parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
+parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped"], help='Load only this backend for Whisper processing.')
 args = parser.parse_args()
 
 audio_path = args.audio_path
@@ -343,7 +414,18 @@
 
 t = time.time()
 print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",flush=True)
-asr = WhisperASR(lan=language, modelsize=size)
+#asr = WhisperASR(lan=language, modelsize=size)
+
+if args.backend == "faster-whisper":
+    from faster_whisper import WhisperModel
+    asr_cls = FasterWhisperASR
+else:
+    import whisper
+    import whisper_timestamped
+# from whisper_timestamped_model import WhisperTimestampedASR
+    asr_cls = WhisperTimestampedASR
+
+asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_dir)
 e = time.time()
 print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
 
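
The point of the new ASRBase hierarchy is that the streaming logic never touches a backend's raw result type (a dict for whisper_timestamped, a list of Segment objects for faster-whisper); it only goes through `transcribe`, `ts_words`, and `segments_end_ts`. A minimal sketch of the backend-agnostic usage, mirroring the main block above (the "tiny" model size and the silent test audio are placeholders):

```
# Sketch only: assumes FasterWhisperASR and WhisperTimestampedASR from
# whisper_online.py are in scope and the chosen backend is installed.
import numpy as np

backend = "faster-whisper"  # or "whisper_timestamped"
if backend == "faster-whisper":
    from faster_whisper import WhisperModel  # used inside FasterWhisperASR.load_model
    asr_cls = FasterWhisperASR
else:
    import whisper
    import whisper_timestamped
    asr_cls = WhisperTimestampedASR

asr = asr_cls(modelsize="tiny", lan="en", cache_dir="disk-cache-dir")
audio = np.zeros(16000, dtype=np.float32)  # one second of silence as test input

res = asr.transcribe(audio)        # backend-specific result object
print(asr.ts_words(res))           # uniform list of (start, end, word) triples
print(asr.segments_end_ts(res))    # uniform list of segment end timestamps
```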