Dominik Macháček 2023-04-19
faster-whisper support
@3605f32ffc97618d1bfb34062571462b4acc607b
README.md
--- README.md
+++ README.md
@@ -3,19 +3,24 @@
 
 ## Installation
 
+This code works with two kinds of backends. Both require
+
 ```
-pip install git+https://github.com/linto-ai/whisper-timestamped
-XDG_CACHE_HOME=$(pwd)/pip-cache pip install git+https://github.com/linto-ai/whisper-timestamped
 pip install librosa
 pip install opus-fast-mosestokenizer
-pip install torch
 ```
+
+The recommended backend is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for the NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`.
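+
+A quick way to verify that the GPU installation works is to load a model directly (a minimal sketch; the model size and audio file name are placeholders):
+
+```
+from faster_whisper import WhisperModel
+
+# loads the model on GPU with FP16, the same call whisper_online.py uses
+model = WhisperModel("small", device="cuda", compute_type="float16")
+segments, info = model.transcribe("audio.wav", word_timestamps=True)
+for s in segments:
+    print(s.start, s.end, s.text)
+```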
+
+An alternative, less restrictive but slower backend is [whisper-timestamped](https://github.com/linto-ai/whisper-timestamped): `pip install git+https://github.com/linto-ai/whisper-timestamped`
+
+Only the selected backend is loaded; the unused one does not have to be installed.
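+
+For example, to transcribe an audio file with the faster-whisper backend (the file name is a placeholder):
+
+```
+python3 whisper_online.py --backend faster-whisper --lan en audio.wav
+```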
 
 ## Usage
 
 ```
 (p3) $ python3 whisper_online.py -h
-usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model MODEL] [--model_dir MODEL_DIR] [--lan LAN] [--start_at START_AT] audio_path
+usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model MODEL] [--model_dir MODEL_DIR] [--lan LAN] [--start_at START_AT] [--backend {faster-whisper,whisper_timestamped}] audio_path
 
 positional arguments:
   audio_path
@@ -30,6 +35,8 @@
   --lan LAN, --language LAN
                         Language code for transcription, e.g. en,de,cs.
   --start_at START_AT   Start processing audio at this time.
+  --backend {faster-whisper,whisper_timestamped}
+                        Load only this backend for Whisper processing.
 ```
 
 Example:
whisper_online.py
--- whisper_online.py
+++ whisper_online.py
@@ -1,15 +1,10 @@
 #!/usr/bin/env python3
 import sys
 import numpy as np
-import whisper
-import whisper_timestamped
-import librosa
+import librosa
 from functools import lru_cache
-import torch
 import time
 from mosestokenizer import MosesTokenizer
-import json
-
 
 @lru_cache
 def load_audio(fname):
@@ -22,10 +17,38 @@
     end_s = int(end*16000)
     return audio[beg_s:end_s]
 
-class WhisperASR:
-    def __init__(self, modelsize="small", lan="en", cache_dir="disk-cache-dir"):
+
+# Whisper backend
+
+class ASRBase:
+
+    def __init__(self, modelsize, lan, cache_dir):
         self.original_language = lan 
-        self.model = whisper.load_model(modelsize, download_root=cache_dir)
+
+        self.model = self.load_model(modelsize, cache_dir)
+
+    def load_model(self, modelsize, cache_dir):
+        raise NotImplementedError("must be implemented in the child class")
+
+    def transcribe(self, audio, init_prompt=""):
+        raise NotImplementedError("must be implemented in the child class")
+
+
+## requires imports:
+#      import whisper
+#      import whisper_timestamped
+
+class WhisperTimestampedASR(ASRBase):
+    """Uses whisper_timestamped library as the backend. Initially, we tested the code on this backend. It worked, but slower than faster-whisper.
+    On the other hand, the installation for GPU could be easier.
+
+    If used, requires imports:
+        import whisper
+        import whisper_timestamped
+    """
+
+    def load_model(self, modelsize, cache_dir):
+        return whisper.load_model(modelsize, download_root=cache_dir)
 
     def transcribe(self, audio, init_prompt=""):
         result = whisper_timestamped.transcribe_timestamped(self.model, audio, language=self.original_language, initial_prompt=init_prompt, verbose=None, condition_on_previous_text=True)
@@ -39,6 +62,52 @@
                 t = (w["start"],w["end"],w["text"])
                 o.append(t)
         return o
+
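+    # the whisper_timestamped result is a dict of the form
+    # {"segments": [{"end": ..., "words": [{"start": ..., "end": ..., "text": ...}, ...]}, ...]};
+    # segments_end_ts extracts the end timestamp of each segment from it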
+    def segments_end_ts(self, res):
+        return [s["end"] for s in res["segments"]]
+
+
+class FasterWhisperASR(ASRBase):
+    """Uses faster-whisper library as the backend. Works much faster, appx 4-times (in offline mode). For GPU, it requires installation with a specific CUDNN version.
+
+    Requires imports, if used:
+        import faster_whisper
+    """
+
+    def load_model(self, modelsize, cache_dir):
+        # cache_dir is not passed on; it did not seem to work. The default ~/.cache/huggingface/hub is used instead.
+
+        # this worked fast and reliably on NVIDIA L40
+        model = WhisperModel(modelsize, device="cuda", compute_type="float16")
+
+        # or run on GPU with INT8
+        # tested: the transcripts were different, probably worse than with FP16, and it was slightly (approximately 20%) slower
+        #model = WhisperModel(modelsize, device="cuda", compute_type="int8_float16")
+
+        # or run on CPU with INT8
+        # tested: works, but approximately 10 times slower than CUDA FP16
+        #model = WhisperModel(modelsize, device="cpu", compute_type="int8") #, download_root="faster-disk-cache-dir/")
+        return model
+
+    def transcribe(self, audio, init_prompt=""):
+        segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True)
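+        # faster-whisper returns a lazy generator; list() materializes it, so the whole transcription runs here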
+        return list(segments)
+
+    def ts_words(self, segments):
+        o = []
+        for segment in segments:
+            for word in segment.words:
+                # stripping the spaces
+                w = word.word.strip()
+                t = (word.start, word.end, w)
+                o.append(t)
+        return o
+
+    def segments_end_ts(self, res):
+        return [s.end for s in res]
+
+
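+# A minimal sketch of how either backend is driven through the common
+# interface (the model size, language, and audio array are illustrative):
+#
+#   asr = FasterWhisperASR(modelsize="small", lan="en", cache_dir=None)
+#   res = asr.transcribe(audio, init_prompt="")
+#   words = asr.ts_words(res)           # [(start, end, "word"), ...]
+#   ends = asr.segments_end_ts(res)     # end timestamp of each segment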
 
 def to_flush(sents, offset=0):
     # concatenates the timestamped words or sentences into one sequence that is flushed in one line
@@ -253,7 +322,7 @@
     def chunk_completed_segment(self, res):
         if self.commited == []: return
 
-        ends = [s["end"] for s in res["segments"]]
+        ends = self.asr.segments_end_ts(res)
 
         t = self.commited[-1][1]
 
@@ -320,6 +389,7 @@
 
 
 
+
 ## main:
 
 import argparse
@@ -330,6 +400,7 @@
 parser.add_argument('--model_dir', type=str, default='disk-cache-dir', help="the path where Whisper models are saved (or downloaded to). Default: ./disk-cache-dir")
 parser.add_argument('--lan', '--language', type=str, default='en', help="Language code for transcription, e.g. en,de,cs.")
 parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
+parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped"], help='Load only this backend for Whisper processing.')
 args = parser.parse_args()
 
 audio_path = args.audio_path
@@ -343,7 +414,18 @@
 
 t = time.time()
 print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",flush=True)
-asr = WhisperASR(lan=language, modelsize=size)
+
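+# import the chosen backend only now, so that the unused one does not have to be installed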
+if args.backend == "faster-whisper":
+    from faster_whisper import WhisperModel
+    asr_cls = FasterWhisperASR
+else:
+    import whisper
+    import whisper_timestamped
+    asr_cls = WhisperTimestampedASR
+
+asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_dir)
 e = time.time()
 print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
 