Ondrej Platek 2024-08-16
polishing code and note about installing deps for VAD
@42dfe54c8c49681d5cc21f6d289a268a1d6d29ec
README.md
--- README.md
+++ README.md
@@ -33,6 +33,8 @@
 
 1) ``pip install librosa`` -- audio processing library
 
+Note: the VAD (voice activity detection) option additionally requires `pip install torch torchaudio`.
+
 2) Whisper backend.
 
 Two alternative backends are integrated. The most recommended one is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`.
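
The torch/torchaudio requirement added above is presumably for loading the VAD model. As a rough sketch only, assuming the VAD in question is Silero VAD fetched via `torch.hub` (the file name and 16 kHz mono input are placeholders, not taken from this commit):

```python
# Sketch: loading Silero VAD via torch.hub -- assumes this is the VAD the README note refers to.
import torch

# Downloads and caches the Silero VAD model; this is why torch/torchaudio must be installed.
model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
get_speech_timestamps, _, read_audio, *_ = utils

wav = read_audio("audio.wav", sampling_rate=16000)  # placeholder file; 16 kHz mono expected
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)
print(speech_timestamps)  # e.g. [{'start': ..., 'end': ...}, ...] in samples
```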
whisper_online.py
--- whisper_online.py
+++ whisper_online.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 import sys
 import numpy as np
-import librosa  
+import librosa
 from functools import lru_cache
 import time
 import datetime
whisper_online_server.py
--- whisper_online_server.py
+++ whisper_online_server.py
@@ -30,11 +30,12 @@
 if args.backend == "faster-whisper":
     from faster_whisper import WhisperModel
     asr_cls = FasterWhisperASR
-else:
+elif args.backend == "whisper_timestamped":
     import whisper
-    import whisper_timestamped
-#    from whisper_timestamped_model import WhisperTimestampedASR
+    from whisper_online import WhisperTimestampedASR
     asr_cls = WhisperTimestampedASR
+else:
+    raise ValueError(f"Unknown {args.backend=}")
 
 asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
 
@@ -44,25 +45,23 @@
 else:
     tgt_language = language
 
-e = time.time()
-print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
+print(f"done. It took {round(time.time()-t,2)} seconds.",file=sys.stderr)
 
 if args.vad:
     print("setting VAD filter",file=sys.stderr)
     asr.use_vad()
 
 
-min_chunk = args.min_chunk_size
-
 if args.buffer_trimming == "sentence":
     tokenizer = create_tokenizer(tgt_language)
 else:
     tokenizer = None
 if not args.vac:
+    from whisper_online import OnlineASRProcessor
     online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 else:
-    from whisper_online_vac import *
-    online = VACOnlineASRProcessor(min_chunk, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    from whisper_online_vac import VACOnlineASRProcessor
+    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 
 demo_audio_path = "cs-maji-2.16k.wav"
@@ -219,7 +218,7 @@
         conn, addr = s.accept()
         logging.info('INFO: Connected to client on {}'.format(addr))
         connection = Connection(conn)
-        proc = ServerProcessor(connection, online, min_chunk)
+        proc = ServerProcessor(connection, online, args.min_chunk_size)
         proc.process()
         conn.close()
         logging.info('INFO: Connection to client closed')
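
For context, the `online` object selected above (either `OnlineASRProcessor` or `VACOnlineASRProcessor`) is driven chunk by chunk by the server loop. A rough usage sketch, assuming the `OnlineASRProcessor` and `load_audio_chunk` interfaces from `whisper_online.py`; the model size, file name, and chunk length are illustrative placeholders:

```python
# Sketch: feeding audio to the online processor the way the server loop does,
# using the constructor keywords that appear in this diff.
from whisper_online import FasterWhisperASR, OnlineASRProcessor, load_audio_chunk

asr = FasterWhisperASR(modelsize="large-v2", lan="en", cache_dir=None, model_dir=None)
online = OnlineASRProcessor(asr, None, buffer_trimming=("segment", 15))

min_chunk = 1.0   # seconds; plays the role of args.min_chunk_size
duration = 10.0   # pretend 10 s of audio are available
beg = 0.0
while beg < duration:
    end = min(beg + min_chunk, duration)
    audio = load_audio_chunk("audio.wav", beg, end)  # 16 kHz float32 slice
    online.insert_audio_chunk(audio)
    print(online.process_iter())  # newly committed (start_ts, end_ts, text)
    beg = end
print(online.finish())  # flush whatever remains in the buffer
```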
whisper_online_vac.py
--- whisper_online_vac.py
+++ whisper_online_vac.py
@@ -165,9 +165,9 @@
 
             if end >= duration:
                 break
-            
+
             beg = end
-            
+
             if end + min_chunk > duration:
                 end = duration
             else: