

Merge branch 'main' into ayo-logging-fixes
commit 3e0223249375bf1a38bb80eff4370510881c1b93
--- README.md
+++ README.md
@@ -183,7 +183,7 @@
 
 ### Server -- real-time from mic
 
-`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` of the TCP connection. See help message (`-h` option).
+`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` of the TCP connection, and `--warmup-file`. See the help message (`-h` option).
 
 Client example:
 
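For context on the client side, here is a minimal, hypothetical Python client that streams raw audio to the server and prints whatever text comes back. The host/port defaults come from the server code below; the audio format (raw 16 kHz mono S16_LE) and the chunk/timeout values are this sketch's own assumptions, not part of the repo.

# Hypothetical client: stream a raw-PCM file to the server, print replies.
# Usage: python client.py audio.raw
import socket
import sys

HOST, PORT = "localhost", 43007  # server defaults from the diff below
CHUNK = 32000                    # ~1 second of 16 kHz 16-bit mono audio (assumed)

with socket.create_connection((HOST, PORT)) as sock, open(sys.argv[1], "rb") as audio:
    sock.settimeout(0.1)
    while chunk := audio.read(CHUNK):
        sock.sendall(chunk)
        try:
            reply = sock.recv(65536)  # transcript text, if any is ready
            if reply:
                print(reply.decode("utf-8", errors="replace"), flush=True)
        except socket.timeout:
            pass  # no output yet; keep streaming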
--- line_packet.py
+++ line_packet.py
@@ -2,8 +2,6 @@
 
 """Functions for sending and receiving individual lines of text over a socket.
 
-Used by marian-server-server.py to communicate with the Marian worker.
-
 A line is transmitted using one or more fixed-size packets of UTF-8 bytes
 containing:
 
@@ -11,6 +9,7 @@
 
 - Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
 
+Originally from the UEDIN team of the ELITR project.
 """
 
 PACKET_SIZE = 65536
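The docstring above fully specifies the wire format, so a short sketch of what it implies follows. The function names are illustrative, not the module's real API, and the receive side assumes the simple single-packet case.

# Sketch of the protocol described in the docstring: a text line is sent
# as fixed-size packets of UTF-8 bytes, NUL-padded up to PACKET_SIZE.
# Function names are illustrative, not line_packet.py's real API.
PACKET_SIZE = 65536

def send_line(sock, line: str) -> None:
    data = line.rstrip("\n").encode("utf-8") + b"\n"
    for i in range(0, len(data), PACKET_SIZE):
        packet = data[i:i + PACKET_SIZE]
        sock.sendall(packet.ljust(PACKET_SIZE, b"\0"))  # pad with \0 bytes

def receive_line(sock) -> str:
    # Simplification: assumes the whole line arrived in a single packet;
    # a longer line would span several packets and need a loop here.
    buf = b""
    while len(buf) < PACKET_SIZE:
        chunk = sock.recv(PACKET_SIZE - len(buf))
        if not chunk:
            break  # peer closed the connection
        buf += chunk
    return buf.rstrip(b"\0").decode("utf-8").rstrip("\n")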
--- whisper_online.py
+++ whisper_online.py
@@ -559,7 +559,7 @@
 
 def asr_factory(args, logfile=sys.stderr):
     """
-    Creates and configures an ASR instance based on the specified backend and arguments.
+    Creates and configures an ASR and ASR Online instance based on the specified backend and arguments.
     """
     backend = args.backend
     if backend == "openai-api":
@@ -584,8 +584,23 @@
         logging.info("Setting VAD filter")
         asr.use_vad()
 
-    return asr
+    language = args.lan
+    if args.task == "translate":
+        asr.set_translate_task()
+        tgt_language = "en" # Whisper translates into English
+    else:
+        tgt_language = language # Whisper transcribes in this language
 
+    # Create the tokenizer
+    if args.buffer_trimming == "sentence":
+        tokenizer = create_tokenizer(tgt_language)
+    else:
+        tokenizer = None
+
+    # Create the OnlineASRProcessor
+    online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+
+    return asr, online
 
 ## main:
 
 if __name__ == "__main__":
@@ -613,27 +628,13 @@
     duration = len(load_audio(audio_path))/SAMPLING_RATE
     logging.info("Audio duration is: %2.2f seconds" % duration)
 
-    asr = asr_factory(args, logfile=logfile)
-    language = args.lan
-
-    if args.task == "translate":
-        asr.set_translate_task()
-        tgt_language = "en" # Whisper translates into English
-    else:
-        tgt_language = language # Whisper transcribes in this language
-
+    asr, online = asr_factory(args, logfile=logfile)
     min_chunk = args.min_chunk_size
-    if args.buffer_trimming == "sentence":
-        tokenizer = create_tokenizer(tgt_language)
-    else:
-        tokenizer = None
-    online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
 
     # load the audio into the LRU cache before we start the timer
     a = load_audio_chunk(audio_path,0,1)
 
-    # warm up the ASR, because the very first transcribe takes much more time than the other
+    # warm up the ASR because the very first transcribe takes much more time than the others
     asr.transcribe(a)
 
     beg = args.start_at
--- whisper_online_server.py
+++ whisper_online_server.py
@@ -12,6 +12,8 @@
 # server options
 parser.add_argument("--host", type=str, default='localhost')
 parser.add_argument("--port", type=int, default=43007)
+parser.add_argument("--warmup-file", type=str, dest="warmup_file",
+        help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
 
 parser.add_argument("-l", "--log-level", dest="log_level",
                     choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
@@ -33,16 +35,9 @@
 
 size = args.model
 language = args.lan
-
-asr = asr_factory(args)
-
-if args.task == "translate":
-    asr.set_translate_task()
-    tgt_language = "en"
-else:
-    tgt_language = language
-
+asr, online = asr_factory(args)
 min_chunk = args.min_chunk_size
+
 
 if args.buffer_trimming == "sentence":
     tokenizer = create_tokenizer(tgt_language)
@@ -50,20 +45,18 @@
     tokenizer = None
 online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
-
-
-demo_audio_path = "cs-maji-2.16k.wav"
-if os.path.exists(demo_audio_path):
-    # load the audio into the LRU cache before we start the timer
-    logging.debug(f"Warming up on {demo_audio_path}")
-    a = load_audio_chunk(demo_audio_path,0,1)
-
-    # TODO: it should be tested whether it's meaningful
-    # warm up the ASR, because the very first transcribe takes much more time than the other
-    asr.transcribe(a)
-    logging.debug("Whisper is warmed up")
+# warm up the ASR because the very first transcribe takes more time than the others.
+# Test results in https://github.com/ufal/whisper_streaming/pull/81
+msg = "Whisper is not warmed up. The first chunk processing may take longer."
+if args.warmup_file:
+    if os.path.isfile(args.warmup_file):
+        a = load_audio_chunk(args.warmup_file,0,1)
+        asr.transcribe(a)
+        print("INFO: Whisper is warmed up.",file=sys.stderr)
+    else:
+        print("WARNING: The warm up file is not available. "+msg,file=sys.stderr)
 else:
-    logging.debug("Whisper is not warmed up")
+    print("WARNING: " + msg, file=sys.stderr)
 
 
 ######### Server objects
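To exercise the new flag end to end, one might fetch the sample referenced in the help text and start the server with it. Only --warmup-file and the URL come from this diff; the other flags are assumptions about the shared CLI options.

# Convenience sketch: download the suggested warm-up sample, then start
# the server with it. Flags other than --warmup-file are assumptions.
import pathlib
import subprocess
import urllib.request

wav = pathlib.Path("jfk.wav")
if not wav.exists():
    urllib.request.urlretrieve(
        "https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav",
        str(wav),
    )

subprocess.run([
    "python3", "whisper_online_server.py",
    "--lan", "en", "--model", "tiny",
    "--warmup-file", str(wav),
], check=True)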