Dominik Macháček 2023-06-02
server
@e68952aec2897b5c09bec5d690844f9656c2b816
README.md
--- README.md
+++ README.md
@@ -110,6 +110,19 @@
 online.init()  # refresh if you're going to re-use the object for the next audio
 ```
 
+## Usage: Server
+
+The `whisper_online_server.py` entry point has the same model options as the entry point above, plus `--host` and `--port` of the server, and no audio path argument.
+
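+For example, a possible server invocation (using the options defined in `whisper_online_server.py` below; the port here is chosen to match the client example) is:
+
+```
+python3 whisper_online_server.py --host localhost --port 43001 --model large-v2 --lan en --min-chunk-size 1.0
+```
+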
+Client example:
+
+```
+arecord -f S16_LE -c1 -r 16000 -t raw -D default | nc localhost 43001
+```
+
+- arecord is an example program that captures audio from a sound device and sends it in raw audio format -- 16000 Hz sampling rate, mono channel, S16\_LE -- signed 16-bit integers, little endian
+
+- nc is netcat; the server host and port are e.g. localhost 43001 (a Python alternative to this pipeline is sketched below)
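+
+A minimal Python client sketch (an illustration, not part of this package), assuming raw 16 kHz mono S16\_LE audio piped to stdin and that the server replies with plain UTF-8 text lines:
+
+```
+import select
+import socket
+import sys
+
+HOST, PORT = "localhost", 43001   # must match the server's --host and --port
+CHUNK = 32000                     # about 1 second of 16 kHz mono S16_LE audio (2 bytes per sample)
+
+with socket.create_connection((HOST, PORT)) as sock:
+    audio_in = sys.stdin.buffer   # e.g.: arecord -f S16_LE -c1 -r 16000 -t raw -D default | python3 client.py
+    while True:
+        data = audio_in.read(CHUNK)
+        if not data:
+            break
+        sock.sendall(data)
+        # print any transcript lines the server has produced so far
+        ready, _, _ = select.select([sock], [], [], 0)
+        if ready:
+            reply = sock.recv(65536)
+            if not reply:
+                break
+            sys.stdout.write(reply.decode("utf-8", errors="replace"))
+            sys.stdout.flush()
+```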
 
 
 ## Background
 
whisper_online_server.py (added)
+++ whisper_online_server.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+from whisper_online import *
+
+import sys
+import argparse
+import os
+import time
+import numpy as np
+import librosa
+parser = argparse.ArgumentParser()
+
+# server options
+parser.add_argument("--host", type=str, default='localhost')
+parser.add_argument("--port", type=int, default=43007)
+
+
+# options from whisper_online
+# TODO: code repetition
+
+parser.add_argument('--min-chunk-size', type=float, default=1.0, help='Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.')
+parser.add_argument('--model', type=str, default='large-v2', choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large".split(","),help="Name/size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in the model cache dir.")
+parser.add_argument('--model_cache_dir', type=str, default=None, help="Override the default model cache dir where models downloaded from the hub are saved.")
+parser.add_argument('--model_dir', type=str, default=None, help="Dir where Whisper model.bin and other files are saved. This option overrides the --model and --model_cache_dir parameters.")
+parser.add_argument('--lan', '--language', type=str, default='en', help="Language code for transcription, e.g. en,de,cs.")
+parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.")
+parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
+parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped"],help='Load only this backend for Whisper processing.')
+parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
+parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
+args = parser.parse_args()
+
+
+# set up the ASR object according to the arguments
+
+SAMPLING_RATE = 16000
+
+size = args.model
+language = args.lan
+
+t = time.time()
+print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",flush=True)
+
+if args.backend == "faster-whisper":
+    from faster_whisper import WhisperModel
+    asr_cls = FasterWhisperASR
+else:
+    import whisper
+    import whisper_timestamped
+#    from whisper_timestamped_model import WhisperTimestampedASR
+    asr_cls = WhisperTimestampedASR
+
+asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
+
+if args.task == "translate":
+    asr.set_translate_task()
+
+e = time.time()
+print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
+
+if args.vad:
+    print("setting VAD filter",file=sys.stderr)
+    asr.use_vad()
+
+
+min_chunk = args.min_chunk_size
+online = OnlineASRProcessor(language,asr)
+
+
+
+demo_audio_path = "cs-maji-2.16k.wav"
+if os.path.exists(demo_audio_path):
+    # load the audio into the LRU cache before we start the timer
+    a = load_audio_chunk(demo_audio_path,0,1)
+
+    # TODO: it should be tested whether it's meaningful
+    # warm up the ASR, because the very first transcribe takes much more time than the other
+    asr.transcribe(a)
+else:
+    print("Whisper is not warmed up",file=sys.stderr)
+
+
+
+
+######### Server objects
+
+import line_packet
+import socket
+
+import logging
+
+
+class Connection:
+    '''it wraps conn object'''
+    PACKET_SIZE = 65536
+
+    def __init__(self, conn):
+        self.conn = conn
+        self.last_line = ""
+
+        self.conn.setblocking(True)
+
+    def send(self, line):
+        '''it doesn't send the same line twice, because it was problematic in online-text-flow-events'''
+        if line == self.last_line:
+            return
+        line_packet.send_one_line(self.conn, line)
+        self.last_line = line
+
+    def receive_lines(self):
+        in_line = line_packet.receive_lines(self.conn)
+        return in_line
+
+    def non_blocking_receive_audio(self):
+        r = self.conn.recv(self.PACKET_SIZE)
+        return r
+
+
+import io
+import soundfile
+
+# wraps socket and ASR object, and serves one client connection. 
+# next client should be served by a new instance of this object
+class ServerProcessor:
+
+    def __init__(self, c, online_asr_proc, min_chunk):
+        self.connection = c
+        self.online_asr_proc = online_asr_proc
+        self.min_chunk = min_chunk
+
+        self.last_end = None
+
+    def receive_audio_chunk(self):
+        # receive all audio that is available by this time
+        # blocks operation if less than self.min_chunk seconds is available
+        # unblocks if connection is closed or a chunk is available
+        out = []
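+        # note: `out` collects decoded float32 sample arrays, so len() counts samples
+        # (SAMPLING_RATE samples per second of audio)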
+        while sum(len(x) for x in out) < self.min_chunk*SAMPLING_RATE:
+            raw_bytes = self.connection.non_blocking_receive_audio()
+            # debug: uncomment to inspect the incoming raw packets
+            # print(raw_bytes[:10], len(raw_bytes), file=sys.stderr)
+            if not raw_bytes:
+                break
+            sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
+            audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
+            out.append(audio)
+        if not out:
+            return None
+        return np.concatenate(out)
+
+    def format_output_transcript(self,o):
+        # output format in stdout is like:
+        # 0 1720 Takhle to je
+        # - the first two words are:
+        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
+        # - the next words: segment transcript
+
+        # This function differs from whisper_online.output_transcript in the following:
+        # successive [beg,end] intervals do not overlap, because the ELITR protocol (implemented in online-text-flow events) requires it.
+        # Therefore, beg is the max of the previous end and the current beg output by Whisper.
+        # Usually the difference is negligible, approx. 20 ms.
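+        # For example (hypothetical numbers): if the previous segment ended at 1720 ms and
+        # Whisper estimates the next one as [1700, 2500] ms, it is output as [1720, 2500] ms.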
+
+        if o[0] is not None:
+            beg, end = o[0]*1000,o[1]*1000
+            if self.last_end is not None:
+                beg = max(beg, self.last_end)
+
+            self.last_end = end
+            print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr)
+            return "%1.0f %1.0f %s" % (beg,end,o[2])
+        else:
+            print(o,file=sys.stderr,flush=True)
+            return None
+
+    def send_result(self, o):
+        msg = self.format_output_transcript(o)
+        if msg is not None:
+            self.connection.send(msg)
+
+    def process(self):
+        # handle one client connection
+        self.online_asr_proc.init()
+        while True:
+            a = self.receive_audio_chunk()
+            if a is None:
+                print("break here",file=sys.stderr)
+                break
+            self.online_asr_proc.insert_audio_chunk(a)
+            o = self.online_asr_proc.process_iter()
+            self.send_result(o)
+#        o = online.finish()  # this should be working
+#        self.send_result(o)
+
+
+
+
+# Start logging.
+level = logging.INFO
+logging.basicConfig(level=level, format='whisper-server-%(levelname)s: %(message)s')
+
+# server loop
+
+with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+    s.bind((args.host, args.port))
+    s.listen(1)
+    logging.info('Listening on '+str((args.host, args.port)))
+    while True:
+        conn, addr = s.accept()
+        logging.info('Connected to client on {}'.format(addr))
+        connection = Connection(conn)
+        proc = ServerProcessor(connection, online, min_chunk)
+        proc.process()
+        conn.close()
+        logging.info('Connection to client closed')
+logging.info('Connection closed, terminating.')