

unfork project, indicate files from whisper streaming
@2d5b45c42278443fb424e7eb382d6a9ae3017778
--- README.md
+++ README.md
... | ... | @@ -12,6 +12,11 @@ |
12 | 12 |
|
13 | 13 |
 |
14 | 14 |
|
15 |
+## Code Origins |
|
16 |
+ |
|
17 |
+This project reuses and extends code from the original Whisper Streaming repository: |
|
18 |
+- whisper_online.py: Contains code from whisper_streaming with the addition of the **MLX Whisper** backend for Apple Silicon, which is not present in the original repository. |
|
19 |
+- silero_vad_iterator.py: Originally from the Silero VAD repository, included in the whisper_streaming project. |
|
15 | 20 |
|
16 | 21 |
## Installation |
17 | 22 |
|
--- line_packet.py
... | ... | @@ -1,93 +0,0 @@ |
1 | -#!/usr/bin/env python3 | |
2 | - | |
3 | -"""Functions for sending and receiving individual lines of text over a socket. | |
4 | - | |
5 | -A line is transmitted using one or more fixed-size packets of UTF-8 bytes | |
6 | -containing: | |
7 | - | |
8 | - - Zero or more bytes of UTF-8, excluding \n and \0, followed by | |
9 | - | |
10 | - - Zero or more \0 bytes as required to pad the packet to PACKET_SIZE | |
11 | - | |
12 | -Originally from the UEDIN team of the ELITR project. | |
13 | -""" | |
14 | - | |
15 | -PACKET_SIZE = 65536 | |
16 | - | |
17 | - | |
18 | -def send_one_line(socket, text, pad_zeros=False): | |
19 | - """Sends a line of text over the given socket. | |
20 | - | |
21 | - The 'text' argument should contain a single line of text (line break | |
22 | - characters are optional). Line boundaries are determined by Python's | |
23 | - str.splitlines() function [1]. We also count '\0' as a line terminator. | |
24 | - If 'text' contains multiple lines then only the first will be sent. | |
25 | - | |
26 | - If the send fails then an exception will be raised. | |
27 | - | |
28 | - [1] https://docs.python.org/3.5/library/stdtypes.html#str.splitlines | |
29 | - | |
30 | - Args: | |
31 | - socket: a socket object. | |
32 | - text: string containing a line of text for transmission. | |
33 | - """ | |
34 | - text.replace('\0', '\n') | |
35 | - lines = text.splitlines() | |
36 | - first_line = '' if len(lines) == 0 else lines[0] | |
37 | - # TODO Is there a better way of handling bad input than 'replace'? | |
38 | - data = first_line.encode('utf-8', errors='replace') + b'\n' + (b'\0' if pad_zeros else b'') | |
39 | - for offset in range(0, len(data), PACKET_SIZE): | |
40 | - bytes_remaining = len(data) - offset | |
41 | - if bytes_remaining < PACKET_SIZE: | |
42 | - padding_length = PACKET_SIZE - bytes_remaining | |
43 | - packet = data[offset:] + (b'\0' * padding_length if pad_zeros else b'') | |
44 | - else: | |
45 | - packet = data[offset:offset+PACKET_SIZE] | |
46 | - socket.sendall(packet) | |
47 | - | |
48 | - | |
49 | -def receive_one_line(socket): | |
50 | - """Receives a line of text from the given socket. | |
51 | - | |
52 | - This function will (attempt to) receive a single line of text. If data is | |
53 | - currently unavailable then it will block until data becomes available or | |
54 | - the sender has closed the connection (in which case it will return an | |
55 | - empty string). | |
56 | - | |
57 | - The string should not contain any newline characters, but if it does then | |
58 | - only the first line will be returned. | |
59 | - | |
60 | - Args: | |
61 | - socket: a socket object. | |
62 | - | |
63 | - Returns: | |
64 | - A string representing a single line with a terminating newline or | |
65 | - None if the connection has been closed. | |
66 | - """ | |
67 | - data = b'' | |
68 | - while True: | |
69 | - packet = socket.recv(PACKET_SIZE) | |
70 | - if not packet: # Connection has been closed. | |
71 | - return None | |
72 | - data += packet | |
73 | - if b'\0' in packet: | |
74 | - break | |
75 | - # TODO Is there a better way of handling bad input than 'replace'? | |
76 | - text = data.decode('utf-8', errors='replace').strip('\0') | |
77 | - lines = text.split('\n') | |
78 | - return lines[0] + '\n' | |
79 | - | |
80 | - | |
81 | -def receive_lines(socket): | |
82 | - try: | |
83 | - data = socket.recv(PACKET_SIZE) | |
84 | - except BlockingIOError: | |
85 | - return [] | |
86 | - if data is None: # Connection has been closed. | |
87 | - return None | |
88 | - # TODO Is there a better way of handling bad input than 'replace'? | |
89 | - text = data.decode('utf-8', errors='replace').strip('\0') | |
90 | - lines = text.split('\n') | |
91 | - if len(lines)==1 and not lines[0]: | |
92 | - return None | |
93 | - return lines |
--- whisper_online_server.py
... | ... | @@ -1,184 +0,0 @@ |
1 | -#!/usr/bin/env python3 | |
2 | -from whisper_online import * | |
3 | - | |
4 | -import sys | |
5 | -import argparse | |
6 | -import os | |
7 | -import logging | |
8 | -import numpy as np | |
9 | - | |
10 | -logger = logging.getLogger(__name__) | |
11 | -parser = argparse.ArgumentParser() | |
12 | - | |
13 | -# server options | |
14 | -parser.add_argument("--host", type=str, default='localhost') | |
15 | -parser.add_argument("--port", type=int, default=43007) | |
16 | -parser.add_argument("--warmup-file", type=str, dest="warmup_file", | |
17 | - help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .") | |
18 | - | |
19 | -# options from whisper_online | |
20 | -add_shared_args(parser) | |
21 | -args = parser.parse_args() | |
22 | - | |
23 | -set_logging(args,logger,other="") | |
24 | - | |
25 | -# setting whisper object by args | |
26 | - | |
27 | -SAMPLING_RATE = 16000 | |
28 | - | |
29 | -size = args.model | |
30 | -language = args.lan | |
31 | -asr, online = asr_factory(args) | |
32 | -min_chunk = args.min_chunk_size | |
33 | - | |
34 | -# warm up the ASR because the very first transcribe takes more time than the others. | |
35 | -# Test results in https://github.com/ufal/whisper_streaming/pull/81 | |
36 | -msg = "Whisper is not warmed up. The first chunk processing may take longer." | |
37 | -if args.warmup_file: | |
38 | - if os.path.isfile(args.warmup_file): | |
39 | - a = load_audio_chunk(args.warmup_file,0,1) | |
40 | - asr.transcribe(a) | |
41 | - logger.info("Whisper is warmed up.") | |
42 | - else: | |
43 | - logger.critical("The warm up file is not available. "+msg) | |
44 | - sys.exit(1) | |
45 | -else: | |
46 | - logger.warning(msg) | |
47 | - | |
48 | - | |
49 | -######### Server objects | |
50 | - | |
51 | -import line_packet | |
52 | -import socket | |
53 | - | |
54 | -class Connection: | |
55 | - '''it wraps conn object''' | |
56 | - PACKET_SIZE = 32000*5*60 # 5 minutes # was: 65536 | |
57 | - | |
58 | - def __init__(self, conn): | |
59 | - self.conn = conn | |
60 | - self.last_line = "" | |
61 | - | |
62 | - self.conn.setblocking(True) | |
63 | - | |
64 | - def send(self, line): | |
65 | - '''it doesn't send the same line twice, because it was problematic in online-text-flow-events''' | |
66 | - if line == self.last_line: | |
67 | - return | |
68 | - line_packet.send_one_line(self.conn, line) | |
69 | - self.last_line = line | |
70 | - | |
71 | - def receive_lines(self): | |
72 | - in_line = line_packet.receive_lines(self.conn) | |
73 | - return in_line | |
74 | - | |
75 | - def non_blocking_receive_audio(self): | |
76 | - try: | |
77 | - r = self.conn.recv(self.PACKET_SIZE) | |
78 | - return r | |
79 | - except ConnectionResetError: | |
80 | - return None | |
81 | - | |
82 | - | |
83 | -import io | |
84 | -import soundfile | |
85 | - | |
86 | -# wraps socket and ASR object, and serves one client connection. | |
87 | -# next client should be served by a new instance of this object | |
88 | -class ServerProcessor: | |
89 | - | |
90 | - def __init__(self, c, online_asr_proc, min_chunk): | |
91 | - self.connection = c | |
92 | - self.online_asr_proc = online_asr_proc | |
93 | - self.min_chunk = min_chunk | |
94 | - | |
95 | - self.last_end = None | |
96 | - | |
97 | - self.is_first = True | |
98 | - | |
99 | - def receive_audio_chunk(self): | |
100 | - # receive all audio that is available by this time | |
101 | - # blocks operation if less than self.min_chunk seconds is available | |
102 | - # unblocks if connection is closed or a chunk is available | |
103 | - out = [] | |
104 | - minlimit = self.min_chunk*SAMPLING_RATE | |
105 | - while sum(len(x) for x in out) < minlimit: | |
106 | - raw_bytes = self.connection.non_blocking_receive_audio() | |
107 | - if not raw_bytes: | |
108 | - break | |
109 | -# print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10]) | |
110 | - sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW") | |
111 | - audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32) | |
112 | - out.append(audio) | |
113 | - if not out: | |
114 | - return None | |
115 | - conc = np.concatenate(out) | |
116 | - if self.is_first and len(conc) < minlimit: | |
117 | - return None | |
118 | - self.is_first = False | |
119 | - return np.concatenate(out) | |
120 | - | |
121 | - def format_output_transcript(self,o): | |
122 | - # output format in stdout is like: | |
123 | - # 0 1720 Takhle to je | |
124 | - # - the first two words are: | |
125 | - # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway | |
126 | - # - the next words: segment transcript | |
127 | - | |
128 | - # This function differs from whisper_online.output_transcript in the following: | |
129 | - # succeeding [beg,end] intervals are not overlapping because ELITR protocol (implemented in online-text-flow events) requires it. | |
130 | - # Therefore, beg, is max of previous end and current beg outputed by Whisper. | |
131 | - # Usually it differs negligibly, by appx 20 ms. | |
132 | - | |
133 | - if o[0] is not None: | |
134 | - beg, end = o[0]*1000,o[1]*1000 | |
135 | - if self.last_end is not None: | |
136 | - beg = max(beg, self.last_end) | |
137 | - | |
138 | - self.last_end = end | |
139 | - print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr) | |
140 | - return "%1.0f %1.0f %s" % (beg,end,o[2]) | |
141 | - else: | |
142 | - logger.debug("No text in this segment") | |
143 | - return None | |
144 | - | |
145 | - def send_result(self, o): | |
146 | - msg = self.format_output_transcript(o) | |
147 | - if msg is not None: | |
148 | - self.connection.send(msg) | |
149 | - | |
150 | - def process(self): | |
151 | - # handle one client connection | |
152 | - self.online_asr_proc.init() | |
153 | - while True: | |
154 | - a = self.receive_audio_chunk() | |
155 | - if a is None: | |
156 | - break | |
157 | - self.online_asr_proc.insert_audio_chunk(a) | |
158 | - o = online.process_iter() | |
159 | - try: | |
160 | - self.send_result(o) | |
161 | - except BrokenPipeError: | |
162 | - logger.info("broken pipe -- connection closed?") | |
163 | - break | |
164 | - | |
165 | -# o = online.finish() # this should be working | |
166 | -# self.send_result(o) | |
167 | - | |
168 | - | |
169 | - | |
170 | -# server loop | |
171 | - | |
172 | -with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: | |
173 | - s.bind((args.host, args.port)) | |
174 | - s.listen(1) | |
175 | - logger.info('Listening on'+str((args.host, args.port))) | |
176 | - while True: | |
177 | - conn, addr = s.accept() | |
178 | - logger.info('Connected to client on {}'.format(addr)) | |
179 | - connection = Connection(conn) | |
180 | - proc = ServerProcessor(connection, online, args.min_chunk_size) | |
181 | - proc.process() | |
182 | - conn.close() | |
183 | - logger.info('Connection to client closed') | |
184 | -logger.info('Connection closed, terminating.') |
Add a comment
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?