Commit @2d5b45c42278443fb424e7eb382d6a9ae3017778 - yjyoon/whisper_streaming

Quentin Fuxa 2024-12-19

unfork project, indicate files from whisper streaming

@2d5b45c42278443fb424e7eb382d6a9ae3017778

3d91a2a

2d5b45c

README.md

--- README.md

+++ README.md


 
 ![Demo Screenshot](src/demo.png)
 
+## Code Origins
+
+This project reuses and extends code from the original Whisper Streaming repository:
+- whisper_online.py: Contains code from whisper_streaming with the addition of the **MLX Whisper** backend for Apple Silicon, which is not present in the original repository.
+- silero_vad_iterator.py: Originally from the Silero VAD repository, included in the whisper_streaming project.
 
 ## Installation
 

3d91a2a

line_packet.py (deleted)

--- line_packet.py

...	...	@@ -1,93 +0,0 @@
	1	-#!/usr/bin/env python3
	2	-
	3	-"""Functions for sending and receiving individual lines of text over a socket.
	4	-
	5	-A line is transmitted using one or more fixed-size packets of UTF-8 bytes
	6	-containing:
	7	-
	8	- - Zero or more bytes of UTF-8, excluding \n and \0, followed by
	9	-
	10	- - Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
	11	-
	12	-Originally from the UEDIN team of the ELITR project.
	13	-"""
	14	-
	15	-PACKET_SIZE = 65536
	16	-
	17	-
	18	-def send_one_line(socket, text, pad_zeros=False):
	19	- """Sends a line of text over the given socket.
	20	-
	21	- The 'text' argument should contain a single line of text (line break
	22	- characters are optional). Line boundaries are determined by Python's
	23	- str.splitlines() function [1]. We also count '\0' as a line terminator.
	24	- If 'text' contains multiple lines then only the first will be sent.
	25	-
	26	- If the send fails then an exception will be raised.
	27	-
	28	- [1] https://docs.python.org/3.5/library/stdtypes.html#str.splitlines
	29	-
	30	- Args:
	31	- socket: a socket object.
	32	- text: string containing a line of text for transmission.
	33	- """
	34	- text.replace('\0', '\n')
	35	- lines = text.splitlines()
	36	- first_line = '' if len(lines) == 0 else lines[0]
	37	- # TODO Is there a better way of handling bad input than 'replace'?
	38	- data = first_line.encode('utf-8', errors='replace') + b'\n' + (b'\0' if pad_zeros else b'')
	39	- for offset in range(0, len(data), PACKET_SIZE):
	40	- bytes_remaining = len(data) - offset
	41	- if bytes_remaining < PACKET_SIZE:
	42	- padding_length = PACKET_SIZE - bytes_remaining
	43	- packet = data[offset:] + (b'\0' * padding_length if pad_zeros else b'')
	44	- else:
	45	- packet = data[offset:offset+PACKET_SIZE]
	46	- socket.sendall(packet)
	47	-
	48	-
	49	-def receive_one_line(socket):
	50	- """Receives a line of text from the given socket.
	51	-
	52	- This function will (attempt to) receive a single line of text. If data is
	53	- currently unavailable then it will block until data becomes available or
	54	- the sender has closed the connection (in which case it will return an
	55	- empty string).
	56	-
	57	- The string should not contain any newline characters, but if it does then
	58	- only the first line will be returned.
	59	-
	60	- Args:
	61	- socket: a socket object.
	62	-
	63	- Returns:
	64	- A string representing a single line with a terminating newline or
	65	- None if the connection has been closed.
	66	- """
	67	- data = b''
	68	- while True:
	69	- packet = socket.recv(PACKET_SIZE)
	70	- if not packet: # Connection has been closed.
	71	- return None
	72	- data += packet
	73	- if b'\0' in packet:
	74	- break
	75	- # TODO Is there a better way of handling bad input than 'replace'?
	76	- text = data.decode('utf-8', errors='replace').strip('\0')
	77	- lines = text.split('\n')
	78	- return lines[0] + '\n'
	79	-
	80	-
	81	-def receive_lines(socket):
	82	- try:
	83	- data = socket.recv(PACKET_SIZE)
	84	- except BlockingIOError:
	85	- return []
	86	- if data is None: # Connection has been closed.
	87	- return None
	88	- # TODO Is there a better way of handling bad input than 'replace'?
	89	- text = data.decode('utf-8', errors='replace').strip('\0')
	90	- lines = text.split('\n')
	91	- if len(lines)==1 and not lines[0]:
	92	- return None
	93	- return lines

3d91a2a

whisper_online_server.py (deleted)

--- whisper_online_server.py

...	...	@@ -1,184 +0,0 @@
	1	-#!/usr/bin/env python3
	2	-from whisper_online import *
	3	-
	4	-import sys
	5	-import argparse
	6	-import os
	7	-import logging
	8	-import numpy as np
	9	-
	10	-logger = logging.getLogger(__name__)
	11	-parser = argparse.ArgumentParser()
	12	-
	13	-# server options
	14	-parser.add_argument("--host", type=str, default='localhost')
	15	-parser.add_argument("--port", type=int, default=43007)
	16	-parser.add_argument("--warmup-file", type=str, dest="warmup_file",
	17	- help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
	18	-
	19	-# options from whisper_online
	20	-add_shared_args(parser)
	21	-args = parser.parse_args()
	22	-
	23	-set_logging(args,logger,other="")
	24	-
	25	-# setting whisper object by args
	26	-
	27	-SAMPLING_RATE = 16000
	28	-
	29	-size = args.model
	30	-language = args.lan
	31	-asr, online = asr_factory(args)
	32	-min_chunk = args.min_chunk_size
	33	-
	34	-# warm up the ASR because the very first transcribe takes more time than the others.
	35	-# Test results in https://github.com/ufal/whisper_streaming/pull/81
	36	-msg = "Whisper is not warmed up. The first chunk processing may take longer."
	37	-if args.warmup_file:
	38	- if os.path.isfile(args.warmup_file):
	39	- a = load_audio_chunk(args.warmup_file,0,1)
	40	- asr.transcribe(a)
	41	- logger.info("Whisper is warmed up.")
	42	- else:
	43	- logger.critical("The warm up file is not available. "+msg)
	44	- sys.exit(1)
	45	-else:
	46	- logger.warning(msg)
	47	-
	48	-
	49	-######### Server objects
	50	-
	51	-import line_packet
	52	-import socket
	53	-
	54	-class Connection:
	55	- '''it wraps conn object'''
	56	- PACKET_SIZE = 32000560 # 5 minutes # was: 65536
	57	-
	58	- def __init__(self, conn):
	59	- self.conn = conn
	60	- self.last_line = ""
	61	-
	62	- self.conn.setblocking(True)
	63	-
	64	- def send(self, line):
	65	- '''it doesn't send the same line twice, because it was problematic in online-text-flow-events'''
	66	- if line == self.last_line:
	67	- return
	68	- line_packet.send_one_line(self.conn, line)
	69	- self.last_line = line
	70	-
	71	- def receive_lines(self):
	72	- in_line = line_packet.receive_lines(self.conn)
	73	- return in_line
	74	-
	75	- def non_blocking_receive_audio(self):
	76	- try:
	77	- r = self.conn.recv(self.PACKET_SIZE)
	78	- return r
	79	- except ConnectionResetError:
	80	- return None
	81	-
	82	-
	83	-import io
	84	-import soundfile
	85	-
	86	-# wraps socket and ASR object, and serves one client connection.
	87	-# next client should be served by a new instance of this object
	88	-class ServerProcessor:
	89	-
	90	- def __init__(self, c, online_asr_proc, min_chunk):
	91	- self.connection = c
	92	- self.online_asr_proc = online_asr_proc
	93	- self.min_chunk = min_chunk
	94	-
	95	- self.last_end = None
	96	-
	97	- self.is_first = True
	98	-
	99	- def receive_audio_chunk(self):
	100	- # receive all audio that is available by this time
	101	- # blocks operation if less than self.min_chunk seconds is available
	102	- # unblocks if connection is closed or a chunk is available
	103	- out = []
	104	- minlimit = self.min_chunk*SAMPLING_RATE
	105	- while sum(len(x) for x in out) < minlimit:
	106	- raw_bytes = self.connection.non_blocking_receive_audio()
	107	- if not raw_bytes:
	108	- break
	109	-# print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
	110	- sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
	111	- audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
	112	- out.append(audio)
	113	- if not out:
	114	- return None
	115	- conc = np.concatenate(out)
	116	- if self.is_first and len(conc) < minlimit:
	117	- return None
	118	- self.is_first = False
	119	- return np.concatenate(out)
	120	-
	121	- def format_output_transcript(self,o):
	122	- # output format in stdout is like:
	123	- # 0 1720 Takhle to je
	124	- # - the first two words are:
	125	- # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
	126	- # - the next words: segment transcript
	127	-
	128	- # This function differs from whisper_online.output_transcript in the following:
	129	- # succeeding [beg,end] intervals are not overlapping because ELITR protocol (implemented in online-text-flow events) requires it.
	130	- # Therefore, beg, is max of previous end and current beg outputed by Whisper.
	131	- # Usually it differs negligibly, by appx 20 ms.
	132	-
	133	- if o[0] is not None:
	134	- beg, end = o[0]*1000,o[1]*1000
	135	- if self.last_end is not None:
	136	- beg = max(beg, self.last_end)
	137	-
	138	- self.last_end = end
	139	- print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr)
	140	- return "%1.0f %1.0f %s" % (beg,end,o[2])
	141	- else:
	142	- logger.debug("No text in this segment")
	143	- return None
	144	-
	145	- def send_result(self, o):
	146	- msg = self.format_output_transcript(o)
	147	- if msg is not None:
	148	- self.connection.send(msg)
	149	-
	150	- def process(self):
	151	- # handle one client connection
	152	- self.online_asr_proc.init()
	153	- while True:
	154	- a = self.receive_audio_chunk()
	155	- if a is None:
	156	- break
	157	- self.online_asr_proc.insert_audio_chunk(a)
	158	- o = online.process_iter()
	159	- try:
	160	- self.send_result(o)
	161	- except BrokenPipeError:
	162	- logger.info("broken pipe -- connection closed?")
	163	- break
	164	-
	165	-# o = online.finish() # this should be working
	166	-# self.send_result(o)
	167	-
	168	-
	169	-
	170	-# server loop
	171	-
	172	-with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
	173	- s.bind((args.host, args.port))
	174	- s.listen(1)
	175	- logger.info('Listening on'+str((args.host, args.port)))
	176	- while True:
	177	- conn, addr = s.accept()
	178	- logger.info('Connected to client on {}'.format(addr))
	179	- connection = Connection(conn)
	180	- proc = ServerProcessor(connection, online, args.min_chunk_size)
	181	- proc.process()
	182	- conn.close()
	183	- logger.info('Connection to client closed')
	184	-logger.info('Connection closed, terminating.')

Add a comment

Open 0
Closed 0

List

...	...	@@ -12,6 +12,11 @@
12	12
13	13	![Demo Screenshot](src/demo.png)
14	14
	15	+## Code Origins
	16	+
	17	+This project reuses and extends code from the original Whisper Streaming repository:
	18	+- whisper_online.py: Contains code from whisper_streaming with the addition of the MLX Whisper backend for Apple Silicon, which is not present in the original repository.
	19	+- silero_vad_iterator.py: Originally from the Silero VAD repository, included in the whisper_streaming project.
15	20
16	21	## Installation
17	22

Delete comment