

better buffer management
@e4d0f762b15b6f16c2d58debb23a2fc95b411490
--- README.md
+++ README.md
... | ... | @@ -68,7 +68,9 @@ |
68 | 68 |
python whisper_fastapi_online_server.py --host 0.0.0.0 --port 8000 |
69 | 69 |
``` |
70 | 70 |
|
71 |
- - `--host` and `--port` let you specify the server’s IP/port. |
|
71 |
+ - `--host` and `--port` let you specify the server’s IP/port. |
|
72 |
+ - `--min-chunk-size` sets the minimum chunk size for audio processing. Make sure this value aligns with the chunk size selected in the frontend. If not aligned, the system will work but may unnecessarily over-process audio data. |
|
73 |
+ - For a full list of configurable options, run `python whisper_fastapi_online_server.py -h` |
|
72 | 74 |
|
73 | 75 |
4. **Open the Provided HTML**: |
74 | 76 |
|
... | ... | @@ -88,7 +90,7 @@ |
88 | 90 |
|
89 | 91 |
If you want to **deploy** this setup: |
90 | 92 |
|
91 |
-1. **Host the FastAPI app** behind a production-grade HTTP(S) server (like **Uvicorn + Nginx** or Docker). |
|
93 |
+1. **Host the FastAPI app** behind a production-grade HTTP(S) server (like **Uvicorn + Nginx** or Docker). If you use HTTPS, use "wss" instead of "ws" in WebSocket URL. |
|
92 | 94 |
2. The **HTML/JS page** can be served by the same FastAPI app or a separate static host. |
93 | 95 |
3. Users open the page in **Chrome/Firefox** (any modern browser that supports MediaRecorder + WebSocket). |
94 | 96 |
|
--- whisper_fastapi_online_server.py
+++ whisper_fastapi_online_server.py
... | ... | @@ -20,27 +20,24 @@ |
20 | 20 |
) |
21 | 21 |
|
22 | 22 |
|
23 |
-# Argument parsing |
|
24 |
-parser = argparse.ArgumentParser() |
|
25 |
-parser.add_argument("--host", type=str, default='localhost') |
|
26 |
-parser.add_argument("--port", type=int, default=8000) |
|
23 |
+parser = argparse.ArgumentParser(description="Whisper FastAPI Online Server") |
|
24 |
+parser.add_argument("--host", type=str, default='localhost', help="The host address to bind the server to.") |
|
25 |
+parser.add_argument("--port", type=int, default=8000, help="The port number to bind the server to.") |
|
27 | 26 |
parser.add_argument("--warmup-file", type=str, dest="warmup_file", |
28 | 27 |
help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .") |
29 | 28 |
add_shared_args(parser) |
30 | 29 |
args = parser.parse_args() |
31 | 30 |
|
32 |
-# Initialize Whisper |
|
33 | 31 |
asr, online = asr_factory(args) |
34 | 32 |
|
35 | 33 |
# Load demo HTML for the root endpoint |
36 |
-with open("live_transcription.html", "r") as f: |
|
34 |
+with open("src/live_transcription.html", "r") as f: |
|
37 | 35 |
html = f.read() |
38 | 36 |
|
39 | 37 |
@app.get("/") |
40 | 38 |
async def get(): |
41 | 39 |
return HTMLResponse(html) |
42 | 40 |
|
43 |
-# Streaming constants |
|
44 | 41 |
SAMPLE_RATE = 16000 |
45 | 42 |
CHANNELS = 1 |
46 | 43 |
SAMPLES_PER_SEC = SAMPLE_RATE * int(args.min_chunk_size) |
... | ... | @@ -67,11 +64,11 @@ |
67 | 64 |
|
68 | 65 |
ffmpeg_process = await start_ffmpeg_decoder() |
69 | 66 |
pcm_buffer = bytearray() |
70 |
- |
|
71 | 67 |
# Continuously read decoded PCM from ffmpeg stdout in a background task |
72 | 68 |
async def ffmpeg_stdout_reader(): |
73 | 69 |
nonlocal pcm_buffer |
74 | 70 |
loop = asyncio.get_event_loop() |
71 |
+ full_transcription = "" |
|
75 | 72 |
while True: |
76 | 73 |
try: |
77 | 74 |
chunk = await loop.run_in_executor(None, ffmpeg_process.stdout.read, 4096) |
... | ... | @@ -81,7 +78,6 @@ |
81 | 78 |
|
82 | 79 |
pcm_buffer.extend(chunk) |
83 | 80 |
|
84 |
- # Process in 3-second batches |
|
85 | 81 |
while len(pcm_buffer) >= BYTES_PER_SEC: |
86 | 82 |
three_sec_chunk = pcm_buffer[:BYTES_PER_SEC] |
87 | 83 |
del pcm_buffer[:BYTES_PER_SEC] |
... | ... | @@ -89,15 +85,17 @@ |
89 | 85 |
# Convert int16 -> float32 |
90 | 86 |
pcm_array = np.frombuffer(three_sec_chunk, dtype=np.int16).astype(np.float32) / 32768.0 |
91 | 87 |
|
92 |
- # Send PCM data to Whisper |
|
93 | 88 |
online.insert_audio_chunk(pcm_array) |
94 |
- transcription = online.process_iter() |
|
95 |
- buffer = online.to_flush(online.transcript_buffer.buffer) |
|
96 |
- |
|
97 |
- # Return partial transcription results to the client |
|
89 |
+ transcription = online.process_iter()[2] |
|
90 |
+ if args.vac: |
|
91 |
+ buffer = online.online.to_flush(online.online.transcript_buffer.buffer)[2] # We need to access the underlying online object to get the buffer |
|
92 |
+ else: |
|
93 |
+ buffer = online.to_flush(online.transcript_buffer.buffer)[2] |
|
94 |
+ if buffer in full_transcription: # With VAC, the buffer is not updated until the next chunk is processed |
|
95 |
+ buffer = "" |
|
98 | 96 |
await websocket.send_json({ |
99 |
- "transcription": transcription[2], |
|
100 |
- "buffer": buffer[2] |
|
97 |
+ "transcription": transcription, |
|
98 |
+ "buffer": buffer |
|
101 | 99 |
}) |
102 | 100 |
except Exception as e: |
103 | 101 |
print(f"Exception in ffmpeg_stdout_reader: {e}") |
Add a comment
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?