

Merge pull request #1 from QuentinFuxa/fast-api-web-interface-with-buffer
add fastapi server with live webm to pcm conversion and web page show…
Commit fc9dd80cfe43eb7065a6410c46b6621d02a4c866
--- README.md
+++ README.md
@@ -208,6 +208,51 @@

- nc is netcat with server's host and port

## Live Transcription Web Interface

This repository also includes a **FastAPI server** and an **HTML/JavaScript client** for quick testing of live speech transcription in the browser. The client uses native WebSockets and the `MediaRecorder` API to capture microphone audio in **WebM** format and send it to the server; **no additional front-end framework** is required.

### How to Launch the Server

1. **Install Dependencies**:

   ```bash
   pip install -r requirements.txt
   ```

2. **Run the FastAPI Server**:

   ```bash
   python whisper_fastapi_online_server.py --host 0.0.0.0 --port 8000
   ```

   - `--host` and `--port` let you specify the server's IP address and port.

3. **Open the Provided HTML**:

   - By default, the server's root endpoint `/` serves a simple `live_transcription.html` page.
   - Open your browser at `http://localhost:8000` (or substitute the host and port you specified).
   - The page uses vanilla JavaScript and the WebSocket API to capture your microphone and stream audio to the server in real time. If you'd rather exercise the endpoint from a script than a browser, see the test-client sketch after this list.

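A small script can also stream a prerecorded WebM file to `/ws` and print the server's replies. The sketch below is illustrative and not part of this repository: it assumes the `websockets` package (`pip install websockets`) and a `sample.webm` file with Opus audio (e.g. produced by `ffmpeg -i input.wav -c:a libopus sample.webm`); names such as `send_audio` are hypothetical.

```python
# test_ws_client.py - hypothetical test client, not included in this PR.
import asyncio
import json

import websockets  # assumed dependency: pip install websockets


async def main():
    async with websockets.connect("ws://localhost:8000/ws") as ws:

        async def send_audio():
            # Stream the file in small pieces, pacing it roughly like the
            # browser's MediaRecorder does with its 3-second blobs.
            with open("sample.webm", "rb") as f:
                while chunk := f.read(16_384):
                    await ws.send(chunk)
                    await asyncio.sleep(0.25)

        async def print_replies():
            async for message in ws:
                data = json.loads(message)
                print("confirmed:", data["transcription"], "| buffer:", data["buffer"])

        # print_replies() runs until the server closes the socket, so stop
        # the script with Ctrl+C once the transcript stops advancing.
        await asyncio.gather(send_audio(), print_replies())


asyncio.run(main())
```
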
### How the Live Interface Works

- Once you **allow microphone access**, the page records small chunks of audio using the **MediaRecorder** API in **webm/opus** format.
- These chunks are sent over a **WebSocket** to the FastAPI endpoint at `/ws`.
- The Python server decodes the `.webm` chunks on the fly using **FFmpeg** and streams the resulting PCM into **Whisper** for transcription.
- **Partial transcription** appears as soon as enough audio has been processed (the batching arithmetic is sketched after this list). Still-unvalidated text is shown in a **lighter grey color** as a preview, indicating that it is buffered partial output; once Whisper finalizes that segment, it is displayed in normal text.
- You can watch the transcription update in near real time, which is ideal for demos, prototyping, or quick debugging.

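For intuition, the batching the server performs before each Whisper call is plain byte arithmetic on the decoded stream (16 kHz, mono, 16-bit PCM). A worked sketch, assuming the server was started with `--min-chunk-size 3`; the `raw` bytes are a stand-in, not real audio:

```python
# Worked example of the server's batch-size arithmetic (illustrative only).
import numpy as np

SAMPLE_RATE = 16000        # Hz, the rate ffmpeg is asked to output
BYTES_PER_SAMPLE = 2       # s16le PCM
MIN_CHUNK_SIZE = 3.0       # seconds; assumes --min-chunk-size 3

bytes_per_batch = int(SAMPLE_RATE * MIN_CHUNK_SIZE) * BYTES_PER_SAMPLE
print(bytes_per_batch)     # 96000 bytes buffered before each Whisper call

# The same int16 -> float32 conversion the server applies to each batch:
raw = b"\x00\x00\xff\x7f\x00\x80"                       # stand-in PCM bytes
audio = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
print(audio)               # values in [-1.0, 1.0): [0., 0.99996948, -1.]
```
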
### Deploying to a Remote Server

If you want to **deploy** this setup:

1. **Host the FastAPI app** behind a production-grade ASGI setup, such as **Uvicorn fronted by Nginx**, or packaged in Docker (a launcher sketch follows at the end of this section).
2. The **HTML/JS page** can be served by the same FastAPI app or by a separate static host.
3. Users open the page in **Chrome/Firefox** (or any modern browser that supports MediaRecorder and WebSockets). Note that `live_transcription.html` connects to a hardcoded `ws://localhost:8000/ws`, so adjust that URL to match your deployment.

No additional front-end libraries or frameworks are required. The WebSocket logic in `live_transcription.html` is minimal enough to adapt for your own custom UI or to embed in other pages.
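
As an example of step 1, a launcher that runs Uvicorn behind a TLS-terminating reverse proxy might look like the sketch below. This is a hypothetical illustration, not part of this PR: `proxy_headers=True` is a real Uvicorn option that makes it honor `X-Forwarded-*` headers set by the proxy, and the proxy must additionally be configured to upgrade WebSocket connections on `/ws`. Remember also to change the client's hardcoded `ws://localhost:8000/ws` (typically to `wss://your-domain/ws`).

```python
# run_prod.py - hypothetical production launcher, not included in this PR.
import uvicorn

if __name__ == "__main__":
    uvicorn.run(
        "whisper_fastapi_online_server:app",
        host="127.0.0.1",     # bind locally; the reverse proxy terminates TLS
        port=8000,
        proxy_headers=True,   # trust X-Forwarded-For / X-Forwarded-Proto from the proxy
    )
```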

## Background

+++ src/demo.png
Binary file is not shown
+++ src/live_transcription.html
@@ -0,0 +1,111 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Audio Transcription</title>
    <style>
        body {
            font-family: 'Inter', sans-serif;
            text-align: center;
            margin: 20px;
        }
        #recordButton {
            width: 80px;
            height: 80px;
            font-size: 36px;
            border: none;
            border-radius: 50%;
            background-color: white;
            cursor: pointer;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.2);
            transition: background-color 0.3s ease, transform 0.2s ease;
        }
        #recordButton.recording {
            background-color: #ff4d4d;
            color: white;
        }
        #recordButton:active {
            transform: scale(0.95);
        }
        #transcriptions {
            margin-top: 20px;
            font-size: 18px;
            text-align: left;
        }
        .transcription {
            display: inline;
            color: black;
        }
        .buffer {
            display: inline;
            color: rgb(197, 197, 197);
        }
    </style>
</head>
<body>
    <p id="status">Click to start transcription</p>
    <button id="recordButton">🎙️</button>
    <div id="transcriptions"></div>

    <script>
        let isRecording = false, websocket, recorder;

        const statusText = document.getElementById("status");
        const recordButton = document.getElementById("recordButton");
        const transcriptionsDiv = document.getElementById("transcriptions");

        let fullTranscription = ""; // Accumulates confirmed transcription text

        function setupWebSocket() {
            // NOTE: hardcoded for local testing; point this at your server's
            // host/port (and use wss:// behind TLS) when deploying remotely.
            websocket = new WebSocket("ws://localhost:8000/ws");
            websocket.onmessage = (event) => {
                const data = JSON.parse(event.data);
                const { transcription, buffer } = data;

                // Append newly confirmed text; guard against null/undefined
                fullTranscription += transcription || "";

                // Confirmed text in black, still-buffered text in grey
                transcriptionsDiv.innerHTML = `
                    <span class="transcription">${fullTranscription}</span>
                    <span class="buffer">${buffer || ""}</span>
                `;
            };
        }

        async function startRecording() {
            const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            recorder = new MediaRecorder(stream, { mimeType: "audio/webm" });
            recorder.ondataavailable = (e) => {
                // Only send once the socket is actually open
                if (websocket && websocket.readyState === WebSocket.OPEN) {
                    websocket.send(e.data);
                }
            };
            recorder.start(3000); // Emit a webm chunk every 3 seconds
            isRecording = true;
            updateUI();
        }

        function stopRecording() {
            recorder?.stop();
            // Release the microphone so the browser's recording indicator clears
            recorder?.stream.getTracks().forEach((track) => track.stop());
            recorder = null;
            isRecording = false;
            websocket?.close();
            websocket = null;
            updateUI();
        }

        async function toggleRecording() {
            if (isRecording) stopRecording();
            else {
                setupWebSocket();
                await startRecording();
            }
        }

        function updateUI() {
            recordButton.classList.toggle("recording", isRecording);
            statusText.textContent = isRecording ? "Recording..." : "Click to start transcription";
        }

        recordButton.addEventListener("click", toggleRecording);
    </script>
</body>
</html>
\ No newline at end of file
+++ whisper_fastapi_online_server.py
@@ -0,0 +1,140 @@
import argparse
import asyncio

import numpy as np
import ffmpeg

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware

from whisper_online import asr_factory, add_shared_args

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Argument parsing
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--warmup-file", type=str, dest="warmup_file",
        help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
add_shared_args(parser)
args = parser.parse_args()

# Initialize Whisper
asr, online = asr_factory(args)

# Load demo HTML for the root endpoint
with open("live_transcription.html", "r", encoding="utf-8") as f:
    html = f.read()


@app.get("/")
async def get():
    return HTMLResponse(html)


# Streaming constants
SAMPLE_RATE = 16000
CHANNELS = 1
BYTES_PER_SAMPLE = 2  # s16le = 2 bytes per sample
# Size of one processing batch. Multiply by the (possibly fractional) chunk
# duration before truncating to int, so e.g. --min-chunk-size 0.5 does not
# collapse to a zero-byte batch.
SAMPLES_PER_CHUNK = int(SAMPLE_RATE * args.min_chunk_size)
BYTES_PER_CHUNK = SAMPLES_PER_CHUNK * BYTES_PER_SAMPLE


async def start_ffmpeg_decoder():
    """
    Start an FFmpeg process in async streaming mode that reads WebM from stdin
    and outputs raw s16le PCM on stdout. Returns the process object.

    NOTE: stderr is piped but never drained; for long-running sessions consider
    redirecting it to DEVNULL so FFmpeg cannot block on a full stderr pipe.
    """
    process = (
        ffmpeg
        .input("pipe:0", format="webm")
        .output("pipe:1", format="s16le", acodec="pcm_s16le", ac=CHANNELS, ar=str(SAMPLE_RATE))
        .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True)
    )
    return process


@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    print("WebSocket connection opened.")

    ffmpeg_process = await start_ffmpeg_decoder()
    pcm_buffer = bytearray()

    # Continuously read decoded PCM from ffmpeg stdout in a background task
    async def ffmpeg_stdout_reader():
        nonlocal pcm_buffer
        loop = asyncio.get_event_loop()
        while True:
            try:
                # stdout.read is blocking, so run it in the default executor
                chunk = await loop.run_in_executor(None, ffmpeg_process.stdout.read, 4096)
                if not chunk:  # FFmpeg closed its stdout
                    print("FFmpeg stdout closed.")
                    break

                pcm_buffer.extend(chunk)

                # Process in batches of min_chunk_size seconds
                while len(pcm_buffer) >= BYTES_PER_CHUNK:
                    batch = pcm_buffer[:BYTES_PER_CHUNK]
                    del pcm_buffer[:BYTES_PER_CHUNK]

                    # Convert int16 -> float32 in [-1, 1), the format Whisper expects
                    pcm_array = np.frombuffer(batch, dtype=np.int16).astype(np.float32) / 32768.0

                    # Feed PCM data to Whisper and fetch newly committed text
                    online.insert_audio_chunk(pcm_array)
                    transcription = online.process_iter()
                    buffer = online.to_flush(online.transcript_buffer.buffer)

                    # Return partial transcription results to the client
                    await websocket.send_json({
                        "transcription": transcription[2],
                        "buffer": buffer[2],
                    })
            except Exception as e:
                print(f"Exception in ffmpeg_stdout_reader: {e}")
                break

        print("Exiting ffmpeg_stdout_reader...")

    stdout_reader_task = asyncio.create_task(ffmpeg_stdout_reader())

    try:
        while True:
            # Receive incoming WebM audio chunks from the client
            message = await websocket.receive_bytes()
            # Pass them to ffmpeg via stdin (blocking writes; fine for small chunks)
            ffmpeg_process.stdin.write(message)
            ffmpeg_process.stdin.flush()

    except WebSocketDisconnect:
        print("WebSocket connection closed.")
    except Exception as e:
        print(f"Error in websocket loop: {e}")
    finally:
        # Clean up ffmpeg and the reader task
        try:
            ffmpeg_process.stdin.close()
        except Exception:
            pass
        stdout_reader_task.cancel()

        try:
            ffmpeg_process.stdout.close()
        except Exception:
            pass

        ffmpeg_process.wait()


if __name__ == "__main__":
    import uvicorn
    uvicorn.run("whisper_fastapi_online_server:app", host=args.host, port=args.port, reload=True)
\ No newline at end of file