
+++ README.md
... | ... | @@ -0,0 +1,1 @@ |
1 | +# whisper_client |
+++ client.html
... | ... | @@ -0,0 +1,483 @@ |
1 | +<!DOCTYPE html> | |
2 | +<html lang="en"> | |
3 | +<head> | |
4 | + <meta charset="UTF-8"> | |
5 | + <title>Real-Time Audio Transcription with VAD and Volume Meter</title> | |
6 | + <style> | |
7 | + body { | |
8 | + font-family: Arial, sans-serif; | |
9 | + margin: 40px; | |
10 | + background-color: #f5f5f5; | |
11 | + } | |
12 | + h1 { | |
13 | + text-align: center; | |
14 | + } | |
15 | + #controls { | |
16 | + text-align: center; | |
17 | + margin-bottom: 20px; | |
18 | + } | |
19 | + #recordBtn { | |
20 | + padding: 15px 30px; | |
21 | + font-size: 18px; | |
22 | + border: none; | |
23 | + border-radius: 5px; | |
24 | + background-color: #28a745; /* Green */ | |
25 | + color: white; | |
26 | + cursor: pointer; | |
27 | + transition: background-color 0.3s ease; | |
28 | + } | |
29 | + #recordBtn.recording { | |
30 | + background-color: #dc3545; /* Red */ | |
31 | + } | |
32 | + #transcriptions { | |
33 | + max-width: 800px; | |
34 | + margin: 0 auto; | |
35 | + padding: 20px; | |
36 | + background-color: white; | |
37 | + border-radius: 8px; | |
38 | + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); | |
39 | + height: 600px; | |
40 | + overflow-y: auto; | |
41 | + white-space: pre-wrap; | |
42 | + font-size: 16px; | |
43 | + } | |
44 | + .transcription { | |
45 | + margin-bottom: 10px; | |
46 | + } | |
47 | + .notice { | |
48 | + color: #dc3545; /* Red */ | |
49 | + font-style: italic; | |
50 | + } | |
51 | + /* Volume Meter Styles */ | |
52 | + #volumeMeter { | |
53 | + width: 300px; | |
54 | + height: 30px; | |
55 | + background-color: #e0e0e0; | |
56 | + border-radius: 15px; | |
57 | + overflow: hidden; | |
58 | + margin: 20px auto; | |
59 | + position: relative; | |
60 | + } | |
61 | + #volumeBar { | |
62 | + height: 100%; | |
63 | + width: 0%; | |
64 | + background-color: #28a745; | |
65 | + transition: width 0.1s ease, background-color 0.1s ease; | |
66 | + } | |
67 | + </style> | |
68 | +</head> | |
69 | +<body> | |
70 | + <h1>Real-Time Audio Transcription with VAD and Volume Meter</h1> | |
71 | + <div id="controls"> | |
72 | + <button id="recordBtn">Start Recording</button> | |
73 | + </div> | |
74 | + <!-- Volume Meter --> | |
75 | + <div id="volumeMeter"> | |
76 | + <div id="volumeBar"></div> | |
77 | + </div> | |
78 | + <div id="transcriptions"></div> | |
79 | + | |
80 | + <!-- Include ONNX Runtime Web --> | |
81 | + <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js"></script> | |
82 | + <!-- Include VAD-Web --> | |
83 | + <script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js"></script> | |
84 | + | |
85 | + <script> | |
86 | + // Elements | |
87 | + const recordBtn = document.getElementById('recordBtn'); | |
88 | + const transcriptionsDiv = document.getElementById('transcriptions'); | |
89 | + const volumeBar = document.getElementById('volumeBar'); | |
90 | + | |
91 | + // State Variables | |
92 | + let isRecording = false; | |
93 | + let vadInstance = null; // Renamed to avoid conflict | |
94 | + let ws = null; | |
95 | + let audioContext = null; | |
96 | + let analyser = null; | |
97 | + let microphoneStream = null; | |
98 | + let dataArray = null; | |
99 | + let animationId = null; | |
100 | + let reconnectInterval = 3000; // 3 seconds | |
101 | + let shouldReconnect = false; // Flag to control reconnection | |
102 | + | |
103 | + // Configuration | |
104 | + const WS_ENDPOINT = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko"; // Ensure this is correct | |
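 | + // Assumed wire protocol, inferred from the handlers below: the client streams binary 16-bit PCM | |
 | + // speech segments, and the server replies with JSON objects shaped like | |
 | + // {"task": "transcribe", "words": [{"word": "...", "probability": 0.9}]}. | |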
105 | + | |
106 | + // Buffer to hold incoming data for JSON parsing | |
107 | + let incomingBuffer = ''; | |
108 | + | |
109 | + // Utility Functions | |
110 | + | |
111 | + /** | |
112 | + * Logs transcription text with colored words based on probability. | |
113 | + * @param {Array} words - Array of word objects with 'word' and 'probability'. | |
114 | + */ | |
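 | + // Example input (hypothetical words from the server): | |
 | + //   [{ word: "hello", probability: 0.93 }, { word: "world", probability: 0.41 }] | |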
115 | + function logTranscription(words) { | |
116 | + const transcriptionLine = document.createElement('div'); | |
117 | + transcriptionLine.classList.add('transcription'); | |
118 | + | |
119 | + words.forEach(wordObj => { | |
120 | + const span = document.createElement('span'); | |
121 | + span.textContent = wordObj.word + ' '; // Add space after each word | |
122 | + | |
123 | + // Calculate hue: 0 (red) to 240 (blue) | |
124 | + const hue = wordObj.probability * 240; | |
125 | + span.style.color = `hsl(${hue}, 100%, 50%)`; | |
126 | + | |
127 | + transcriptionLine.appendChild(span); | |
128 | + }); | |
129 | + | |
130 | + transcriptionsDiv.appendChild(transcriptionLine); | |
131 | + transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
132 | + } | |
133 | + | |
134 | + /** | |
135 | + * Logs notice messages (e.g., connection status, errors). | |
136 | + * @param {string} text - The notice text to display. | |
137 | + */ | |
138 | + function logNotice(text) { | |
139 | + const p = document.createElement('p'); | |
140 | + p.classList.add('notice'); | |
141 | + p.textContent = text; | |
142 | + transcriptionsDiv.appendChild(p); | |
143 | + transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
144 | + } | |
145 | + | |
146 | + /** | |
147 | + * Converts Float32 audio data to Int16 PCM format. | |
148 | + * @param {Float32Array} buffer - The audio buffer in Float32 format. | |
149 | + * @returns {Int16Array} - The audio buffer in Int16 format. | |
150 | + */ | |
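 | + // Example: convertFloat32ToInt16(Float32Array.of(0, 0.5, 1)) -> Int16Array [0, 16383, 32767] | |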
151 | + function convertFloat32ToInt16(buffer) { | |
152 | + let l = buffer.length; | |
153 | + const buf = new Int16Array(l); | |
154 | + while (l--) { | |
155 | + buf[l] = Math.max(-1, Math.min(1, buffer[l])) * 0x7FFF; // Clamp to [-1, 1] before scaling | |
156 | + } | |
157 | + return buf; | |
158 | + } | |
159 | + | |
160 | + /** | |
161 | + * Extracts JSON objects from a concatenated string. | |
162 | + * @param {string} buffer - The concatenated JSON string. | |
163 | + * @returns {Array} - An array of parsed JSON objects. | |
164 | + */ | |
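 | + // Example: extractJSONObjects('{"task":"transcribe","words":[]}{"task"') parses the first object | |
 | + // and leaves the unfinished fragment '{"task"' in incomingBuffer for the next message. | |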
165 | + function extractJSONObjects(buffer) { | |
166 | + const objects = []; | |
167 | + let braceStack = 0; | |
168 | + let inString = false; | |
169 | + let escape = false; | |
170 | + let lastSplit = 0; | |
171 | + | |
172 | + for (let i = 0; i < buffer.length; i++) { | |
173 | + const char = buffer[i]; | |
174 | + | |
175 | + if (char === '"' && !escape) { | |
176 | + inString = !inString; | |
177 | + } | |
178 | + | |
179 | + if (!inString) { | |
180 | + if (char === '{') { | |
181 | + braceStack++; | |
182 | + } else if (char === '}') { | |
183 | + braceStack--; | |
184 | + if (braceStack === 0) { | |
185 | + const jsonString = buffer.slice(lastSplit, i + 1); | |
186 | + try { | |
187 | + const jsonObj = JSON.parse(jsonString); | |
188 | + objects.push(jsonObj); | |
189 | + } catch (e) { | |
190 | + console.error('Failed to parse JSON:', e); | |
191 | + } | |
192 | + lastSplit = i + 1; | |
193 | + } | |
194 | + } | |
195 | + } | |
196 | + | |
197 | + // Handle escape characters | |
198 | + if (char === '\\' && !escape) { | |
199 | + escape = true; | |
200 | + } else { | |
201 | + escape = false; | |
202 | + } | |
203 | + } | |
204 | + | |
205 | + // Return any remaining buffer that wasn't parsed | |
206 | + incomingBuffer = buffer.slice(lastSplit); | |
207 | + return objects; | |
208 | + } | |
209 | + | |
210 | + // WebSocket Handlers | |
211 | + | |
212 | + /** | |
213 | + * Sets up the WebSocket connection and defines event handlers. | |
214 | + */ | |
215 | + function setupWebSocket() { | |
216 | + ws = new WebSocket(WS_ENDPOINT); | |
217 | + ws.binaryType = 'arraybuffer'; | |
218 | + | |
219 | + ws.onopen = () => { | |
220 | + console.log('WebSocket connection opened.'); | |
221 | + logNotice("WebSocket connection established."); | |
222 | + }; | |
223 | + | |
224 | + ws.onmessage = (event) => { | |
225 | + let messageData = ''; | |
226 | + | |
227 | + if (typeof event.data === 'string') { | |
228 | + messageData = event.data; | |
229 | + } else if (event.data instanceof ArrayBuffer) { | |
230 | + const decoder = new TextDecoder('utf-8'); | |
231 | + messageData = decoder.decode(event.data); | |
232 | + } else { | |
233 | + console.warn('Unsupported message format:', event.data); | |
234 | + return; | |
235 | + } | |
236 | + | |
237 | + // Append incoming data to buffer | |
238 | + incomingBuffer += messageData; | |
239 | + | |
240 | + // Extract JSON objects | |
241 | + const jsonObjects = extractJSONObjects(incomingBuffer); | |
242 | + | |
243 | + // Process each JSON object | |
244 | + jsonObjects.forEach(obj => { | |
245 | + if (obj.task === "transcribe" && Array.isArray(obj.words)) { | |
246 | + logTranscription(obj.words); | |
247 | + } | |
248 | + }); | |
249 | + }; | |
250 | + | |
251 | + ws.onclose = (event) => { | |
252 | + console.log('WebSocket connection closed:', event); | |
253 | + logNotice("WebSocket connection closed."); | |
254 | + ws = null; | |
255 | + | |
256 | + if (isRecording && shouldReconnect) { | |
257 | + logNotice("Attempting to reconnect..."); | |
258 | + setTimeout(() => { | |
259 | + setupWebSocket(); | |
260 | + }, reconnectInterval); | |
261 | + } else if (isRecording) { | |
262 | + logNotice("Transcription session ended."); | |
263 | + stopRecording(true); // true indicates server-initiated stop | |
264 | + } | |
265 | + }; | |
266 | + | |
267 | + ws.onerror = (error) => { | |
268 | + console.error('WebSocket error:', error); | |
269 | + logNotice("WebSocket encountered an error."); | |
270 | + }; | |
271 | + } | |
272 | + | |
273 | + // Voice Activity Detection Setup | |
274 | + | |
275 | + /** | |
276 | + * Initializes the Voice Activity Detector (VAD) using Silero VAD. | |
277 | + */ | |
278 | + async function initializeVAD(stream) { | |
279 | + try { | |
280 | + vadInstance = await vad.MicVAD.new({ | |
281 | + stream: stream, // Pass the existing MediaStream to avoid multiple microphone accesses | |
282 | + onSpeechStart: () => { | |
283 | + console.log("Speech start detected"); | |
284 | + logNotice("Speech detected..."); | |
285 | + }, | |
286 | + onSpeechEnd: (audio) => { | |
287 | + console.log("Speech end detected"); | |
288 | + logNotice("Sending speech segment to server..."); | |
289 | + | |
290 | + // Convert Float32Array to Int16Array | |
291 | + const int16Audio = convertFloat32ToInt16(audio); | |
292 | + | |
293 | + // Send the audio buffer via WebSocket | |
294 | + if (ws && ws.readyState === WebSocket.OPEN) { | |
295 | + ws.send(int16Audio.buffer); | |
296 | + } else { | |
297 | + console.warn('WebSocket is not open. Cannot send audio.'); | |
298 | + logNotice("WebSocket is not open. Audio segment not sent."); | |
299 | + } | |
300 | + } | |
301 | + }); | |
302 | + } catch (error) { | |
303 | + console.error('Error initializing VAD:', error); | |
304 | + logNotice("Error initializing Voice Activity Detection."); | |
305 | + } | |
306 | + } | |
307 | + | |
308 | + // Volume Meter Setup | |
309 | + | |
310 | + /** | |
311 | + * Sets up the volume meter using the Web Audio API. | |
312 | + */ | |
313 | + async function setupVolumeMeter(stream) { | |
314 | + try { | |
315 | + // Initialize AudioContext | |
316 | + audioContext = new (window.AudioContext || window.webkitAudioContext)(); | |
317 | + | |
318 | + // Create MediaStreamSource from the existing stream | |
319 | + microphoneStream = audioContext.createMediaStreamSource(stream); | |
320 | + | |
321 | + // Create AnalyserNode | |
322 | + analyser = audioContext.createAnalyser(); | |
323 | + analyser.fftSize = 512; | |
324 | + const bufferLength = analyser.frequencyBinCount; | |
325 | + dataArray = new Uint8Array(bufferLength); | |
326 | + | |
327 | + // Connect microphone to analyser | |
328 | + microphoneStream.connect(analyser); | |
329 | + | |
330 | + // Start visualizing | |
331 | + visualize(); | |
332 | + } catch (error) { | |
333 | + console.error('Error setting up volume meter:', error); | |
334 | + logNotice("Error setting up volume meter."); | |
335 | + } | |
336 | + } | |
337 | + | |
338 | + /** | |
339 | + * Visualizes the volume level on the volume meter. | |
340 | + */ | |
341 | + function visualize() { | |
342 | + const updateVolume = () => { | |
343 | + analyser.getByteFrequencyData(dataArray); | |
344 | + let sum = 0; | |
345 | + for (let i = 0; i < dataArray.length; i++) { | |
346 | + sum += dataArray[i]; | |
347 | + } | |
348 | + const average = sum / dataArray.length; | |
349 | + const volume = average / 255; // Normalize to [0,1] | |
350 | + | |
351 | + // Update the volume bar width | |
352 | + volumeBar.style.width = `${volume * 100}%`; | |
353 | + | |
354 | + // Change color based on volume level (green to red) | |
355 | + const hue = (1 - volume) * 120; // 120 (green) to 0 (red) | |
356 | + volumeBar.style.backgroundColor = `hsl(${hue}, 100%, 50%)`; | |
357 | + | |
358 | + animationId = requestAnimationFrame(updateVolume); | |
359 | + }; | |
360 | + | |
361 | + updateVolume(); | |
362 | + } | |
363 | + | |
364 | + /** | |
365 | + * Stops the volume meter visualization. | |
366 | + */ | |
367 | + function stopVolumeMeter() { | |
368 | + if (animationId) { | |
369 | + cancelAnimationFrame(animationId); | |
370 | + animationId = null; | |
371 | + } | |
372 | + if (volumeBar) { | |
373 | + volumeBar.style.width = '0%'; | |
374 | + volumeBar.style.backgroundColor = '#28a745'; // Reset to green | |
375 | + } | |
376 | + if (analyser) { | |
377 | + analyser.disconnect(); | |
378 | + analyser = null; | |
379 | + } | |
380 | + if (microphoneStream) { | |
381 | + microphoneStream.disconnect(); | |
382 | + microphoneStream = null; | |
383 | + } | |
384 | + if (audioContext) { | |
385 | + audioContext.close(); | |
386 | + audioContext = null; | |
387 | + } | |
388 | + } | |
389 | + | |
390 | + // Recording Control Functions | |
391 | + | |
392 | + /** | |
393 | + * Starts the Voice Activity Detection, Volume Meter, and WebSocket connection. | |
394 | + */ | |
395 | + async function startRecording() { | |
396 | + try { | |
397 | + // Request microphone access once | |
398 | + const stream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false }); | |
399 | + | |
400 | + // Set up Volume Meter | |
401 | + await setupVolumeMeter(stream); | |
402 | + | |
403 | + // Initialize VAD with the same stream | |
404 | + await initializeVAD(stream); | |
405 | + | |
406 | + // Set up WebSocket | |
407 | + shouldReconnect = true; // Enable reconnection attempts | |
408 | + setupWebSocket(); | |
409 | + | |
410 | + // Start VAD | |
411 | + if (vadInstance) { | |
412 | + vadInstance.start(); | |
413 | + } | |
414 | + | |
415 | + // Update UI | |
416 | + isRecording = true; | |
417 | + recordBtn.textContent = 'Stop Recording'; | |
418 | + recordBtn.classList.add('recording'); | |
419 | + logNotice("Recording started. Speak into your microphone."); | |
420 | + } catch (error) { | |
421 | + console.error('Error starting recording:', error); | |
422 | + logNotice("Error starting recording. Please try again."); | |
423 | + } | |
424 | + } | |
425 | + | |
426 | + /** | |
427 | + * Stops the Voice Activity Detection, Volume Meter, and cleans up resources. | |
428 | + * @param {boolean} serverInitiated - Indicates if the stop was triggered by the server. | |
429 | + */ | |
430 | + function stopRecording(serverInitiated = false) { | |
431 | + if (!isRecording) return; | |
432 | + | |
433 | + // Stop VAD | |
434 | + if (vadInstance) { | |
435 | + vadInstance.pause(); | |
436 | + vadInstance = null; | |
437 | + } | |
438 | + | |
439 | + // Stop Volume Meter | |
440 | + stopVolumeMeter(); | |
441 | + | |
442 | + // Prevent reconnection if stopping manually | |
443 | + if (!serverInitiated) { | |
444 | + shouldReconnect = false; | |
445 | + } | |
446 | + | |
447 | + // Close WebSocket if not server-initiated | |
448 | + if (!serverInitiated && ws && ws.readyState === WebSocket.OPEN) { | |
449 | + ws.send(JSON.stringify({ action: "terminate" })); | |
450 | + logNotice("Termination signal sent to server."); | |
451 | + } | |
452 | + | |
453 | + // Close WebSocket | |
454 | + if (ws) { | |
455 | + ws.close(); | |
456 | + ws = null; | |
457 | + } | |
458 | + | |
459 | + // Reset recording state | |
460 | + isRecording = false; | |
461 | + recordBtn.textContent = 'Start Recording'; | |
462 | + recordBtn.classList.remove('recording'); | |
463 | + logNotice("Recording stopped."); | |
464 | + } | |
465 | + | |
466 | + // Button Event Listener | |
467 | + | |
468 | + /** | |
469 | + * Toggles recording state when the record button is clicked. | |
470 | + */ | |
471 | + recordBtn.addEventListener('click', () => { | |
472 | + if (!isRecording) { | |
473 | + startRecording().catch(error => { | |
474 | + console.error('Error starting recording:', error); | |
475 | + logNotice("Error starting recording. Please try again."); | |
476 | + }); | |
477 | + } else { | |
478 | + stopRecording(); | |
479 | + } | |
480 | + }); | |
481 | + </script> | |
482 | +</body> | |
483 | +</html> |
+++ client_with_openAI.html
... | ... | @@ -0,0 +1,643 @@ |
1 | +<!DOCTYPE html> | |
2 | +<html lang="en"> | |
3 | +<head> | |
4 | + <meta charset="UTF-8"> | |
5 | + <title>Real-Time Audio Transcription with VAD and Volume Meter</title> | |
6 | + <style> | |
7 | + /* Your existing CSS styles */ | |
8 | + body { | |
9 | + --indicator-color: black; | |
10 | + background: radial-gradient(black 55%, var(--indicator-color)); | |
11 | + min-height: 100vh; | |
12 | + color: white; | |
13 | + margin: 0; | |
14 | + font-family: Arial, sans-serif; | |
15 | + } | |
16 | + h1 { | |
17 | + text-align: center; | |
18 | + margin-top: 20px; | |
19 | + } | |
20 | + #controls { | |
21 | + text-align: center; | |
22 | + margin: 20px; | |
23 | + } | |
24 | + #toggle_vad_button { | |
25 | + padding: 10px 20px; | |
26 | + font-size: 16px; | |
27 | + border: none; | |
28 | + border-radius: 5px; | |
29 | + background-color: #28a745; /* Green */ | |
30 | + color: white; | |
31 | + cursor: pointer; | |
32 | + transition: background-color 0.3s ease; | |
33 | + } | |
34 | + #toggle_vad_button.recording { | |
35 | + background-color: #dc3545; /* Red */ | |
36 | + } | |
37 | + #indicator { | |
38 | + text-align: center; | |
39 | + margin: 10px; | |
40 | + font-size: 18px; | |
41 | + } | |
42 | + #playlist { | |
43 | + max-width: 800px; | |
44 | + margin: 0 auto; | |
45 | + padding: 20px; | |
46 | + background-color: rgba(255, 255, 255, 0.1); | |
47 | + border-radius: 8px; | |
48 | + height: 400px; | |
49 | + overflow-y: scroll; | |
50 | + list-style: none; | |
51 | + padding-left: 0; | |
52 | + } | |
53 | + #playlist li { | |
54 | + margin-bottom: 10px; | |
55 | + opacity: 0; | |
56 | + animation: fadeIn 1s forwards; | |
57 | + } | |
58 | + #playlist li.newItem { | |
59 | + border-left: 4px solid #28a745; | |
60 | + padding-left: 10px; | |
61 | + } | |
62 | + .transcription { | |
63 | + color: white; | |
64 | + font-size: 16px; | |
65 | + } | |
66 | + .notice { | |
67 | + color: #dc3545; /* Red */ | |
68 | + font-style: italic; | |
69 | + } | |
70 | + @keyframes fadeIn { | |
71 | + to { | |
72 | + opacity: 1; | |
73 | + } | |
74 | + } | |
75 | + </style> | |
76 | +</head> | |
77 | +<body> | |
78 | + <h1>Real-Time Audio Transcription with VAD and Volume Meter</h1> | |
79 | + <div id="controls"> | |
80 | + <button id="toggle_vad_button" onclick="window.toggleVAD()" disabled>START VAD</button> | |
81 | + </div> | |
82 | + <div id="indicator">VAD is <span style="color: red">LOADING</span></div> | |
83 | + <ol id="playlist" reversed></ol> | |
84 | + | |
85 | + <!-- Include ONNX Runtime Web --> | |
86 | + <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js"></script> | |
87 | + <!-- Include VAD-Web --> | |
88 | + <script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js"></script> | |
89 | + | |
90 | + <script type="module"> | |
91 | + import { interpolateInferno } from "https://cdn.skypack.dev/d3-scale-chromatic@3"; | |
92 | + | |
93 | + // Elements | |
94 | + const recordBtn = document.getElementById('toggle_vad_button'); | |
95 | + const transcriptionsDiv = document.getElementById('playlist'); | |
96 | + const indicator = document.getElementById('indicator'); | |
97 | + | |
98 | + // State Variables | |
99 | + let isRecording = false; | |
100 | + let vadInstance = null; | |
101 | + let audioContext = null; | |
102 | + let analyser = null; | |
103 | + let microphoneStream = null; | |
104 | + let dataArray = null; | |
105 | + let animationId = null; | |
106 | + let isSpeaking = false; | |
107 | + let audioBuffer = []; | |
108 | + let sendAudioInterval = null; | |
109 | + const SEND_INTERVAL_MS = 1000; // 1 second | |
110 | + let ws = null; | |
111 | + let incomingBuffer = ''; | |
112 | + let fullTranscription = ''; // To accumulate full transcription | |
 | + let reconnectInterval = 3000; // 3 seconds between reconnection attempts (used by ws.onclose) | |
 | + let shouldReconnect = false; // Flag to control reconnection (used by ws.onclose) | |
113 | + | |
114 | + // Configuration | |
115 | + const WS_ENDPOINT = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko"; // Ensure this is correct | |
116 | + const BACKEND_UPLOAD_URL = "http://localhost:3000/upload-audio"; // Replace with your backend URL | |
117 | + const BACKEND_LLM_URL = "http://localhost:3000/process-transcription"; // Replace with your backend URL | |
118 | + | |
119 | + // Utility Functions | |
120 | + | |
121 | + /** | |
122 | + * Logs transcription text with colored words based on probability. | |
123 | + * @param {Array} words - Array of word objects with 'word' and 'probability'. | |
124 | + */ | |
125 | + function logTranscription(words) { | |
126 | + const transcriptionLine = document.createElement('div'); | |
127 | + transcriptionLine.classList.add('transcription'); | |
128 | + | |
129 | + words.forEach(wordObj => { | |
130 | + const span = document.createElement('span'); | |
131 | + span.textContent = wordObj.word + ' '; // Add space after each word | |
132 | + | |
133 | + // Calculate hue: 0 (red) to 240 (blue) | |
134 | + const hue = wordObj.probability * 240; | |
135 | + span.style.color = `hsl(${hue}, 100%, 50%)`; | |
136 | + | |
137 | + transcriptionLine.appendChild(span); | |
138 | + fullTranscription += wordObj.word + ' '; | |
139 | + }); | |
140 | + | |
141 | + transcriptionsDiv.prepend(transcriptionLine); | |
142 | + transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
143 | + } | |
144 | + | |
145 | + /** | |
146 | + * Logs notice messages (e.g., connection status, errors). | |
147 | + * @param {string} text - The notice text to display. | |
148 | + */ | |
149 | + function logNotice(text) { | |
150 | + const p = document.createElement('p'); | |
151 | + p.classList.add('notice'); | |
152 | + p.textContent = text; | |
153 | + transcriptionsDiv.prepend(p); | |
154 | + transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
155 | + } | |
156 | + | |
157 | + /** | |
158 | + * Converts Float32 audio data to Int16 PCM format. | |
159 | + * @param {Float32Array} buffer - The audio buffer in Float32 format. | |
160 | + * @returns {Int16Array} - The audio buffer in Int16 format. | |
161 | + */ | |
162 | + function convertFloat32ToInt16(buffer) { | |
163 | + let l = buffer.length; | |
164 | + const buf = new Int16Array(l); | |
165 | + while (l--) { | |
166 | + buf[l] = Math.max(-1, Math.min(1, buffer[l])) * 0x7FFF; // Clamp to [-1, 1] before scaling | |
167 | + } | |
168 | + return buf; | |
169 | + } | |
170 | + | |
171 | + /** | |
172 | + * Extracts JSON objects from a concatenated string. | |
173 | + * @param {string} buffer - The concatenated JSON string. | |
174 | + * @returns {Array} - An array of parsed JSON objects. | |
175 | + */ | |
176 | + function extractJSONObjects(buffer) { | |
177 | + const objects = []; | |
178 | + let braceStack = 0; | |
179 | + let inString = false; | |
180 | + let escape = false; | |
181 | + let lastSplit = 0; | |
182 | + | |
183 | + for (let i = 0; i < buffer.length; i++) { | |
184 | + const char = buffer[i]; | |
185 | + | |
186 | + if (char === '"' && !escape) { | |
187 | + inString = !inString; | |
188 | + } | |
189 | + | |
190 | + if (!inString) { | |
191 | + if (char === '{') { | |
192 | + braceStack++; | |
193 | + } else if (char === '}') { | |
194 | + braceStack--; | |
195 | + if (braceStack === 0) { | |
196 | + const jsonString = buffer.slice(lastSplit, i + 1); | |
197 | + try { | |
198 | + const jsonObj = JSON.parse(jsonString); | |
199 | + objects.push(jsonObj); | |
200 | + } catch (e) { | |
201 | + console.error('Failed to parse JSON:', e); | |
202 | + } | |
203 | + lastSplit = i + 1; | |
204 | + } | |
205 | + } | |
206 | + } | |
207 | + | |
208 | + // Handle escape characters | |
209 | + if (char === '\\' && !escape) { | |
210 | + escape = true; | |
211 | + } else { | |
212 | + escape = false; | |
213 | + } | |
214 | + } | |
215 | + | |
216 | + // Return any remaining buffer that wasn't parsed | |
217 | + incomingBuffer = buffer.slice(lastSplit); | |
218 | + return objects; | |
219 | + } | |
220 | + | |
221 | + // WebSocket Handlers | |
222 | + | |
223 | + /** | |
224 | + * Sets up the WebSocket connection and defines event handlers. | |
225 | + */ | |
226 | + function setupWebSocket() { | |
227 | + ws = new WebSocket(WS_ENDPOINT); | |
228 | + ws.binaryType = 'arraybuffer'; | |
229 | + | |
230 | + ws.onopen = () => { | |
231 | + console.log('WebSocket connection opened.'); | |
232 | + logNotice("WebSocket connection established."); | |
233 | + }; | |
234 | + | |
235 | + ws.onmessage = (event) => { | |
236 | + let messageData = ''; | |
237 | + | |
238 | + if (typeof event.data === 'string') { | |
239 | + messageData = event.data; | |
240 | + } else if (event.data instanceof ArrayBuffer) { | |
241 | + const decoder = new TextDecoder('utf-8'); | |
242 | + messageData = decoder.decode(event.data); | |
243 | + } else { | |
244 | + console.warn('Unsupported message format:', event.data); | |
245 | + return; | |
246 | + } | |
247 | + | |
248 | + // Append incoming data to buffer | |
249 | + incomingBuffer += messageData; | |
250 | + | |
251 | + // Extract JSON objects | |
252 | + const jsonObjects = extractJSONObjects(incomingBuffer); | |
253 | + | |
254 | + // Process each JSON object | |
255 | + jsonObjects.forEach(obj => { | |
256 | + if (obj.task === "transcribe" && Array.isArray(obj.words)) { | |
257 | + logTranscription(obj.words); | |
258 | + } | |
259 | + }); | |
260 | + }; | |
261 | + | |
262 | + ws.onclose = (event) => { | |
263 | + console.log('WebSocket connection closed:', event); | |
264 | + logNotice("WebSocket connection closed."); | |
265 | + ws = null; | |
266 | + | |
267 | + if (isRecording && shouldReconnect) { | |
268 | + logNotice("Attempting to reconnect..."); | |
269 | + setTimeout(() => { | |
270 | + setupWebSocket(); | |
271 | + }, reconnectInterval); | |
272 | + } else if (isRecording) { | |
273 | + logNotice("Transcription session ended."); | |
274 | + stopRecording(true); // true indicates server-initiated stop | |
275 | + } | |
276 | + }; | |
277 | + | |
278 | + ws.onerror = (error) => { | |
279 | + console.error('WebSocket error:', error); | |
280 | + logNotice("WebSocket encountered an error."); | |
281 | + }; | |
282 | + } | |
283 | + | |
284 | + // Voice Activity Detection Setup | |
285 | + | |
286 | + /** | |
287 | + * Initializes the Voice Activity Detector (VAD) using MicVAD. | |
288 | + */ | |
289 | + async function initializeVAD(stream) { | |
290 | + try { | |
291 | + vadInstance = await vad.MicVAD.new({ | |
292 | + stream: stream, // Pass the existing MediaStream to avoid multiple microphone accesses | |
293 | + onSpeechStart: () => { | |
294 | + console.log("Speech start detected"); | |
295 | + logNotice("Speech detected..."); | |
296 | + | |
297 | + isSpeaking = true; | |
298 | + audioBuffer = []; // Reset buffer | |
299 | + | |
300 | + // Start timer to send audio every second | |
301 | + sendAudioInterval = setInterval(sendAudio, SEND_INTERVAL_MS); | |
302 | + }, | |
303 | + onSpeechEnd: (audio) => { | |
304 | + console.log("Speech end detected"); | |
305 | + logNotice("Sending final speech segment to server..."); | |
306 | + | |
307 | + isSpeaking = false; | |
308 | + | |
309 | + // Send any remaining audio | |
310 | + sendAudio(); | |
311 | + | |
312 | + // Stop the timer | |
313 | + if (sendAudioInterval) { | |
314 | + clearInterval(sendAudioInterval); | |
315 | + sendAudioInterval = null; | |
316 | + } | |
317 | + | |
318 | + // Optionally, send the final `audio` provided by the callback | |
319 | + // depending on your application's needs | |
320 | + // Example: | |
321 | + // sendFinalAudio(audio); | |
322 | + }, | |
323 | + onFrameProcessed: (probabilities, frame) => { | |
324 | + const indicatorColor = interpolateInferno(probabilities.isSpeech / 2); | |
325 | + document.body.style.setProperty("--indicator-color", indicatorColor); | |
326 | + | |
327 | + if (isSpeaking) { | |
328 | + audioBuffer.push(frame); | |
329 | + } | |
330 | + }, | |
331 | + }); | |
332 | + | |
333 | + window.vadInstance = vadInstance; | |
334 | + | |
335 | + // Start VAD listening | |
336 | + vadInstance.start(); | |
337 | + isRecording = true; | |
338 | + recordBtn.textContent = 'STOP VAD'; | |
339 | + recordBtn.classList.add('recording'); | |
340 | + logNotice("Recording started. Speak into your microphone."); | |
341 | + } catch (error) { | |
342 | + console.error('Error initializing VAD:', error); | |
343 | + logNotice("Error initializing Voice Activity Detection."); | |
344 | + } | |
345 | + } | |
346 | + | |
347 | + // Volume Meter Setup (Optional, based on your requirements) | |
348 | + | |
349 | + /** | |
350 | + * Sets up the volume meter using the Web Audio API. | |
351 | + */ | |
352 | + async function setupVolumeMeter(stream) { | |
353 | + try { | |
354 | + // Initialize AudioContext | |
355 | + audioContext = new (window.AudioContext || window.webkitAudioContext)(); | |
356 | + | |
357 | + // Create MediaStreamSource from the existing stream | |
358 | + microphoneStream = audioContext.createMediaStreamSource(stream); | |
359 | + | |
360 | + // Create AnalyserNode | |
361 | + analyser = audioContext.createAnalyser(); | |
362 | + analyser.fftSize = 512; | |
363 | + const bufferLength = analyser.frequencyBinCount; | |
364 | + dataArray = new Uint8Array(bufferLength); | |
365 | + | |
366 | + // Connect microphone to analyser | |
367 | + microphoneStream.connect(analyser); | |
368 | + | |
369 | + // Start visualizing | |
370 | + visualize(); | |
371 | + } catch (error) { | |
372 | + console.error('Error setting up volume meter:', error); | |
373 | + logNotice("Error setting up volume meter."); | |
374 | + } | |
375 | + } | |
376 | + | |
377 | + /** | |
378 | + * Visualizes the volume level on the volume meter. | |
379 | + */ | |
380 | + function visualize() { | |
381 | + const updateVolume = () => { | |
382 | + analyser.getByteFrequencyData(dataArray); | |
383 | + let sum = 0; | |
384 | + for (let i = 0; i < dataArray.length; i++) { | |
385 | + sum += dataArray[i]; | |
386 | + } | |
387 | + const average = sum / dataArray.length; | |
388 | + const volume = average / 255; // Normalize to [0,1] | |
389 | + | |
390 | + // Update the volume bar width | |
391 | + volumeBar.style.width = `${volume * 100}%`; | |
392 | + | |
393 | + // Change color based on volume level (green to red) | |
394 | + const hue = (1 - volume) * 120; // 120 (green) to 0 (red) | |
395 | + volumeBar.style.backgroundColor = `hsl(${hue}, 100%, 50%)`; | |
396 | + | |
397 | + animationId = requestAnimationFrame(updateVolume); | |
398 | + }; | |
399 | + | |
400 | + updateVolume(); | |
401 | + } | |
402 | + | |
403 | + /** | |
404 | + * Stops the volume meter visualization. | |
405 | + */ | |
406 | + function stopVolumeMeter() { | |
407 | + if (animationId) { | |
408 | + cancelAnimationFrame(animationId); | |
409 | + animationId = null; | |
410 | + } | |
411 | + if (volumeBar) { | |
412 | + volumeBar.style.width = '0%'; | |
413 | + volumeBar.style.backgroundColor = '#28a745'; // Reset to green | |
414 | + } | |
415 | + if (analyser) { | |
416 | + analyser.disconnect(); | |
417 | + analyser = null; | |
418 | + } | |
419 | + if (microphoneStream) { | |
420 | + microphoneStream.disconnect(); | |
421 | + microphoneStream = null; | |
422 | + } | |
423 | + if (audioContext) { | |
424 | + audioContext.close(); | |
425 | + audioContext = null; | |
426 | + } | |
427 | + } | |
428 | + | |
429 | + // LLM Integration | |
430 | + | |
431 | + /** | |
432 | + * Sends the transcription to the backend server for LLM processing. | |
433 | + * @param {string} transcription - The transcribed text. | |
434 | + */ | |
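 | + // Assumed backend response shape (mirrors what displayLLMResponse() below expects): | |
 | + //   { "llmResponse": { "choices": [ { "message": { "content": "..." } } ] } } | |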
435 | + async function sendTranscriptionToLLM(transcription) { | |
436 | + try { | |
437 | + const response = await fetch(BACKEND_LLM_URL, { // Adjust the URL if your server is hosted elsewhere | |
438 | + method: 'POST', | |
439 | + headers: { | |
440 | + 'Content-Type': 'application/json', | |
441 | + }, | |
442 | + body: JSON.stringify({ transcription }), | |
443 | + }); | |
444 | + | |
445 | + if (!response.ok) { | |
446 | + throw new Error(`Server error: ${response.status}`); | |
447 | + } | |
448 | + | |
449 | + const data = await response.json(); | |
450 | + if (data.llmResponse) { | |
451 | + displayLLMResponse(data.llmResponse); | |
452 | + } | |
453 | + } catch (error) { | |
454 | + console.error('Error sending transcription to LLM:', error); | |
455 | + logNotice("Error processing transcription with LLM."); | |
456 | + } | |
457 | + } | |
458 | + | |
459 | + /** | |
460 | + * Displays the LLM's response in the transcriptions div. | |
461 | + * @param {object} llmResponse - The response from the LLM. | |
462 | + */ | |
463 | + function displayLLMResponse(llmResponse) { | |
464 | + // Adjust based on your LLM's response structure | |
465 | + const responseText = llmResponse.choices && llmResponse.choices[0] && llmResponse.choices[0].message && llmResponse.choices[0].message.content | |
466 | + ? llmResponse.choices[0].message.content | |
467 | + : 'No response from LLM.'; | |
468 | + | |
469 | + const responseLine = document.createElement('div'); | |
470 | + responseLine.classList.add('transcription'); | |
471 | + | |
472 | + const span = document.createElement('span'); | |
473 | + span.textContent = `LLM Response: ${responseText}`; | |
474 | + span.style.color = `hsl(200, 100%, 50%)`; // Example color | |
475 | + | |
476 | + responseLine.appendChild(span); | |
477 | + transcriptionsDiv.prepend(responseLine); | |
478 | + transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
479 | + } | |
480 | + | |
481 | + /** | |
482 | + * Sends the accumulated audio to the server. | |
483 | + */ | |
484 | + async function sendAudioToServer(audioBuffer) { | |
485 | + try { | |
486 | + const response = await fetch(BACKEND_UPLOAD_URL, { // Replace with your backend URL | |
487 | + method: 'POST', | |
488 | + headers: { | |
489 | + 'Content-Type': 'application/octet-stream', // Adjust based on server expectations | |
490 | + }, | |
491 | + body: audioBuffer, | |
492 | + }); | |
493 | + | |
494 | + if (!response.ok) { | |
495 | + throw new Error(`Server responded with status ${response.status}`); | |
496 | + } | |
497 | + | |
498 | + console.log('Audio sent successfully'); | |
499 | + } catch (error) { | |
500 | + console.error('Error sending audio:', error); | |
501 | + logNotice("Error sending audio to server."); | |
502 | + } | |
503 | + } | |
504 | + | |
505 | + /** | |
506 | + * Adds an audio element to the playlist. | |
507 | + * @param {string} audioUrl - The data URL of the audio. | |
508 | + * @returns {HTMLElement} - The created list item element. | |
509 | + */ | |
510 | + function addAudio(audioUrl) { | |
511 | + const entry = document.createElement("li"); | |
512 | + const audio = document.createElement("audio"); | |
513 | + audio.controls = true; | |
514 | + audio.src = audioUrl; | |
515 | + entry.classList.add("newItem"); | |
516 | + entry.appendChild(audio); | |
517 | + return entry; | |
518 | + } | |
519 | + | |
520 | + // Recording Control Functions | |
521 | + | |
522 | + /** | |
523 | + * Starts the Voice Activity Detection, Volume Meter, and WebSocket connection. | |
524 | + */ | |
525 | + async function startRecording() { | |
526 | + try { | |
527 | + // Request microphone access once | |
528 | + const stream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false }); | |
529 | + | |
530 | + // Optionally, set up Volume Meter | |
531 | + // await setupVolumeMeter(stream); | |
532 | + | |
533 | + // Initialize VAD with the same stream | |
534 | + await initializeVAD(stream); | |
535 | + | |
536 | + // Set up WebSocket | |
537 | + shouldReconnect = true; // Enable reconnection attempts | |
 | + setupWebSocket(); | |
538 | + } catch (error) { | |
539 | + console.error('Error starting recording:', error); | |
540 | + logNotice("Error starting recording. Please try again."); | |
541 | + } | |
542 | + } | |
543 | + | |
544 | + /** | |
545 | + * Stops the Voice Activity Detection, Volume Meter, and cleans up resources. | |
546 | + * @param {boolean} serverInitiated - Indicates if the stop was triggered by the server. | |
547 | + */ | |
548 | + function stopRecording(serverInitiated = false) { | |
549 | + if (!isRecording) return; | |
550 | + | |
551 | + // Stop VAD | |
552 | + if (vadInstance) { | |
553 | + if (typeof vadInstance.pause === 'function') { | |
554 | + vadInstance.pause(); | |
555 | + } else { | |
556 | + console.warn('VAD instance does not have a pause method.'); | |
557 | + } | |
558 | + vadInstance = null; | |
559 | + } | |
560 | + | |
561 | + // Optionally, stop Volume Meter | |
562 | + // stopVolumeMeter(); | |
563 | + | |
564 | + // Prevent reconnection if stopping manually | |
565 | + if (!serverInitiated) { | |
566 | + shouldReconnect = false; | |
567 | + } | |
568 | + | |
569 | + // Close WebSocket if not server-initiated | |
570 | + if (!serverInitiated && ws && ws.readyState === WebSocket.OPEN) { | |
571 | + ws.send(JSON.stringify({ action: "terminate" })); | |
572 | + logNotice("Termination signal sent to server."); | |
573 | + } | |
574 | + | |
575 | + // Close WebSocket | |
576 | + if (ws) { | |
577 | + ws.close(); | |
578 | + ws = null; | |
579 | + } | |
580 | + | |
581 | + // Reset recording state | |
582 | + isRecording = false; | |
583 | + recordBtn.textContent = 'START VAD'; | |
584 | + recordBtn.classList.remove('recording'); | |
585 | + logNotice("Recording stopped."); | |
586 | + | |
587 | + // Send the full transcription to the LLM | |
588 | + if (fullTranscription.trim().length > 0) { | |
589 | + sendTranscriptionToLLM(fullTranscription.trim()); | |
590 | + fullTranscription = ''; // Reset after sending | |
591 | + } | |
592 | + } | |
593 | + | |
594 | + /** | |
595 | + * Sends the accumulated audio to the server periodically. | |
596 | + */ | |
597 | + async function sendAudio() { | |
598 | + if (audioBuffer.length === 0) return; | |
599 | + | |
600 | + // Concatenate all frames into a single Float32Array | |
601 | + const totalLength = audioBuffer.reduce((sum, frame) => sum + frame.length, 0); | |
602 | + const concatenated = new Float32Array(totalLength); | |
603 | + let offset = 0; | |
604 | + audioBuffer.forEach(frame => { | |
605 | + concatenated.set(frame, offset); | |
606 | + offset += frame.length; | |
607 | + }); | |
608 | + | |
609 | + // Encode to WAV format | |
610 | + const wavBuffer = vad.utils.encodeWAV(concatenated); | |
611 | + | |
612 | + // Send the audio to the server | |
613 | + await sendAudioToServer(wavBuffer); | |
614 | + | |
615 | + // Optionally, add the audio to the UI | |
616 | + const base64 = vad.utils.arrayBufferToBase64(wavBuffer); | |
617 | + const audioUrl = `data:audio/wav;base64,${base64}`; | |
618 | + const audioElement = addAudio(audioUrl); | |
619 | + transcriptionsDiv.prepend(audioElement); | |
620 | + | |
621 | + // Reset the buffer | |
622 | + audioBuffer = []; | |
623 | + } | |
624 | + | |
625 | + // Button Event Listener | |
626 | + | |
627 | + /** | |
628 | + * Toggles recording state when the record button is clicked. | |
629 | + */ | |
630 | + window.toggleVAD = () => { | |
631 | + console.log("ran toggle vad"); | |
632 | + if (!isRecording) { | |
633 | + startRecording().catch(error => { | |
634 | + console.error('Error starting recording:', error); | |
635 | + logNotice("Error starting recording. Please try again."); | |
636 | + }); | |
637 | + } else { | |
638 | + stopRecording(); | |
639 | + } | |
640 | + }; | |
641 | + </script> | |
642 | +</body> | |
643 | +</html> |
+++ get_microphone.py
... | ... | @@ -0,0 +1,96 @@ |
1 | +import pyaudio | |
2 | +import wave | |
3 | + | |
4 | +pa = pyaudio.PyAudio() | |
5 | + | |
6 | +def get_microphone(): | |
7 | + """ | |
8 | + Creates a CLI prompt that lists every available microphone, waits for the user's choice, and returns the values needed to open that microphone with PyAudio. | |
9 | + Returns a dict so callers can look the values up by name rather than by position. | |
10 | + :return: { | |
11 | + "device_num": device_num, | |
12 | + "microphone_channel_num": microphone_channel_num, | |
13 | + "microphone_sample_rate": microphone_sample_rate, | |
14 | + } | |
15 | + """ | |
16 | + | |
17 | + def get_valid_integer(prompt, min_value, max_value): | |
18 | + """ | |
19 | + Prompt the user for an integer input within a specified range. | |
20 | + Sanitizes non-integer input and values outside the range. | |
21 | + | |
22 | + :param prompt: The input prompt message | |
23 | + :param min_value: Minimum acceptable integer value (inclusive) | |
24 | + :param max_value: Maximum acceptable integer value (inclusive) | |
25 | + :return: A valid integer within the range [min_value, max_value] | |
26 | + """ | |
27 | + while True: | |
28 | + try: | |
29 | + user_input = input(prompt) | |
30 | + | |
31 | + value = int(user_input) | |
32 | + | |
33 | + if min_value <= value <= max_value: | |
34 | + return value | |
35 | + else: | |
36 | + print(f"Error: Please enter an integer between {min_value} and {max_value}.") | |
37 | + except ValueError: | |
38 | + print("Error: Invalid input. Please enter an integer.") | |
39 | + | |
40 | + # List all devices to see their indexes | |
41 | + audio_list = [] | |
42 | + for i in range(pa.get_device_count()): | |
43 | + dev = pa.get_device_info_by_index(i) | |
44 | + print( | |
45 | + i, | |
46 | + dev['name'], | |
47 | + dev['maxInputChannels'], | |
48 | + dev['defaultSampleRate'] | |
49 | + ) | |
50 | + audio_list.append(dev) | |
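 | + # Each printed row looks like (hypothetical device): 1 MacBook Pro Microphone 1 48000.0 | |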
51 | + | |
52 | + mesg = "Which device is the microphone you are using? " | |
53 | + device_num = get_valid_integer(mesg, 0, len(audio_list) - 1) | |
54 | + | |
55 | + microphone_channel_num = audio_list[device_num]['maxInputChannels'] | |
56 | + # must pass int type, pyaudio only accepts int type | |
57 | + microphone_sample_rate = int(audio_list[device_num]['defaultSampleRate']) | |
58 | + try: | |
59 | + stream = pa.open( | |
60 | + format=pyaudio.paInt16, | |
61 | + channels=microphone_channel_num, | |
62 | + rate=microphone_sample_rate, | |
63 | + input=True, | |
64 | + frames_per_buffer=1024, | |
65 | + input_device_index=device_num, | |
66 | + ) | |
67 | + # Save to a valid WAV file | |
68 | + | |
69 | + frames = [] | |
70 | + for _ in range(0, int(microphone_sample_rate / 1024 * 1)): # Record for about 1 second | |
71 | + data = stream.read(1024) | |
72 | + frames.append(data) | |
73 | + print("Recording finished.") | |
74 | + | |
75 | + output_filename = "test.wav" | |
76 | + with wave.open(output_filename, "wb") as wf: | |
77 | + wf.setnchannels(microphone_channel_num) | |
78 | + wf.setsampwidth(pa.get_sample_size(pyaudio.paInt16)) | |
79 | + wf.setframerate(microphone_sample_rate) | |
80 | + wf.writeframes(b"".join(frames)) | |
81 | + print("Recorded some data:", len(data)) | |
82 | + print("Looks Good To Me.") | |
83 | + stream.close() | |
84 | + pa.terminate() | |
85 | + except Exception as e: | |
86 | + print("Something went wrong. Cannot open the audio device.") | |
87 | + print(e) | |
88 | + | |
89 | + return { | |
90 | + "device_num": device_num, | |
91 | + "microphone_channel_num": microphone_channel_num, | |
92 | + "microphone_sample_rate": microphone_sample_rate, | |
93 | + } | |
94 | + | |
95 | +if __name__ == "__main__": | |
96 | +print(get_microphone()) (No newline at end of file) |
+++ websocket_client.py
... | ... | @@ -0,0 +1,295 @@ |
1 | +import threading | |
2 | +import subprocess | |
3 | +import sys | |
4 | +import platform | |
5 | +import websocket | |
6 | +import json | |
7 | +import shlex | |
8 | + | |
9 | +from tkinter import Tk, Button, Label, Text, Scrollbar, END | |
10 | + | |
11 | +# Import the microphone selection helper | |
12 | +from get_microphone import get_microphone | |
13 | + | |
14 | +# 1) Server configuration | |
15 | +SERVER_URL = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko" | |
16 | + | |
17 | +# 2) Audio configuration | |
18 | +TARGET_RATE = 16000 # Resample to 16 kHz for the server | |
19 | +CHANNELS = 1 # Mono | |
20 | +FORMAT = 's16le' # 16-bit PCM little endian | |
21 | + | |
22 | + | |
23 | +# 3) FFmpeg configuration | |
24 | +def get_ffmpeg_command(device_info): | |
25 | + """ | |
26 | + Constructs the FFmpeg command based on the operating system and selected device. | |
27 | + | |
28 | + :param device_info: Dictionary containing device information from get_microphone() | |
29 | + :return: List of FFmpeg command arguments | |
30 | + """ | |
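 | + # Example result on Windows (hypothetical device name): | |
 | + #   ffmpeg -f dshow -i "audio=Microphone (Realtek High Definition Audio)" -ar 16000 -ac 1 -f s16le pipe:1 | |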
31 | + os_name = platform.system() | |
32 | + | |
33 | + if os_name == "Windows": | |
34 | + # For Windows, FFmpeg uses 'dshow' as the input device. | |
35 | + # device_info should contain the 'name' of the device as recognized by FFmpeg. | |
36 | + device_name = device_info.get("name", "default") | |
37 | + # Example device name: "Microphone (Realtek High Definition Audio)" | |
38 | + cmd = [ | |
39 | + "ffmpeg", | |
40 | + "-f", "dshow", | |
41 | + "-i", f"audio={device_name}", | |
42 | + "-ar", str(TARGET_RATE), | |
43 | + "-ac", str(CHANNELS), | |
44 | + "-f", FORMAT, | |
45 | + "pipe:1" | |
46 | + ] | |
47 | + elif os_name == "Darwin": | |
48 | + # For macOS, FFmpeg uses 'avfoundation'. | |
49 | + # device_info should contain the 'device_index' for audio. | |
50 | + device_index = device_info.get("device_index", "0") | |
51 | + # Example device index: "0" for default | |
52 | + cmd = [ | |
53 | + "ffmpeg", | |
54 | + "-f", "avfoundation", | |
55 | + "-i", f":{device_index}", | |
56 | + "-ar", str(TARGET_RATE), | |
57 | + "-ac", str(CHANNELS), | |
58 | + "-f", FORMAT, | |
59 | + "pipe:1" | |
60 | + ] | |
61 | + elif os_name == "Linux": | |
62 | + # For Linux, FFmpeg uses 'alsa'. | |
63 | + # device_info should contain the 'device_name' as recognized by FFmpeg. | |
64 | + device_name = device_info.get("name", "default") | |
65 | + # Example device name: "default" or "hw:1,0" | |
66 | + cmd = [ | |
67 | + "ffmpeg", | |
68 | + "-f", "alsa", | |
69 | + "-i", device_name, | |
70 | + "-ar", str(TARGET_RATE), | |
71 | + "-ac", str(CHANNELS), | |
72 | + "-f", FORMAT, | |
73 | + "pipe:1" | |
74 | + ] | |
75 | + else: | |
76 | + raise ValueError(f"Unsupported OS: {os_name}") | |
77 | + | |
78 | + return cmd | |
79 | + | |
80 | + | |
81 | +class SpeechToTextClient: | |
82 | + """ | |
83 | + A client that: | |
84 | + - Uses FFmpeg to capture and process audio | |
85 | + - Initializes a WebSocket connection | |
86 | + - Streams raw 16-bit PCM over the WebSocket | |
87 | + - Displays transcriptions from the server in the GUI | |
88 | + """ | |
89 | + | |
90 | + def __init__(self, gui): | |
91 | + """ | |
92 | + :param gui: An instance of the SpeechToTextGUI class for UI callbacks | |
93 | + """ | |
94 | + self.gui = gui | |
95 | + self.ws = None | |
96 | + self.ffmpeg_process = None | |
97 | + self.streaming_thread = None | |
98 | + self.running = False | |
99 | + | |
100 | + # Ask the user to pick a device | |
101 | + mic_info = get_microphone()  # Dict with device_num, microphone_channel_num, microphone_sample_rate | |
 | + # Note: get_ffmpeg_command() additionally looks for 'name' (dshow/alsa) or 'device_index' | |
 | + # (avfoundation); add those keys here if the FFmpeg defaults do not match your device. | |
102 | + self.device_info = mic_info | |
103 | + | |
104 | + # Prepare the FFmpeg command | |
105 | + self.ffmpeg_cmd = get_ffmpeg_command(self.device_info) | |
106 | + | |
107 | + def start_recording(self): | |
108 | + """Starts FFmpeg, initializes the WebSocket connection, and begins streaming audio.""" | |
109 | + if self.running: | |
110 | + print("Already recording.") | |
111 | + return | |
112 | + | |
113 | + self.running = True | |
114 | + | |
115 | + # 1) Start FFmpeg subprocess | |
116 | + try: | |
117 | + self.ffmpeg_process = subprocess.Popen( | |
118 | + self.ffmpeg_cmd, | |
119 | + stdout=subprocess.PIPE, | |
120 | + stderr=subprocess.DEVNULL, # Suppress FFmpeg stderr; remove if debugging | |
121 | + bufsize=10 ** 8 | |
122 | + ) | |
123 | + print("FFmpeg started.") | |
124 | + except Exception as e: | |
125 | + print(f"Failed to start FFmpeg: {e}") | |
126 | + self.running = False | |
127 | + return | |
128 | + | |
129 | + # 2) Initialize the WebSocket connection | |
130 | + self.ws = websocket.WebSocketApp( | |
131 | + SERVER_URL, | |
132 | + on_message=self.on_message, | |
133 | + on_error=self.on_error, | |
134 | + on_close=self.on_close | |
135 | + ) | |
136 | + # Run WebSocket in a background thread | |
137 | + ws_thread = threading.Thread(target=self.ws.run_forever, daemon=True) | |
138 | + ws_thread.start() | |
139 | + print("WebSocket connection initiated.") | |
140 | + | |
141 | + # 3) Start audio streaming loop in a separate thread | |
142 | + self.streaming_thread = threading.Thread(target=self.audio_stream, daemon=True) | |
143 | + self.streaming_thread.start() | |
144 | + | |
145 | + self.gui.update_status("Recording started...") | |
146 | + | |
147 | + def stop_recording(self): | |
148 | + """Stops audio streaming, terminates FFmpeg, and closes the WebSocket.""" | |
149 | + if not self.running: | |
150 | + print("Not currently recording.") | |
151 | + return | |
152 | + | |
153 | + self.running = False | |
154 | + | |
155 | + # 1) Terminate FFmpeg subprocess | |
156 | + if self.ffmpeg_process: | |
157 | + self.ffmpeg_process.terminate() | |
158 | + self.ffmpeg_process = None | |
159 | + print("FFmpeg terminated.") | |
160 | + | |
161 | + # 2) Close WebSocket connection | |
162 | + if self.ws: | |
163 | + self.ws.close() | |
164 | + self.ws = None | |
165 | + print("WebSocket connection closed.") | |
166 | + | |
167 | + self.gui.update_status("Recording stopped...") | |
168 | + | |
169 | + def audio_stream(self): | |
170 | + """ | |
171 | + Continuously reads audio data from FFmpeg's stdout and sends it over WebSocket. | |
172 | + """ | |
173 | + try: | |
174 | + while self.running: | |
175 | + # Read a chunk of data | |
176 | + data = self.ffmpeg_process.stdout.read(4096) # Adjust chunk size as needed | |
177 | + if not data: | |
178 | + print("No more data from FFmpeg.") | |
179 | + break | |
180 | + | |
181 | + # Send audio frames over WebSocket (binary) | |
182 | + if self.ws and self.ws.sock and self.ws.sock.connected: | |
183 | + try: | |
184 | + self.ws.send(data, opcode=websocket.ABNF.OPCODE_BINARY) | |
185 | + except Exception as e: | |
186 | + print(f"Error sending data over WebSocket: {e}") | |
187 | + break | |
188 | + else: | |
189 | + print("WebSocket is not connected.") | |
190 | + break | |
191 | + | |
192 | + except Exception as e: | |
193 | + print(f"Error during audio streaming: {e}") | |
194 | + finally: | |
195 | + # stop_recording() resets self.running and tears down FFmpeg and the WebSocket | |
196 | + self.stop_recording() | |
197 | + | |
198 | + # --------------------- | |
199 | + # WebSocket Callbacks | |
200 | + # --------------------- | |
201 | + def on_message(self, ws, message): | |
202 | + """Handle transcriptions (or other messages) from the server.""" | |
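 | + # Example message this handler expects (assumed shape): {"text": "transcribed sentence"} | |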
203 | + print("Received from server:", message) | |
204 | + try: | |
205 | + data = json.loads(message) | |
206 | + transcription = data.get("text", "") | |
207 | + if transcription: | |
208 | + self.gui.display_transcription(transcription) | |
209 | + except json.JSONDecodeError: | |
210 | + print("Error: Received invalid JSON:", message) | |
211 | + | |
212 | + def on_error(self, ws, error): | |
213 | + """Handle any WebSocket errors.""" | |
214 | + print("WebSocket Error:", error) | |
215 | + | |
216 | + def on_close(self, ws, close_status_code, close_msg): | |
217 | + """Called when the WebSocket connection is closed.""" | |
218 | + print("WebSocket Closed") | |
219 | + | |
220 | + | |
221 | +class SpeechToTextGUI: | |
222 | + """ | |
223 | + The GUI class for user interaction: | |
224 | + - Start/Stop buttons | |
225 | + - Status updates | |
226 | + - Displays transcriptions | |
227 | + - Ties everything together with SpeechToTextClient | |
228 | + """ | |
229 | + | |
230 | + def __init__(self): | |
231 | + self.client = SpeechToTextClient(self) | |
232 | + | |
233 | + # Main window setup | |
234 | + self.root = Tk() | |
235 | + self.root.title("Speech-to-Text Client") | |
236 | + | |
237 | + # Status label | |
238 | + self.status_label = Label(self.root, text="Click 'Start Recording' to begin.", anchor="w") | |
239 | + self.status_label.pack(fill="x", padx=10, pady=5) | |
240 | + | |
241 | + # Text area for transcriptions | |
242 | + self.text_display = Text(self.root, wrap="word", height=20) | |
243 | + self.text_display.pack(fill="both", expand=True, padx=10, pady=5) | |
244 | + | |
245 | + # Scrollbar for transcription area | |
246 | + scrollbar = Scrollbar(self.text_display) | |
247 | + scrollbar.pack(side="right", fill="y") | |
248 | + self.text_display.config(yscrollcommand=scrollbar.set) | |
249 | + scrollbar.config(command=self.text_display.yview) | |
250 | + | |
251 | + # Start/Stop Buttons | |
252 | + start_button = Button( | |
253 | + self.root, | |
254 | + text="Start Recording", | |
255 | + command=self.client.start_recording, | |
256 | + bg="green", | |
257 | + fg="white" | |
258 | + ) | |
259 | + start_button.pack(side="left", padx=10, pady=10) | |
260 | + | |
261 | + stop_button = Button( | |
262 | + self.root, | |
263 | + text="Stop Recording", | |
264 | + command=self.client.stop_recording, | |
265 | + bg="red", | |
266 | + fg="white" | |
267 | + ) | |
268 | + stop_button.pack(side="right", padx=10, pady=10) | |
269 | + | |
270 | + # Handle window close event to ensure subprocesses are terminated | |
271 | + self.root.protocol("WM_DELETE_WINDOW", self.on_close) | |
272 | + | |
273 | + def update_status(self, message): | |
274 | + """Updates the status label.""" | |
275 | + self.status_label.config(text=message) | |
276 | + | |
277 | + def display_transcription(self, transcription): | |
278 | + """Appends transcriptions to the text box and scrolls to the end.""" | |
279 | + if transcription: | |
280 | + self.text_display.insert(END, transcription + "\n") | |
281 | + self.text_display.see(END) # Auto-scroll | |
282 | + | |
283 | + def on_close(self): | |
284 | + """Handle the window close event.""" | |
285 | + self.client.stop_recording() | |
286 | + self.root.destroy() | |
287 | + | |
288 | + def run(self): | |
289 | + """Start the Tkinter event loop.""" | |
290 | + self.root.mainloop() | |
291 | + | |
292 | + | |
293 | +if __name__ == "__main__": | |
294 | + gui = SpeechToTextGUI() | |
295 | + gui.run() |