
+++ README.md
... | ... | @@ -0,0 +1,1 @@ |
1 | +# whisper_client |
+++ client.html
... | ... | @@ -0,0 +1,483 @@ |
1 | +<!DOCTYPE html> | |
2 | +<html lang="en"> | |
3 | +<head> | |
4 | + <meta charset="UTF-8"> | |
5 | + <title>Real-Time Audio Transcription with VAD and Volume Meter</title> | |
6 | + <style> | |
7 | + body { | |
8 | + font-family: Arial, sans-serif; | |
9 | + margin: 40px; | |
10 | + background-color: #f5f5f5; | |
11 | + } | |
12 | + h1 { | |
13 | + text-align: center; | |
14 | + } | |
15 | + #controls { | |
16 | + text-align: center; | |
17 | + margin-bottom: 20px; | |
18 | + } | |
19 | + #recordBtn { | |
20 | + padding: 15px 30px; | |
21 | + font-size: 18px; | |
22 | + border: none; | |
23 | + border-radius: 5px; | |
24 | + background-color: #28a745; /* Green */ | |
25 | + color: white; | |
26 | + cursor: pointer; | |
27 | + transition: background-color 0.3s ease; | |
28 | + } | |
29 | + #recordBtn.recording { | |
30 | + background-color: #dc3545; /* Red */ | |
31 | + } | |
32 | + #transcriptions { | |
33 | + max-width: 800px; | |
34 | + margin: 0 auto; | |
35 | + padding: 20px; | |
36 | + background-color: white; | |
37 | + border-radius: 8px; | |
38 | + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); | |
39 | + height: 600px; | |
40 | + overflow-y: auto; | |
41 | + white-space: pre-wrap; | |
42 | + font-size: 16px; | |
43 | + } | |
44 | + .transcription { | |
45 | + margin-bottom: 10px; | |
46 | + } | |
47 | + .notice { | |
48 | + color: #dc3545; /* Red */ | |
49 | + font-style: italic; | |
50 | + } | |
51 | + /* Volume Meter Styles */ | |
52 | + #volumeMeter { | |
53 | + width: 300px; | |
54 | + height: 30px; | |
55 | + background-color: #e0e0e0; | |
56 | + border-radius: 15px; | |
57 | + overflow: hidden; | |
58 | + margin: 20px auto; | |
59 | + position: relative; | |
60 | + } | |
61 | + #volumeBar { | |
62 | + height: 100%; | |
63 | + width: 0%; | |
64 | + background-color: #28a745; | |
65 | + transition: width 0.1s ease, background-color 0.1s ease; | |
66 | + } | |
67 | + </style> | |
68 | +</head> | |
69 | +<body> | |
70 | + <h1>Real-Time Audio Transcription with VAD and Volume Meter</h1> | |
71 | + <div id="controls"> | |
72 | + <button id="recordBtn">Start Recording</button> | |
73 | + </div> | |
74 | + <!-- Volume Meter --> | |
75 | + <div id="volumeMeter"> | |
76 | + <div id="volumeBar"></div> | |
77 | + </div> | |
78 | + <div id="transcriptions"></div> | |
79 | + | |
80 | + <!-- Include ONNX Runtime Web --> | |
81 | + <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js"></script> | |
82 | + <!-- Include VAD-Web --> | |
83 | + <script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js"></script> | |
84 | + | |
85 | + <script> | |
86 | + // Elements | |
87 | + const recordBtn = document.getElementById('recordBtn'); | |
88 | + const transcriptionsDiv = document.getElementById('transcriptions'); | |
89 | + const volumeBar = document.getElementById('volumeBar'); | |
90 | + | |
91 | + // State Variables | |
92 | + let isRecording = false; | |
93 | + let vadInstance = null; // Renamed to avoid conflict | |
94 | + let ws = null; | |
95 | + let audioContext = null; | |
96 | + let analyser = null; | |
97 | + let microphoneStream = null; | |
98 | + let dataArray = null; | |
99 | + let animationId = null; | |
100 | + let reconnectInterval = 3000; // 3 seconds | |
101 | + let shouldReconnect = false; // Flag to control reconnection | |
102 | + | |
103 | + // Configuration | |
104 | + const WS_ENDPOINT = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko"; // Ensure this is correct | |
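 | + // Assumed wire protocol, inferred from the handlers below: the client streams binary 16-bit PCM | |
 | + // speech segments, and the server replies with JSON objects shaped like | |
 | + // {"task": "transcribe", "words": [{"word": "...", "probability": 0.9}]}. | |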
105 | + | |
106 | + // Buffer to hold incoming data for JSON parsing | |
107 | + let incomingBuffer = ''; | |
108 | + | |
109 | + // Utility Functions | |
110 | + | |
111 | + /** | |
112 | + * Logs transcription text with colored words based on probability. | |
113 | + * @param {Array} words - Array of word objects with 'word' and 'probability'. | |
114 | + */ | |
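 | + // Example input (hypothetical words from the server): | |
 | + //   [{ word: "hello", probability: 0.93 }, { word: "world", probability: 0.41 }] | |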
115 | + function logTranscription(words) { | |
116 | + const transcriptionLine = document.createElement('div'); | |
117 | + transcriptionLine.classList.add('transcription'); | |
118 | + | |
119 | + words.forEach(wordObj => { | |
120 | + const span = document.createElement('span'); | |
121 | + span.textContent = wordObj.word + ' '; // Add space after each word | |
122 | + | |
123 | + // Calculate hue: 0 (red) to 240 (blue) | |
124 | + const hue = wordObj.probability * 240; | |
125 | + span.style.color = `hsl(${hue}, 100%, 50%)`; | |
126 | + | |
127 | + transcriptionLine.appendChild(span); | |
128 | + }); | |
129 | + | |
130 | + transcriptionsDiv.appendChild(transcriptionLine); | |
131 | + transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
132 | + } | |
133 | + | |
134 | + /** | |
135 | + * Logs notice messages (e.g., connection status, errors). | |
136 | + * @param {string} text - The notice text to display. | |
137 | + */ | |
138 | + function logNotice(text) { | |
139 | + const p = document.createElement('p'); | |
140 | + p.classList.add('notice'); | |
141 | + p.textContent = text; | |
142 | + transcriptionsDiv.appendChild(p); | |
143 | + transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
144 | + } | |
145 | + | |
146 | + /** | |
147 | + * Converts Float32 audio data to Int16 PCM format. | |
148 | + * @param {Float32Array} buffer - The audio buffer in Float32 format. | |
149 | + * @returns {Int16Array} - The audio buffer in Int16 format. | |
150 | + */ | |
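 | + // Example: convertFloat32ToInt16(Float32Array.of(0, 0.5, 1)) -> Int16Array [0, 16383, 32767] | |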
151 | + function convertFloat32ToInt16(buffer) { | |
152 | + let l = buffer.length; | |
153 | + const buf = new Int16Array(l); | |
154 | + while (l--) { | |
155 | + buf[l] = Math.max(-1, Math.min(1, buffer[l])) * 0x7FFF; // Clamp to [-1, 1] before scaling | |
156 | + } | |
157 | + return buf; | |
158 | + } | |
159 | + | |
160 | + /** | |
161 | + * Extracts JSON objects from a concatenated string. | |
162 | + * @param {string} buffer - The concatenated JSON string. | |
163 | + * @returns {Array} - An array of parsed JSON objects. | |
164 | + */ | |
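 | + // Example: extractJSONObjects('{"task":"transcribe","words":[]}{"task"') parses the first object | |
 | + // and leaves the unfinished fragment '{"task"' in incomingBuffer for the next message. | |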
165 | + function extractJSONObjects(buffer) { | |
166 | + const objects = []; | |
167 | + let braceStack = 0; | |
168 | + let inString = false; | |
169 | + let escape = false; | |
170 | + let lastSplit = 0; | |
171 | + | |
172 | + for (let i = 0; i < buffer.length; i++) { | |
173 | + const char = buffer[i]; | |
174 | + | |
175 | + if (char === '"' && !escape) { | |
176 | + inString = !inString; | |
177 | + } | |
178 | + | |
179 | + if (!inString) { | |
180 | + if (char === '{') { | |
181 | + braceStack++; | |
182 | + } else if (char === '}') { | |
183 | + braceStack--; | |
184 | + if (braceStack === 0) { | |
185 | + const jsonString = buffer.slice(lastSplit, i + 1); | |
186 | + try { | |
187 | + const jsonObj = JSON.parse(jsonString); | |
188 | + objects.push(jsonObj); | |
189 | + } catch (e) { | |
190 | + console.error('Failed to parse JSON:', e); | |
191 | + } | |
192 | + lastSplit = i + 1; | |
193 | + } | |
194 | + } | |
195 | + } | |
196 | + | |
197 | + // Handle escape characters | |
198 | + if (char === '\\' && !escape) { | |
199 | + escape = true; | |
200 | + } else { | |
201 | + escape = false; | |
202 | + } | |
203 | + } | |
204 | + | |
205 | + // Return any remaining buffer that wasn't parsed | |
206 | + incomingBuffer = buffer.slice(lastSplit); | |
207 | + return objects; | |
208 | + } | |
209 | + | |
210 | + // WebSocket Handlers | |
211 | + | |
212 | + /** | |
213 | + * Sets up the WebSocket connection and defines event handlers. | |
214 | + */ | |
215 | + function setupWebSocket() { | |
216 | + ws = new WebSocket(WS_ENDPOINT); | |
217 | + ws.binaryType = 'arraybuffer'; | |
218 | + | |
219 | + ws.onopen = () => { | |
220 | + console.log('WebSocket connection opened.'); | |
221 | + logNotice("WebSocket connection established."); | |
222 | + }; | |
223 | + | |
224 | + ws.onmessage = (event) => { | |
225 | + let messageData = ''; | |
226 | + | |
227 | + if (typeof event.data === 'string') { | |
228 | + messageData = event.data; | |
229 | + } else if (event.data instanceof ArrayBuffer) { | |
230 | + const decoder = new TextDecoder('utf-8'); | |
231 | + messageData = decoder.decode(event.data); | |
232 | + } else { | |
233 | + console.warn('Unsupported message format:', event.data); | |
234 | + return; | |
235 | + } | |
236 | + | |
237 | + // Append incoming data to buffer | |
238 | + incomingBuffer += messageData; | |
239 | + | |
240 | + // Extract JSON objects | |
241 | + const jsonObjects = extractJSONObjects(incomingBuffer); | |
242 | + | |
243 | + // Process each JSON object | |
244 | + jsonObjects.forEach(obj => { | |
245 | + if (obj.task === "transcribe" && Array.isArray(obj.words)) { | |
246 | + logTranscription(obj.words); | |
247 | + } | |
248 | + }); | |
249 | + }; | |
250 | + | |
251 | + ws.onclose = (event) => { | |
252 | + console.log('WebSocket connection closed:', event); | |
253 | + logNotice("WebSocket connection closed."); | |
254 | + ws = null; | |
255 | + | |
256 | + if (isRecording && shouldReconnect) { | |
257 | + logNotice("Attempting to reconnect..."); | |
258 | + setTimeout(() => { | |
259 | + setupWebSocket(); | |
260 | + }, reconnectInterval); | |
261 | + } else if (isRecording) { | |
262 | + logNotice("Transcription session ended."); | |
263 | + stopRecording(true); // true indicates server-initiated stop | |
264 | + } | |
265 | + }; | |
266 | + | |
267 | + ws.onerror = (error) => { | |
268 | + console.error('WebSocket error:', error); | |
269 | + logNotice("WebSocket encountered an error."); | |
270 | + }; | |
271 | + } | |
272 | + | |
273 | + // Voice Activity Detection Setup | |
274 | + | |
275 | + /** | |
276 | + * Initializes the Voice Activity Detector (VAD) using Silero VAD. | |
277 | + */ | |
278 | + async function initializeVAD(stream) { | |
279 | + try { | |
280 | + vadInstance = await vad.MicVAD.new({ | |
281 | + stream: stream, // Pass the existing MediaStream to avoid multiple microphone accesses | |
282 | + onSpeechStart: () => { | |
283 | + console.log("Speech start detected"); | |
284 | + logNotice("Speech detected..."); | |
285 | + }, | |
286 | + onSpeechEnd: (audio) => { | |
287 | + console.log("Speech end detected"); | |
288 | + logNotice("Sending speech segment to server..."); | |
289 | + | |
290 | + // Convert Float32Array to Int16Array | |
291 | + const int16Audio = convertFloat32ToInt16(audio); | |
292 | + | |
293 | + // Send the audio buffer via WebSocket | |
294 | + if (ws && ws.readyState === WebSocket.OPEN) { | |
295 | + ws.send(int16Audio.buffer); | |
296 | + } else { | |
297 | + console.warn('WebSocket is not open. Cannot send audio.'); | |
298 | + logNotice("WebSocket is not open. Audio segment not sent."); | |
299 | + } | |
300 | + } | |
301 | + }); | |
302 | + } catch (error) { | |
303 | + console.error('Error initializing VAD:', error); | |
304 | + logNotice("Error initializing Voice Activity Detection."); | |
305 | + } | |
306 | + } | |
307 | + | |
308 | + // Volume Meter Setup | |
309 | + | |
310 | + /** | |
311 | + * Sets up the volume meter using the Web Audio API. | |
312 | + */ | |
313 | + async function setupVolumeMeter(stream) { | |
314 | + try { | |
315 | + // Initialize AudioContext | |
316 | + audioContext = new (window.AudioContext || window.webkitAudioContext)(); | |
317 | + | |
318 | + // Create MediaStreamSource from the existing stream | |
319 | + microphoneStream = audioContext.createMediaStreamSource(stream); | |
320 | + | |
321 | + // Create AnalyserNode | |
322 | + analyser = audioContext.createAnalyser(); | |
323 | + analyser.fftSize = 512; | |
324 | + const bufferLength = analyser.frequencyBinCount; | |
325 | + dataArray = new Uint8Array(bufferLength); | |
326 | + | |
327 | + // Connect microphone to analyser | |
328 | + microphoneStream.connect(analyser); | |
329 | + | |
330 | + // Start visualizing | |
331 | + visualize(); | |
332 | + } catch (error) { | |
333 | + console.error('Error setting up volume meter:', error); | |
334 | + logNotice("Error setting up volume meter."); | |
335 | + } | |
336 | + } | |
337 | + | |
338 | + /** | |
339 | + * Visualizes the volume level on the volume meter. | |
340 | + */ | |
341 | + function visualize() { | |
342 | + const updateVolume = () => { | |
343 | + analyser.getByteFrequencyData(dataArray); | |
344 | + let sum = 0; | |
345 | + for (let i = 0; i < dataArray.length; i++) { | |
346 | + sum += dataArray[i]; | |
347 | + } | |
348 | + const average = sum / dataArray.length; | |
349 | + const volume = average / 255; // Normalize to [0,1] | |
350 | + | |
351 | + // Update the volume bar width | |
352 | + volumeBar.style.width = `${volume * 100}%`; | |
353 | + | |
354 | + // Change color based on volume level (green to red) | |
355 | + const hue = (1 - volume) * 120; // 120 (green) to 0 (red) | |
356 | + volumeBar.style.backgroundColor = `hsl(${hue}, 100%, 50%)`; | |
357 | + | |
358 | + animationId = requestAnimationFrame(updateVolume); | |
359 | + }; | |
360 | + | |
361 | + updateVolume(); | |
362 | + } | |
363 | + | |
364 | + /** | |
365 | + * Stops the volume meter visualization. | |
366 | + */ | |
367 | + function stopVolumeMeter() { | |
368 | + if (animationId) { | |
369 | + cancelAnimationFrame(animationId); | |
370 | + animationId = null; | |
371 | + } | |
372 | + if (volumeBar) { | |
373 | + volumeBar.style.width = '0%'; | |
374 | + volumeBar.style.backgroundColor = '#28a745'; // Reset to green | |
375 | + } | |
376 | + if (analyser) { | |
377 | + analyser.disconnect(); | |
378 | + analyser = null; | |
379 | + } | |
380 | + if (microphoneStream) { | |
381 | + microphoneStream.disconnect(); | |
382 | + microphoneStream = null; | |
383 | + } | |
384 | + if (audioContext) { | |
385 | + audioContext.close(); | |
386 | + audioContext = null; | |
387 | + } | |
388 | + } | |
389 | + | |
390 | + // Recording Control Functions | |
391 | + | |
392 | + /** | |
393 | + * Starts the Voice Activity Detection, Volume Meter, and WebSocket connection. | |
394 | + */ | |
395 | + async function startRecording() { | |
396 | + try { | |
397 | + // Request microphone access once | |
398 | + const stream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false }); | |
399 | + | |
400 | + // Set up Volume Meter | |
401 | + await setupVolumeMeter(stream); | |
402 | + | |
403 | + // Initialize VAD with the same stream | |
404 | + await initializeVAD(stream); | |
405 | + | |
406 | + // Set up WebSocket | |
407 | + shouldReconnect = true; // Enable reconnection attempts | |
408 | + setupWebSocket(); | |
409 | + | |
410 | + // Start VAD | |
411 | + if (vadInstance) { | |
412 | + vadInstance.start(); | |
413 | + } | |
414 | + | |
415 | + // Update UI | |
416 | + isRecording = true; | |
417 | + recordBtn.textContent = 'Stop Recording'; | |
418 | + recordBtn.classList.add('recording'); | |
419 | + logNotice("Recording started. Speak into your microphone."); | |
420 | + } catch (error) { | |
421 | + console.error('Error starting recording:', error); | |
422 | + logNotice("Error starting recording. Please try again."); | |
423 | + } | |
424 | + } | |
425 | + | |
426 | + /** | |
427 | + * Stops the Voice Activity Detection, Volume Meter, and cleans up resources. | |
428 | + * @param {boolean} serverInitiated - Indicates if the stop was triggered by the server. | |
429 | + */ | |
430 | + function stopRecording(serverInitiated = false) { | |
431 | + if (!isRecording) return; | |
432 | + | |
433 | + // Stop VAD | |
434 | + if (vadInstance) { | |
435 | + vadInstance.pause(); | |
436 | + vadInstance = null; | |
437 | + } | |
438 | + | |
439 | + // Stop Volume Meter | |
440 | + stopVolumeMeter(); | |
441 | + | |
442 | + // Prevent reconnection if stopping manually | |
443 | + if (!serverInitiated) { | |
444 | + shouldReconnect = false; | |
445 | + } | |
446 | + | |
447 | + // Close WebSocket if not server-initiated | |
448 | + if (!serverInitiated && ws && ws.readyState === WebSocket.OPEN) { | |
449 | + ws.send(JSON.stringify({ action: "terminate" })); | |
450 | + logNotice("Termination signal sent to server."); | |
451 | + } | |
452 | + | |
453 | + // Close WebSocket | |
454 | + if (ws) { | |
455 | + ws.close(); | |
456 | + ws = null; | |
457 | + } | |
458 | + | |
459 | + // Reset recording state | |
460 | + isRecording = false; | |
461 | + recordBtn.textContent = 'Start Recording'; | |
462 | + recordBtn.classList.remove('recording'); | |
463 | + logNotice("Recording stopped."); | |
464 | + } | |
465 | + | |
466 | + // Button Event Listener | |
467 | + | |
468 | + /** | |
469 | + * Toggles recording state when the record button is clicked. | |
470 | + */ | |
471 | + recordBtn.addEventListener('click', () => { | |
472 | + if (!isRecording) { | |
473 | + startRecording().catch(error => { | |
474 | + console.error('Error starting recording:', error); | |
475 | + logNotice("Error starting recording. Please try again."); | |
476 | + }); | |
477 | + } else { | |
478 | + stopRecording(); | |
479 | + } | |
480 | + }); | |
481 | + </script> | |
482 | +</body> | |
483 | +</html> |
+++ client_with_openAI.html
... | ... | @@ -0,0 +1,643 @@ |
1 | +<!DOCTYPE html> | |
2 | +<html lang="en"> | |
3 | +<head> | |
4 | + <meta charset="UTF-8"> | |
5 | + <title>Real-Time Audio Transcription with VAD and Volume Meter</title> | |
6 | + <style> | |
7 | + /* Your existing CSS styles */ | |
8 | + body { | |
9 | + --indicator-color: black; | |
10 | + background: radial-gradient(black 55%, var(--indicator-color)); | |
11 | + min-height: 100vh; | |
12 | + color: white; | |
13 | + margin: 0; | |
14 | + font-family: Arial, sans-serif; | |
15 | + } | |
16 | + h1 { | |
17 | + text-align: center; | |
18 | + margin-top: 20px; | |
19 | + } | |
20 | + #controls { | |
21 | + text-align: center; | |
22 | + margin: 20px; | |
23 | + } | |
24 | + #toggle_vad_button { | |
25 | + padding: 10px 20px; | |
26 | + font-size: 16px; | |
27 | + border: none; | |
28 | + border-radius: 5px; | |
29 | + background-color: #28a745; /* Green */ | |
30 | + color: white; | |
31 | + cursor: pointer; | |
32 | + transition: background-color 0.3s ease; | |
33 | + } | |
34 | + #toggle_vad_button.recording { | |
35 | + background-color: #dc3545; /* Red */ | |
36 | + } | |
37 | + #indicator { | |
38 | + text-align: center; | |
39 | + margin: 10px; | |
40 | + font-size: 18px; | |
41 | + } | |
42 | + #playlist { | |
43 | + max-width: 800px; | |
44 | + margin: 0 auto; | |
45 | + padding: 20px; | |
46 | + background-color: rgba(255, 255, 255, 0.1); | |
47 | + border-radius: 8px; | |
48 | + height: 400px; | |
49 | + overflow-y: scroll; | |
50 | + list-style: none; | |
51 | + padding-left: 0; | |
52 | + } | |
53 | + #playlist li { | |
54 | + margin-bottom: 10px; | |
55 | + opacity: 0; | |
56 | + animation: fadeIn 1s forwards; | |
57 | + } | |
58 | + #playlist li.newItem { | |
59 | + border-left: 4px solid #28a745; | |
60 | + padding-left: 10px; | |
61 | + } | |
62 | + .transcription { | |
63 | + color: white; | |
64 | + font-size: 16px; | |
65 | + } | |
66 | + .notice { | |
67 | + color: #dc3545; /* Red */ | |
68 | + font-style: italic; | |
69 | + } | |
70 | + @keyframes fadeIn { | |
71 | + to { | |
72 | + opacity: 1; | |
73 | + } | |
74 | + } | |
75 | + </style> | |
76 | +</head> | |
77 | +<body> | |
78 | + <h1>Real-Time Audio Transcription with VAD and Volume Meter</h1> | |
79 | + <div id="controls"> | |
80 | + <button id="toggle_vad_button" onclick="window.toggleVAD()" disabled>START VAD</button> | |
81 | + </div> | |
82 | + <div id="indicator">VAD is <span style="color: red">LOADING</span></div> | |
83 | + <ol id="playlist" reversed></ol> | |
84 | + | |
85 | + <!-- Include ONNX Runtime Web --> | |
86 | + <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js"></script> | |
87 | + <!-- Include VAD-Web --> | |
88 | + <script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js"></script> | |
89 | + | |
90 | + <script type="module"> | |
91 | + import { interpolateInferno } from "https://cdn.skypack.dev/d3-scale-chromatic@3"; | |
92 | + | |
93 | + // Elements | |
94 | + const recordBtn = document.getElementById('toggle_vad_button'); | |
95 | + const transcriptionsDiv = document.getElementById('playlist'); | |
96 | + const indicator = document.getElementById('indicator'); | |
97 | + | |
98 | + // State Variables | |
99 | + let isRecording = false; | |
100 | + let vadInstance = null; | |
101 | + let audioContext = null; | |
102 | + let analyser = null; | |
103 | + let microphoneStream = null; | |
104 | + let dataArray = null; | |
105 | + let animationId = null; | |
106 | + let isSpeaking = false; | |
107 | + let audioBuffer = []; | |
108 | + let sendAudioInterval = null; | |
109 | + const SEND_INTERVAL_MS = 1000; // 1 second | |
110 | + let ws = null; | |
111 | + let incomingBuffer = ''; | |
112 | + let fullTranscription = ''; // To accumulate full transcription | |
 | + let reconnectInterval = 3000; // 3 seconds between reconnection attempts (used by ws.onclose) | |
 | + let shouldReconnect = false; // Flag to control reconnection (used by ws.onclose) | |
113 | + | |
114 | + // Configuration | |
115 | + const WS_ENDPOINT = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko"; // Ensure this is correct | |
116 | + const BACKEND_UPLOAD_URL = "http://localhost:3000/upload-audio"; // Replace with your backend URL | |
117 | + const BACKEND_LLM_URL = "http://localhost:3000/process-transcription"; // Replace with your backend URL | |
118 | + | |
119 | + // Utility Functions | |
120 | + | |
121 | + /** | |
122 | + * Logs transcription text with colored words based on probability. | |
123 | + * @param {Array} words - Array of word objects with 'word' and 'probability'. | |
124 | + */ | |
125 | + function logTranscription(words) { | |
126 | + const transcriptionLine = document.createElement('div'); | |
127 | + transcriptionLine.classList.add('transcription'); | |
128 | + | |
129 | + words.forEach(wordObj => { | |
130 | + const span = document.createElement('span'); | |
131 | + span.textContent = wordObj.word + ' '; // Add space after each word | |
132 | + | |
133 | + // Calculate hue: 0 (red) to 240 (blue) | |
134 | + const hue = wordObj.probability * 240; | |
135 | + span.style.color = `hsl(${hue}, 100%, 50%)`; | |
136 | + | |
137 | + transcriptionLine.appendChild(span); | |
138 | + fullTranscription += wordObj.word + ' '; | |
139 | + }); | |
140 | + | |
141 | + transcriptionsDiv.prepend(transcriptionLine); | |
142 | + transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
143 | + } | |
144 | + | |
145 | + /** | |
146 | + * Logs notice messages (e.g., connection status, errors). | |
147 | + * @param {string} text - The notice text to display. | |
148 | + */ | |
149 | + function logNotice(text) { | |
150 | + const p = document.createElement('p'); | |
151 | + p.classList.add('notice'); | |
152 | + p.textContent = text; | |
153 | + transcriptionsDiv.prepend(p); | |
154 | + transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
155 | + } | |
156 | + | |
157 | + /** | |
158 | + * Converts Float32 audio data to Int16 PCM format. | |
159 | + * @param {Float32Array} buffer - The audio buffer in Float32 format. | |
160 | + * @returns {Int16Array} - The audio buffer in Int16 format. | |
161 | + */ | |
162 | + function convertFloat32ToInt16(buffer) { | |
163 | + let l = buffer.length; | |
164 | + const buf = new Int16Array(l); | |
165 | + while (l--) { | |
166 | + buf[l] = Math.max(-1, Math.min(1, buffer[l])) * 0x7FFF; // Clamp to [-1, 1] before scaling | |
167 | + } | |
168 | + return buf; | |
169 | + } | |
170 | + | |
171 | + /** | |
172 | + * Extracts JSON objects from a concatenated string. | |
173 | + * @param {string} buffer - The concatenated JSON string. | |
174 | + * @returns {Array} - An array of parsed JSON objects. | |
175 | + */ | |
176 | + function extractJSONObjects(buffer) { | |
177 | + const objects = []; | |
178 | + let braceStack = 0; | |
179 | + let inString = false; | |
180 | + let escape = false; | |
181 | + let lastSplit = 0; | |
182 | + | |
183 | + for (let i = 0; i < buffer.length; i++) { | |
184 | + const char = buffer[i]; | |
185 | + | |
186 | + if (char === '"' && !escape) { | |
187 | + inString = !inString; | |
188 | + } | |
189 | + | |
190 | + if (!inString) { | |
191 | + if (char === '{') { | |
192 | + braceStack++; | |
193 | + } else if (char === '}') { | |
194 | + braceStack--; | |
195 | + if (braceStack === 0) { | |
196 | + const jsonString = buffer.slice(lastSplit, i + 1); | |
197 | + try { | |
198 | + const jsonObj = JSON.parse(jsonString); | |
199 | + objects.push(jsonObj); | |
200 | + } catch (e) { | |
201 | + console.error('Failed to parse JSON:', e); | |
202 | + } | |
203 | + lastSplit = i + 1; | |
204 | + } | |
205 | + } | |
206 | + } | |
207 | + | |
208 | + // Handle escape characters | |
209 | + if (char === '\\' && !escape) { | |
210 | + escape = true; | |
211 | + } else { | |
212 | + escape = false; | |
213 | + } | |
214 | + } | |
215 | + | |
216 | + // Return any remaining buffer that wasn't parsed | |
217 | + incomingBuffer = buffer.slice(lastSplit); | |
218 | + return objects; | |
219 | + } | |
220 | + | |
221 | + // WebSocket Handlers | |
222 | + | |
223 | + /** | |
224 | + * Sets up the WebSocket connection and defines event handlers. | |
225 | + */ | |
226 | + function setupWebSocket() { | |
227 | + ws = new WebSocket(WS_ENDPOINT); | |
228 | + ws.binaryType = 'arraybuffer'; | |
229 | + | |
230 | + ws.onopen = () => { | |
231 | + console.log('WebSocket connection opened.'); | |
232 | + logNotice("WebSocket connection established."); | |
233 | + }; | |
234 | + | |
235 | + ws.onmessage = (event) => { | |
236 | + let messageData = ''; | |
237 | + | |
238 | + if (typeof event.data === 'string') { | |
239 | + messageData = event.data; | |
240 | + } else if (event.data instanceof ArrayBuffer) { | |
241 | + const decoder = new TextDecoder('utf-8'); | |
242 | + messageData = decoder.decode(event.data); | |
243 | + } else { | |
244 | + console.warn('Unsupported message format:', event.data); | |
245 | + return; | |
246 | + } | |
247 | + | |
248 | + // Append incoming data to buffer | |
249 | + incomingBuffer += messageData; | |
250 | + | |
251 | + // Extract JSON objects | |
252 | + const jsonObjects = extractJSONObjects(incomingBuffer); | |
253 | + | |
254 | + // Process each JSON object | |
255 | + jsonObjects.forEach(obj => { | |
256 | + if (obj.task === "transcribe" && Array.isArray(obj.words)) { | |
257 | + logTranscription(obj.words); | |
258 | + } | |
259 | + }); | |
260 | + }; | |
261 | + | |
262 | + ws.onclose = (event) => { | |
263 | + console.log('WebSocket connection closed:', event); | |
264 | + logNotice("WebSocket connection closed."); | |
265 | + ws = null; | |
266 | + | |
267 | + if (isRecording && shouldReconnect) { | |
268 | + logNotice("Attempting to reconnect..."); | |
269 | + setTimeout(() => { | |
270 | + setupWebSocket(); | |
271 | + }, reconnectInterval); | |
272 | + } else if (isRecording) { | |
273 | + logNotice("Transcription session ended."); | |
274 | + stopRecording(true); // true indicates server-initiated stop | |
275 | + } | |
276 | + }; | |
277 | + | |
278 | + ws.onerror = (error) => { | |
279 | + console.error('WebSocket error:', error); | |
280 | + logNotice("WebSocket encountered an error."); | |
281 | + }; | |
282 | + } | |
283 | + | |
284 | + // Voice Activity Detection Setup | |
285 | + | |
286 | + /** | |
287 | + * Initializes the Voice Activity Detector (VAD) using MicVAD. | |
288 | + */ | |
289 | + async function initializeVAD(stream) { | |
290 | + try { | |
291 | + vadInstance = await vad.MicVAD.new({ | |
292 | + stream: stream, // Pass the existing MediaStream to avoid multiple microphone accesses | |
293 | + onSpeechStart: () => { | |
294 | + console.log("Speech start detected"); | |
295 | + logNotice("Speech detected..."); | |
296 | + | |
297 | + isSpeaking = true; | |
298 | + audioBuffer = []; // Reset buffer | |
299 | + | |
300 | + // Start timer to send audio every second | |
301 | + sendAudioInterval = setInterval(sendAudio, SEND_INTERVAL_MS); | |
302 | + }, | |
303 | + onSpeechEnd: (audio) => { | |
304 | + console.log("Speech end detected"); | |
305 | + logNotice("Sending final speech segment to server..."); | |
306 | + | |
307 | + isSpeaking = false; | |
308 | + | |
309 | + // Send any remaining audio | |
310 | + sendAudio(); | |
311 | + | |
312 | + // Stop the timer | |
313 | + if (sendAudioInterval) { | |
314 | + clearInterval(sendAudioInterval); | |
315 | + sendAudioInterval = null; | |
316 | + } | |
317 | + | |
318 | + // Optionally, send the final `audio` provided by the callback | |
319 | + // depending on your application's needs | |
320 | + // Example: | |
321 | + // sendFinalAudio(audio); | |
322 | + }, | |
323 | + onFrameProcessed: (probabilities, frame) => { | |
324 | + const indicatorColor = interpolateInferno(probabilities.isSpeech / 2); | |
325 | + document.body.style.setProperty("--indicator-color", indicatorColor); | |
326 | + | |
327 | + if (isSpeaking) { | |
328 | + audioBuffer.push(frame); | |
329 | + } | |
330 | + }, | |
331 | + }); | |
332 | + | |
333 | + window.vadInstance = vadInstance; | |
334 | + | |
335 | + // Start VAD listening | |
336 | + vadInstance.start(); | |
337 | + isRecording = true; | |
338 | + recordBtn.textContent = 'STOP VAD'; | |
339 | + recordBtn.classList.add('recording'); | |
340 | + logNotice("Recording started. Speak into your microphone."); | |
341 | + } catch (error) { | |
342 | + console.error('Error initializing VAD:', error); | |
343 | + logNotice("Error initializing Voice Activity Detection."); | |
344 | + } | |
345 | + } | |
346 | + | |
347 | + // Volume Meter Setup (Optional, based on your requirements) | |
348 | + | |
349 | + /** | |
350 | + * Sets up the volume meter using the Web Audio API. | |
351 | + */ | |
352 | + async function setupVolumeMeter(stream) { | |
353 | + try { | |
354 | + // Initialize AudioContext | |
355 | + audioContext = new (window.AudioContext || window.webkitAudioContext)(); | |
356 | + | |
357 | + // Create MediaStreamSource from the existing stream | |
358 | + microphoneStream = audioContext.createMediaStreamSource(stream); | |
359 | + | |
360 | + // Create AnalyserNode | |
361 | + analyser = audioContext.createAnalyser(); | |
362 | + analyser.fftSize = 512; | |
363 | + const bufferLength = analyser.frequencyBinCount; | |
364 | + dataArray = new Uint8Array(bufferLength); | |
365 | + | |
366 | + // Connect microphone to analyser | |
367 | + microphoneStream.connect(analyser); | |
368 | + | |
369 | + // Start visualizing | |
370 | + visualize(); | |
371 | + } catch (error) { | |
372 | + console.error('Error setting up volume meter:', error); | |
373 | + logNotice("Error setting up volume meter."); | |
374 | + } | |
375 | + } | |
376 | + | |
377 | + /** | |
378 | + * Visualizes the volume level on the volume meter. | |
379 | + */ | |
380 | + function visualize() { | |
381 | + const updateVolume = () => { | |
382 | + analyser.getByteFrequencyData(dataArray); | |
383 | + let sum = 0; | |
384 | + for (let i = 0; i < dataArray.length; i++) { | |
385 | + sum += dataArray[i]; | |
386 | + } | |
387 | + const average = sum / dataArray.length; | |
388 | + const volume = average / 255; // Normalize to [0,1] | |
389 | + | |
390 | + // Update the volume bar width | |
391 | + volumeBar.style.width = `${volume * 100}%`; | |
392 | + | |
393 | + // Change color based on volume level (green to red) | |
394 | + const hue = (1 - volume) * 120; // 120 (green) to 0 (red) | |
395 | + volumeBar.style.backgroundColor = `hsl(${hue}, 100%, 50%)`; | |
396 | + | |
397 | + animationId = requestAnimationFrame(updateVolume); | |
398 | + }; | |
399 | + | |
400 | + updateVolume(); | |
401 | + } | |
402 | + | |
403 | + /** | |
404 | + * Stops the volume meter visualization. | |
405 | + */ | |
406 | + function stopVolumeMeter() { | |
407 | + if (animationId) { | |
408 | + cancelAnimationFrame(animationId); | |
409 | + animationId = null; | |
410 | + } | |
411 | + if (volumeBar) { | |
412 | + volumeBar.style.width = '0%'; | |
413 | + volumeBar.style.backgroundColor = '#28a745'; // Reset to green | |
414 | + } | |
415 | + if (analyser) { | |
416 | + analyser.disconnect(); | |
417 | + analyser = null; | |
418 | + } | |
419 | + if (microphoneStream) { | |
420 | + microphoneStream.disconnect(); | |
421 | + microphoneStream = null; | |
422 | + } | |
423 | + if (audioContext) { | |
424 | + audioContext.close(); | |
425 | + audioContext = null; | |
426 | + } | |
427 | + } | |
428 | + | |
429 | + // LLM Integration | |
430 | + | |
431 | + /** | |
432 | + * Sends the transcription to the backend server for LLM processing. | |
433 | + * @param {string} transcription - The transcribed text. | |
434 | + */ | |
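 | + // Assumed backend response shape (mirrors what displayLLMResponse() below expects): | |
 | + //   { "llmResponse": { "choices": [ { "message": { "content": "..." } } ] } } | |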
435 | + async function sendTranscriptionToLLM(transcription) { | |
436 | + try { | |
437 | + const response = await fetch(BACKEND_LLM_URL, { // Adjust the URL if your server is hosted elsewhere | |
438 | + method: 'POST', | |
439 | + headers: { | |
440 | + 'Content-Type': 'application/json', | |
441 | + }, | |
442 | + body: JSON.stringify({ transcription }), | |
443 | + }); | |
444 | + | |
445 | + if (!response.ok) { | |
446 | + throw new Error(`Server error: ${response.status}`); | |
447 | + } | |
448 | + | |
449 | + const data = await response.json(); | |
450 | + if (data.llmResponse) { | |
451 | + displayLLMResponse(data.llmResponse); | |
452 | + } | |
453 | + } catch (error) { | |
454 | + console.error('Error sending transcription to LLM:', error); | |
455 | + logNotice("Error processing transcription with LLM."); | |
456 | + } | |
457 | + } | |
458 | + | |
459 | + /** | |
460 | + * Displays the LLM's response in the transcriptions div. | |
461 | + * @param {object} llmResponse - The response from the LLM. | |
462 | + */ | |
463 | + function displayLLMResponse(llmResponse) { | |
464 | + // Adjust based on your LLM's response structure | |
465 | + const responseText = llmResponse.choices && llmResponse.choices[0] && llmResponse.choices[0].message && llmResponse.choices[0].message.content | |
466 | + ? llmResponse.choices[0].message.content | |
467 | + : 'No response from LLM.'; | |
468 | + | |
469 | + const responseLine = document.createElement('div'); | |
470 | + responseLine.classList.add('transcription'); | |
471 | + | |
472 | + const span = document.createElement('span'); | |
473 | + span.textContent = `LLM Response: ${responseText}`; | |
474 | + span.style.color = `hsl(200, 100%, 50%)`; // Example color | |
475 | + | |
476 | + responseLine.appendChild(span); | |
477 | + transcriptionsDiv.prepend(responseLine); | |
478 | + transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight; | |
479 | + } | |
480 | + | |
481 | + /** | |
482 | + * Sends the accumulated audio to the server. | |
483 | + */ | |
484 | + async function sendAudioToServer(audioBuffer) { | |
485 | + try { | |
486 | + const response = await fetch(BACKEND_UPLOAD_URL, { // Replace with your backend URL | |
487 | + method: 'POST', | |
488 | + headers: { | |
489 | + 'Content-Type': 'application/octet-stream', // Adjust based on server expectations | |
490 | + }, | |
491 | + body: audioBuffer, | |
492 | + }); | |
493 | + | |
494 | + if (!response.ok) { | |
495 | + throw new Error(`Server responded with status ${response.status}`); | |
496 | + } | |
497 | + | |
498 | + console.log('Audio sent successfully'); | |
499 | + } catch (error) { | |
500 | + console.error('Error sending audio:', error); | |
501 | + logNotice("Error sending audio to server."); | |
502 | + } | |
503 | + } | |
504 | + | |
505 | + /** | |
506 | + * Adds an audio element to the playlist. | |
507 | + * @param {string} audioUrl - The data URL of the audio. | |
508 | + * @returns {HTMLElement} - The created list item element. | |
509 | + */ | |
510 | + function addAudio(audioUrl) { | |
511 | + const entry = document.createElement("li"); | |
512 | + const audio = document.createElement("audio"); | |
513 | + audio.controls = true; | |
514 | + audio.src = audioUrl; | |
515 | + entry.classList.add("newItem"); | |
516 | + entry.appendChild(audio); | |
517 | + return entry; | |
518 | + } | |
519 | + | |
520 | + // Recording Control Functions | |
521 | + | |
522 | + /** | |
523 | + * Starts the Voice Activity Detection, Volume Meter, and WebSocket connection. | |
524 | + */ | |
525 | + async function startRecording() { | |
526 | + try { | |
527 | + // Request microphone access once | |
528 | + const stream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false }); | |
529 | + | |
530 | + // Optionally, set up Volume Meter | |
531 | + // await setupVolumeMeter(stream); | |
532 | + | |
533 | + // Initialize VAD with the same stream | |
534 | + await initializeVAD(stream); | |
535 | + | |
536 | + // Set up WebSocket | |
537 | + shouldReconnect = true; // Enable reconnection attempts | |
 | + setupWebSocket(); | |
538 | + } catch (error) { | |
539 | + console.error('Error starting recording:', error); | |
540 | + logNotice("Error starting recording. Please try again."); | |
541 | + } | |
542 | + } | |
543 | + | |
544 | + /** | |
545 | + * Stops the Voice Activity Detection, Volume Meter, and cleans up resources. | |
546 | + * @param {boolean} serverInitiated - Indicates if the stop was triggered by the server. | |
547 | + */ | |
548 | + function stopRecording(serverInitiated = false) { | |
549 | + if (!isRecording) return; | |
550 | + | |
551 | + // Stop VAD | |
552 | + if (vadInstance) { | |
553 | + if (typeof vadInstance.pause === 'function') { | |
554 | + vadInstance.pause(); | |
555 | + } else { | |
556 | + console.warn('VAD instance does not have a pause method.'); | |
557 | + } | |
558 | + vadInstance = null; | |
559 | + } | |
560 | + | |
561 | + // Optionally, stop Volume Meter | |
562 | + // stopVolumeMeter(); | |
563 | + | |
564 | + // Prevent reconnection if stopping manually | |
565 | + if (!serverInitiated) { | |
566 | + shouldReconnect = false; | |
567 | + } | |
568 | + | |
569 | + // Close WebSocket if not server-initiated | |
570 | + if (!serverInitiated && ws && ws.readyState === WebSocket.OPEN) { | |
571 | + ws.send(JSON.stringify({ action: "terminate" })); | |
572 | + logNotice("Termination signal sent to server."); | |
573 | + } | |
574 | + | |
575 | + // Close WebSocket | |
576 | + if (ws) { | |
577 | + ws.close(); | |
578 | + ws = null; | |
579 | + } | |
580 | + | |
581 | + // Reset recording state | |
582 | + isRecording = false; | |
583 | + recordBtn.textContent = 'START VAD'; | |
584 | + recordBtn.classList.remove('recording'); | |
585 | + logNotice("Recording stopped."); | |
586 | + | |
587 | + // Send the full transcription to the LLM | |
588 | + if (fullTranscription.trim().length > 0) { | |
589 | + sendTranscriptionToLLM(fullTranscription.trim()); | |
590 | + fullTranscription = ''; // Reset after sending | |
591 | + } | |
592 | + } | |
593 | + | |
594 | + /** | |
595 | + * Sends the accumulated audio to the server periodically. | |
596 | + */ | |
597 | + async function sendAudio() { | |
598 | + if (audioBuffer.length === 0) return; | |
599 | + | |
600 | + // Concatenate all frames into a single Float32Array | |
601 | + const totalLength = audioBuffer.reduce((sum, frame) => sum + frame.length, 0); | |
602 | + const concatenated = new Float32Array(totalLength); | |
603 | + let offset = 0; | |
604 | + audioBuffer.forEach(frame => { | |
605 | + concatenated.set(frame, offset); | |
606 | + offset += frame.length; | |
607 | + }); | |
608 | + | |
609 | + // Encode to WAV format | |
610 | + const wavBuffer = vad.utils.encodeWAV(concatenated); | |
611 | + | |
612 | + // Send the audio to the server | |
613 | + await sendAudioToServer(wavBuffer); | |
614 | + | |
615 | + // Optionally, add the audio to the UI | |
616 | + const base64 = vad.utils.arrayBufferToBase64(wavBuffer); | |
617 | + const audioUrl = `data:audio/wav;base64,${base64}`; | |
618 | + const audioElement = addAudio(audioUrl); | |
619 | + transcriptionsDiv.prepend(audioElement); | |
620 | + | |
621 | + // Reset the buffer | |
622 | + audioBuffer = []; | |
623 | + } | |
624 | + | |
625 | + // Button Event Listener | |
626 | + | |
627 | + /** | |
628 | + * Toggles recording state when the record button is clicked. | |
629 | + */ | |
630 | + window.toggleVAD = () => { | |
631 | + console.log("ran toggle vad"); | |
632 | + if (!isRecording) { | |
633 | + startRecording().catch(error => { | |
634 | + console.error('Error starting recording:', error); | |
635 | + logNotice("Error starting recording. Please try again."); | |
636 | + }); | |
637 | + } else { | |
638 | + stopRecording(); | |
639 | + } | |
640 | + }; | |
641 | + </script> | |
642 | +</body> | |
643 | +</html> |
+++ get_microphone.py
... | ... | @@ -0,0 +1,96 @@ |
1 | +import pyaudio | |
2 | +import wave | |
3 | + | |
4 | +pa = pyaudio.PyAudio() | |
5 | + | |
6 | +def get_microphone(): | |
7 | + """ | |
8 | + Creates a CLI prompt that lists every available microphone, waits for the user's choice, and returns the values needed to open that microphone with PyAudio. | |
9 | + Returns a dict so callers can look the values up by name rather than by position. | |
10 | + :return: { | |
11 | + "device_num": device_num, | |
12 | + "microphone_channel_num": microphone_channel_num, | |
13 | + "microphone_sample_rate": microphone_sample_rate, | |
14 | + } | |
15 | + """ | |
16 | + | |
17 | + def get_valid_integer(prompt, min_value, max_value): | |
18 | + """ | |
19 | + Prompt the user for an integer input within a specified range. | |
20 | + Sanitizes non-integer input and values outside the range. | |
21 | + | |
22 | + :param prompt: The input prompt message | |
23 | + :param min_value: Minimum acceptable integer value (inclusive) | |
24 | + :param max_value: Maximum acceptable integer value (inclusive) | |
25 | + :return: A valid integer within the range [min_value, max_value] | |
26 | + """ | |
27 | + while True: | |
28 | + try: | |
29 | + user_input = input(prompt) | |
30 | + | |
31 | + value = int(user_input) | |
32 | + | |
33 | + if min_value <= value <= max_value: | |
34 | + return value | |
35 | + else: | |
36 | + print(f"Error: Please enter an integer between {min_value} and {max_value}.") | |
37 | + except ValueError: | |
38 | + print("Error: Invalid input. Please enter an integer.") | |
39 | + | |
40 | + # List all devices to see their indexes | |
41 | + audio_list = [] | |
42 | + for i in range(pa.get_device_count()): | |
43 | + dev = pa.get_device_info_by_index(i) | |
44 | + print( | |
45 | + i, | |
46 | + dev['name'], | |
47 | + dev['maxInputChannels'], | |
48 | + dev['defaultSampleRate'] | |
49 | + ) | |
50 | + audio_list.append(dev) | |
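 | + # Each printed row looks like (hypothetical device): 1 MacBook Pro Microphone 1 48000.0 | |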
51 | + | |
52 | + mesg = "Which device is the microphone you are using? " | |
53 | + device_num = get_valid_integer(mesg, 0, len(audio_list) - 1) | |
54 | + | |
55 | + microphone_channel_num = audio_list[device_num]['maxInputChannels'] | |
56 | + # must pass int type, pyaudio only accepts int type | |
57 | + microphone_sample_rate = int(audio_list[device_num]['defaultSampleRate']) | |
58 | + try: | |
59 | + stream = pa.open( | |
60 | + format=pyaudio.paInt16, | |
61 | + channels=microphone_channel_num, | |
62 | + rate=microphone_sample_rate, | |
63 | + input=True, | |
64 | + frames_per_buffer=1024, | |
65 | + input_device_index=device_num, | |
66 | + ) | |
67 | + # Save to a valid WAV file | |
68 | + | |
69 | + frames = [] | |
70 | + for _ in range(0, int(microphone_sample_rate / 1024 * 1)): # Record for about 1 second | |
71 | + data = stream.read(1024) | |
72 | + frames.append(data) | |
73 | + print("Recording finished.") | |
74 | + | |
75 | + output_filename = "test.wav" | |
76 | + with wave.open(output_filename, "wb") as wf: | |
77 | + wf.setnchannels(microphone_channel_num) | |
78 | + wf.setsampwidth(pa.get_sample_size(pyaudio.paInt16)) | |
79 | + wf.setframerate(microphone_sample_rate) | |
80 | + wf.writeframes(b"".join(frames)) | |
81 | + print("Recorded some data:", len(data)) | |
82 | + print("Looks Good To Me.") | |
83 | + stream.close() | |
84 | + pa.terminate() | |
85 | + except Exception as e: | |
86 | + print("Something went wrong. Cannot open the audio device.") | |
87 | + print(e) | |
88 | + | |
89 | + return { | |
90 | + "device_num": device_num, | |
91 | + "microphone_channel_num": microphone_channel_num, | |
92 | + "microphone_sample_rate": microphone_sample_rate, | |
93 | + } | |
94 | + | |
95 | +if __name__ == "__main__": | |
96 | +print(get_microphone()) (No newline at end of file) |
+++ websocket_client.py
... | ... | @@ -0,0 +1,295 @@ |
1 | +import threading | |
2 | +import subprocess | |
3 | +import sys | |
4 | +import platform | |
5 | +import websocket | |
6 | +import json | |
7 | +import shlex | |
8 | + | |
9 | +from tkinter import Tk, Button, Label, Text, Scrollbar, END | |
10 | + | |
11 | +# Import the microphone selection helper | |
12 | +from get_microphone import get_microphone | |
13 | + | |
14 | +# 1) Server configuration | |
15 | +SERVER_URL = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko" | |
16 | + | |
17 | +# 2) Audio configuration | |
18 | +TARGET_RATE = 16000 # Resample to 16 kHz for the server | |
19 | +CHANNELS = 1 # Mono | |
20 | +FORMAT = 's16le' # 16-bit PCM little endian | |
21 | + | |
22 | + | |
23 | +# 3) FFmpeg configuration | |
24 | +def get_ffmpeg_command(device_info): | |
25 | + """ | |
26 | + Constructs the FFmpeg command based on the operating system and selected device. | |
27 | + | |
28 | + :param device_info: Dictionary containing device information from get_microphone() | |
29 | + :return: List of FFmpeg command arguments | |
30 | + """ | |
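 | + # Example result on Windows (hypothetical device name): | |
 | + #   ffmpeg -f dshow -i "audio=Microphone (Realtek High Definition Audio)" -ar 16000 -ac 1 -f s16le pipe:1 | |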
31 | + os_name = platform.system() | |
32 | + | |
33 | + if os_name == "Windows": | |
34 | + # For Windows, FFmpeg uses 'dshow' as the input device. | |
35 | + # device_info should contain the 'name' of the device as recognized by FFmpeg. | |
36 | + device_name = device_info.get("name", "default") | |
37 | + # Example device name: "Microphone (Realtek High Definition Audio)" | |
38 | + cmd = [ | |
39 | + "ffmpeg", | |
40 | + "-f", "dshow", | |
41 | + "-i", f"audio={device_name}", | |
42 | + "-ar", str(TARGET_RATE), | |
43 | + "-ac", str(CHANNELS), | |
44 | + "-f", FORMAT, | |
45 | + "pipe:1" | |
46 | + ] | |
47 | + elif os_name == "Darwin": | |
48 | + # For macOS, FFmpeg uses 'avfoundation'. | |
49 | + # device_info should contain the 'device_index' for audio. | |
50 | + device_index = device_info.get("device_index", "0") | |
51 | + # Example device index: "0" for default | |
52 | + cmd = [ | |
53 | + "ffmpeg", | |
54 | + "-f", "avfoundation", | |
55 | + "-i", f":{device_index}", | |
56 | + "-ar", str(TARGET_RATE), | |
57 | + "-ac", str(CHANNELS), | |
58 | + "-f", FORMAT, | |
59 | + "pipe:1" | |
60 | + ] | |
61 | + elif os_name == "Linux": | |
62 | + # For Linux, FFmpeg uses 'alsa'. | |
63 | + # device_info should contain the 'device_name' as recognized by FFmpeg. | |
64 | + device_name = device_info.get("name", "default") | |
65 | + # Example device name: "default" or "hw:1,0" | |
66 | + cmd = [ | |
67 | + "ffmpeg", | |
68 | + "-f", "alsa", | |
69 | + "-i", device_name, | |
70 | + "-ar", str(TARGET_RATE), | |
71 | + "-ac", str(CHANNELS), | |
72 | + "-f", FORMAT, | |
73 | + "pipe:1" | |
74 | + ] | |
75 | + else: | |
76 | + raise ValueError(f"Unsupported OS: {os_name}") | |
77 | + | |
78 | + return cmd | |
79 | + | |
80 | + | |
81 | +class SpeechToTextClient: | |
82 | + """ | |
83 | + A client that: | |
84 | + - Uses FFmpeg to capture and process audio | |
85 | + - Initializes a WebSocket connection | |
86 | + - Streams raw 16-bit PCM over the WebSocket | |
87 | + - Displays transcriptions from the server in the GUI | |
88 | + """ | |
89 | + | |
90 | + def __init__(self, gui): | |
91 | + """ | |
92 | + :param gui: An instance of the SpeechToTextGUI class for UI callbacks | |
93 | + """ | |
94 | + self.gui = gui | |
95 | + self.ws = None | |
96 | + self.ffmpeg_process = None | |
97 | + self.streaming_thread = None | |
98 | + self.running = False | |
99 | + | |
100 | + # Ask the user to pick a device | |
101 | + mic_info = get_microphone()  # Dict with device_num, microphone_channel_num, microphone_sample_rate | |
 | + # Note: get_ffmpeg_command() additionally looks for 'name' (dshow/alsa) or 'device_index' | |
 | + # (avfoundation); add those keys here if the FFmpeg defaults do not match your device. | |
102 | + self.device_info = mic_info | |
103 | + | |
104 | + # Prepare the FFmpeg command | |
105 | + self.ffmpeg_cmd = get_ffmpeg_command(self.device_info) | |
106 | + | |
107 | + def start_recording(self): | |
108 | + """Starts FFmpeg, initializes the WebSocket connection, and begins streaming audio.""" | |
109 | + if self.running: | |
110 | + print("Already recording.") | |
111 | + return | |
112 | + | |
113 | + self.running = True | |
114 | + | |
115 | + # 1) Start FFmpeg subprocess | |
116 | + try: | |
117 | + self.ffmpeg_process = subprocess.Popen( | |
118 | + self.ffmpeg_cmd, | |
119 | + stdout=subprocess.PIPE, | |
120 | + stderr=subprocess.DEVNULL, # Suppress FFmpeg stderr; remove if debugging | |
121 | + bufsize=10 ** 8 | |
122 | + ) | |
123 | + print("FFmpeg started.") | |
124 | + except Exception as e: | |
125 | + print(f"Failed to start FFmpeg: {e}") | |
126 | + self.running = False | |
127 | + return | |
128 | + | |
129 | + # 2) Initialize the WebSocket connection | |
130 | + self.ws = websocket.WebSocketApp( | |
131 | + SERVER_URL, | |
132 | + on_message=self.on_message, | |
133 | + on_error=self.on_error, | |
134 | + on_close=self.on_close | |
135 | + ) | |
136 | + # Run WebSocket in a background thread | |
137 | + ws_thread = threading.Thread(target=self.ws.run_forever, daemon=True) | |
138 | + ws_thread.start() | |
139 | + print("WebSocket connection initiated.") | |
140 | + | |
141 | + # 3) Start audio streaming loop in a separate thread | |
142 | + self.streaming_thread = threading.Thread(target=self.audio_stream, daemon=True) | |
143 | + self.streaming_thread.start() | |
144 | + | |
145 | + self.gui.update_status("Recording started...") | |
146 | + | |
147 | + def stop_recording(self): | |
148 | + """Stops audio streaming, terminates FFmpeg, and closes the WebSocket.""" | |
149 | + if not self.running: | |
150 | + print("Not currently recording.") | |
151 | + return | |
152 | + | |
153 | + self.running = False | |
154 | + | |
155 | + # 1) Terminate FFmpeg subprocess | |
156 | + if self.ffmpeg_process: | |
157 | + self.ffmpeg_process.terminate() | |
158 | + self.ffmpeg_process = None | |
159 | + print("FFmpeg terminated.") | |
160 | + | |
161 | + # 2) Close WebSocket connection | |
162 | + if self.ws: | |
163 | + self.ws.close() | |
164 | + self.ws = None | |
165 | + print("WebSocket connection closed.") | |
166 | + | |
167 | + self.gui.update_status("Recording stopped...") | |
168 | + | |
169 | + def audio_stream(self): | |
170 | + """ | |
171 | + Continuously reads audio data from FFmpeg's stdout and sends it over WebSocket. | |
172 | + """ | |
173 | + try: | |
174 | + while self.running: | |
175 | + # Read a chunk of data | |
176 | + data = self.ffmpeg_process.stdout.read(4096) # Adjust chunk size as needed | |
177 | + if not data: | |
178 | + print("No more data from FFmpeg.") | |
179 | + break | |
180 | + | |
181 | + # Send audio frames over WebSocket (binary) | |
182 | + if self.ws and self.ws.sock and self.ws.sock.connected: | |
183 | + try: | |
184 | + self.ws.send(data, opcode=websocket.ABNF.OPCODE_BINARY) | |
185 | + except Exception as e: | |
186 | + print(f"Error sending data over WebSocket: {e}") | |
187 | + break | |
188 | + else: | |
189 | + print("WebSocket is not connected.") | |
190 | + break | |
191 | + | |
192 | + except Exception as e: | |
193 | + print(f"Error during audio streaming: {e}") | |
194 | + finally: | |
195 | + # stop_recording() resets self.running and tears down FFmpeg and the WebSocket | |
196 | + self.stop_recording() | |
197 | + | |
198 | + # --------------------- | |
199 | + # WebSocket Callbacks | |
200 | + # --------------------- | |
201 | + def on_message(self, ws, message): | |
202 | + """Handle transcriptions (or other messages) from the server.""" | |
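 | + # Example message this handler expects (assumed shape): {"text": "transcribed sentence"} | |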
203 | + print("Received from server:", message) | |
204 | + try: | |
205 | + data = json.loads(message) | |
206 | + transcription = data.get("text", "") | |
207 | + if transcription: | |
208 | + self.gui.display_transcription(transcription) | |
209 | + except json.JSONDecodeError: | |
210 | + print("Error: Received invalid JSON:", message) | |
211 | + | |
212 | + def on_error(self, ws, error): | |
213 | + """Handle any WebSocket errors.""" | |
214 | + print("WebSocket Error:", error) | |
215 | + | |
216 | + def on_close(self, ws, close_status_code, close_msg): | |
217 | + """Called when the WebSocket connection is closed.""" | |
218 | + print("WebSocket Closed") | |
219 | + | |
220 | + | |
221 | +class SpeechToTextGUI: | |
222 | + """ | |
223 | + The GUI class for user interaction: | |
224 | + - Start/Stop buttons | |
225 | + - Status updates | |
226 | + - Displays transcriptions | |
227 | + - Ties everything together with SpeechToTextClient | |
228 | + """ | |
229 | + | |
230 | + def __init__(self): | |
231 | + self.client = SpeechToTextClient(self) | |
232 | + | |
233 | + # Main window setup | |
234 | + self.root = Tk() | |
235 | + self.root.title("Speech-to-Text Client") | |
236 | + | |
237 | + # Status label | |
238 | + self.status_label = Label(self.root, text="Click 'Start Recording' to begin.", anchor="w") | |
239 | + self.status_label.pack(fill="x", padx=10, pady=5) | |
240 | + | |
241 | + # Text area for transcriptions | |
242 | + self.text_display = Text(self.root, wrap="word", height=20) | |
243 | + self.text_display.pack(fill="both", expand=True, padx=10, pady=5) | |
244 | + | |
245 | + # Scrollbar for transcription area | |
246 | + scrollbar = Scrollbar(self.text_display) | |
247 | + scrollbar.pack(side="right", fill="y") | |
248 | + self.text_display.config(yscrollcommand=scrollbar.set) | |
249 | + scrollbar.config(command=self.text_display.yview) | |
250 | + | |
251 | + # Start/Stop Buttons | |
252 | + start_button = Button( | |
253 | + self.root, | |
254 | + text="Start Recording", | |
255 | + command=self.client.start_recording, | |
256 | + bg="green", | |
257 | + fg="white" | |
258 | + ) | |
259 | + start_button.pack(side="left", padx=10, pady=10) | |
260 | + | |
261 | + stop_button = Button( | |
262 | + self.root, | |
263 | + text="Stop Recording", | |
264 | + command=self.client.stop_recording, | |
265 | + bg="red", | |
266 | + fg="white" | |
267 | + ) | |
268 | + stop_button.pack(side="right", padx=10, pady=10) | |
269 | + | |
270 | + # Handle window close event to ensure subprocesses are terminated | |
271 | + self.root.protocol("WM_DELETE_WINDOW", self.on_close) | |
272 | + | |
273 | + def update_status(self, message): | |
274 | + """Updates the status label.""" | |
275 | + self.status_label.config(text=message) | |
276 | + | |
277 | + def display_transcription(self, transcription): | |
278 | + """Appends transcriptions to the text box and scrolls to the end.""" | |
279 | + if transcription: | |
280 | + self.text_display.insert(END, transcription + "\n") | |
281 | + self.text_display.see(END) # Auto-scroll | |
282 | + | |
283 | + def on_close(self): | |
284 | + """Handle the window close event.""" | |
285 | + self.client.stop_recording() | |
286 | + self.root.destroy() | |
287 | + | |
288 | + def run(self): | |
289 | + """Start the Tkinter event loop.""" | |
290 | + self.root.mainloop() | |
291 | + | |
292 | + | |
293 | +if __name__ == "__main__": | |
294 | + gui = SpeechToTextGUI() | |
295 | + gui.run() |