윤영준 01-17
Hello Yona
@9bb012f1031c90b35455366c2d904b48debd770c
 
README.md (added)
+++ README.md
@@ -0,0 +1,1 @@
+# whisper_client
 
client.html (added)
+++ client.html
@@ -0,0 +1,483 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Real-Time Audio Transcription with VAD and Volume Meter</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            margin: 40px;
+            background-color: #f5f5f5;
+        }
+        h1 {
+            text-align: center;
+        }
+        #controls {
+            text-align: center;
+            margin-bottom: 20px;
+        }
+        #recordBtn {
+            padding: 15px 30px;
+            font-size: 18px;
+            border: none;
+            border-radius: 5px;
+            background-color: #28a745; /* Green */
+            color: white;
+            cursor: pointer;
+            transition: background-color 0.3s ease;
+        }
+        #recordBtn.recording {
+            background-color: #dc3545; /* Red */
+        }
+        #transcriptions {
+            max-width: 800px;
+            margin: 0 auto;
+            padding: 20px;
+            background-color: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
+            height: 600px;
+            overflow-y: auto;
+            white-space: pre-wrap;
+            font-size: 16px;
+        }
+        .transcription {
+            margin-bottom: 10px;
+        }
+        .notice {
+            color: #dc3545; /* Red */
+            font-style: italic;
+        }
+        /* Volume Meter Styles */
+        #volumeMeter {
+            width: 300px;
+            height: 30px;
+            background-color: #e0e0e0;
+            border-radius: 15px;
+            overflow: hidden;
+            margin: 20px auto;
+            position: relative;
+        }
+        #volumeBar {
+            height: 100%;
+            width: 0%;
+            background-color: #28a745;
+            transition: width 0.1s ease, background-color 0.1s ease;
+        }
+    </style>
+</head>
+<body>
+    <h1>Real-Time Audio Transcription with VAD and Volume Meter</h1>
+    <div id="controls">
+        <button id="recordBtn">Start Recording</button>
+    </div>
+    <!-- Volume Meter -->
+    <div id="volumeMeter">
+        <div id="volumeBar"></div>
+    </div>
+    <div id="transcriptions"></div>
+
+    <!-- Include ONNX Runtime Web -->
+    <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js"></script>
+    <!-- Include VAD-Web -->
+    <script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js"></script>
+
+    <script>
+        // Elements
+        const recordBtn = document.getElementById('recordBtn');
+        const transcriptionsDiv = document.getElementById('transcriptions');
+        const volumeBar = document.getElementById('volumeBar');
+
+        // State Variables
+        let isRecording = false;
+        let vadInstance = null; // Named to avoid shadowing the global "vad" namespace exposed by vad-web
+        let ws = null;
+        let audioContext = null;
+        let analyser = null;
+        let microphoneStream = null;
+        let dataArray = null;
+        let animationId = null;
+        let reconnectInterval = 3000; // 3 seconds
+        let shouldReconnect = false; // Flag to control reconnection
+
+        // Configuration
+        const WS_ENDPOINT = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko"; // Whisper transcription WebSocket endpoint; adjust host, port, and language as needed
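+        // The code below assumes the server accepts raw 16-bit PCM speech segments as
+        // binary WebSocket frames and replies with JSON objects shaped like
+        // { "task": "transcribe", "words": [{ "word": ..., "probability": ... }, ...] }.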
+
+        // Buffer to hold incoming data for JSON parsing
+        let incomingBuffer = '';
+
+        // Utility Functions
+
+        /**
+         * Logs transcription text with colored words based on probability.
+         * @param {Array} words - Array of word objects with 'word' and 'probability'.
+         */
+        function logTranscription(words) {
+            const transcriptionLine = document.createElement('div');
+            transcriptionLine.classList.add('transcription');
+
+            words.forEach(wordObj => {
+                const span = document.createElement('span');
+                span.textContent = wordObj.word + ' '; // Add space after each word
+
+                // Calculate hue: 0 (red) to 240 (blue)
+                const hue = wordObj.probability * 240;
+                span.style.color = `hsl(${hue}, 100%, 50%)`;
+
+                transcriptionLine.appendChild(span);
+            });
+
+            transcriptionsDiv.appendChild(transcriptionLine);
+            transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight;
+        }
+
+        /**
+         * Logs notice messages (e.g., connection status, errors).
+         * @param {string} text - The notice text to display.
+         */
+        function logNotice(text) {
+            const p = document.createElement('p');
+            p.classList.add('notice');
+            p.textContent = text;
+            transcriptionsDiv.appendChild(p);
+            transcriptionsDiv.scrollTop = transcriptionsDiv.scrollHeight;
+        }
+
+        /**
+         * Converts Float32 audio data to Int16 PCM format.
+         * @param {Float32Array} buffer - The audio buffer in Float32 format.
+         * @returns {Int16Array} - The audio buffer in Int16 format.
+         */
+        function convertFloat32ToInt16(buffer) {
+            let l = buffer.length;
+            const buf = new Int16Array(l);
+            while (l--) {
+                buf[l] = Math.max(-1, Math.min(1, buffer[l])) * 0x7FFF; // Clamp to [-1, 1] before scaling to 16-bit
+            }
+            return buf;
+        }
+
+        /**
+         * Extracts JSON objects from a concatenated string.
+         * @param {string} buffer - The concatenated JSON string.
+         * @returns {Array} - An array of parsed JSON objects.
+         */
+        function extractJSONObjects(buffer) {
+            const objects = [];
+            let braceStack = 0;
+            let inString = false;
+            let escape = false;
+            let lastSplit = 0;
+
+            for (let i = 0; i < buffer.length; i++) {
+                const char = buffer[i];
+
+                if (char === '"' && !escape) {
+                    inString = !inString;
+                }
+
+                if (!inString) {
+                    if (char === '{') {
+                        braceStack++;
+                    } else if (char === '}') {
+                        braceStack--;
+                        if (braceStack === 0) {
+                            const jsonString = buffer.slice(lastSplit, i + 1);
+                            try {
+                                const jsonObj = JSON.parse(jsonString);
+                                objects.push(jsonObj);
+                            } catch (e) {
+                                console.error('Failed to parse JSON:', e);
+                            }
+                            lastSplit = i + 1;
+                        }
+                    }
+                }
+
+                // Handle escape characters
+                if (char === '\\' && !escape) {
+                    escape = true;
+                } else {
+                    escape = false;
+                }
+            }
+
+            // Return any remaining buffer that wasn't parsed
+            incomingBuffer = buffer.slice(lastSplit);
+            return objects;
+        }
+
+        // WebSocket Handlers
+
+        /**
+         * Sets up the WebSocket connection and defines event handlers.
+         */
+        function setupWebSocket() {
+            ws = new WebSocket(WS_ENDPOINT);
+            ws.binaryType = 'arraybuffer';
+
+            ws.onopen = () => {
+                console.log('WebSocket connection opened.');
+                logNotice("WebSocket connection established.");
+            };
+
+            ws.onmessage = (event) => {
+                let messageData = '';
+
+                if (typeof event.data === 'string') {
+                    messageData = event.data;
+                } else if (event.data instanceof ArrayBuffer) {
+                    const decoder = new TextDecoder('utf-8');
+                    messageData = decoder.decode(event.data);
+                } else {
+                    console.warn('Unsupported message format:', event.data);
+                    return;
+                }
+
+                // Append incoming data to buffer
+                incomingBuffer += messageData;
+
+                // Extract JSON objects
+                const jsonObjects = extractJSONObjects(incomingBuffer);
+
+                // Process each JSON object
+                jsonObjects.forEach(obj => {
+                    if (obj.task === "transcribe" && Array.isArray(obj.words)) {
+                        logTranscription(obj.words);
+                    }
+                });
+            };
+
+            ws.onclose = (event) => {
+                console.log('WebSocket connection closed:', event);
+                logNotice("WebSocket connection closed.");
+                ws = null;
+
+                if (isRecording && shouldReconnect) {
+                    logNotice("Attempting to reconnect...");
+                    setTimeout(() => {
+                        setupWebSocket();
+                    }, reconnectInterval);
+                } else if (isRecording) {
+                    logNotice("Transcription session ended.");
+                    stopRecording(true); // true indicates server-initiated stop
+                }
+            };
+
+            ws.onerror = (error) => {
+                console.error('WebSocket error:', error);
+                logNotice("WebSocket encountered an error.");
+            };
+        }
+
+        // Voice Activity Detection Setup
+
+        /**
+         * Initializes the Voice Activity Detector (VAD) using Silero VAD.
+         */
+        async function initializeVAD(stream) {
+            try {
+                vadInstance = await vad.MicVAD.new({
+                    stream: stream, // Pass the existing MediaStream to avoid multiple microphone accesses
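+                    // Note: passing an existing MediaStream assumes the loaded vad-web version
+                    // supports the "stream" option; otherwise MicVAD opens its own microphone stream.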
+                    onSpeechStart: () => {
+                        console.log("Speech start detected");
+                        logNotice("Speech detected...");
+                    },
+                    onSpeechEnd: (audio) => {
+                        console.log("Speech end detected");
+                        logNotice("Sending speech segment to server...");
+
+                        // Convert Float32Array to Int16Array
+                        const int16Audio = convertFloat32ToInt16(audio);
+
+                        // Send the audio buffer via WebSocket
+                        if (ws && ws.readyState === WebSocket.OPEN) {
+                            ws.send(int16Audio.buffer);
+                        } else {
+                            console.warn('WebSocket is not open. Cannot send audio.');
+                            logNotice("WebSocket is not open. Audio segment not sent.");
+                        }
+                    }
+                });
+            } catch (error) {
+                console.error('Error initializing VAD:', error);
+                logNotice("Error initializing Voice Activity Detection.");
+            }
+        }
+
+        // Volume Meter Setup
+
+        /**
+         * Sets up the volume meter using the Web Audio API.
+         */
+        async function setupVolumeMeter(stream) {
+            try {
+                // Initialize AudioContext
+                audioContext = new (window.AudioContext || window.webkitAudioContext)();
+
+                // Create MediaStreamSource from the existing stream
+                microphoneStream = audioContext.createMediaStreamSource(stream);
+
+                // Create AnalyserNode
+                analyser = audioContext.createAnalyser();
+                analyser.fftSize = 512;
+                const bufferLength = analyser.frequencyBinCount;
+                dataArray = new Uint8Array(bufferLength);
+
+                // Connect microphone to analyser
+                microphoneStream.connect(analyser);
+
+                // Start visualizing
+                visualize();
+            } catch (error) {
+                console.error('Error setting up volume meter:', error);
+                logNotice("Error setting up volume meter.");
+            }
+        }
+
+        /**
+         * Visualizes the volume level on the volume meter.
+         */
+        function visualize() {
+            const updateVolume = () => {
+                analyser.getByteFrequencyData(dataArray);
+                let sum = 0;
+                for (let i = 0; i < dataArray.length; i++) {
+                    sum += dataArray[i];
+                }
+                const average = sum / dataArray.length;
+                const volume = average / 255; // Normalize to [0,1]
+
+                // Update the volume bar width
+                volumeBar.style.width = `${volume * 100}%`;
+
+                // Change color based on volume level (green to red)
+                const hue = (1 - volume) * 120; // 120 (green) to 0 (red)
+                volumeBar.style.backgroundColor = `hsl(${hue}, 100%, 50%)`;
+
+                animationId = requestAnimationFrame(updateVolume);
+            };
+
+            updateVolume();
+        }
+
+        /**
+         * Stops the volume meter visualization.
+         */
+        function stopVolumeMeter() {
+            if (animationId) {
+                cancelAnimationFrame(animationId);
+                animationId = null;
+            }
+            if (volumeBar) {
+                volumeBar.style.width = '0%';
+                volumeBar.style.backgroundColor = '#28a745'; // Reset to green
+            }
+            if (analyser) {
+                analyser.disconnect();
+                analyser = null;
+            }
+            if (microphoneStream) {
+                microphoneStream.disconnect();
+                microphoneStream = null;
+            }
+            if (audioContext) {
+                audioContext.close();
+                audioContext = null;
+            }
+        }
+
+        // Recording Control Functions
+
+        /**
+         * Starts the Voice Activity Detection, Volume Meter, and WebSocket connection.
+         */
+        async function startRecording() {
+            try {
+                // Request microphone access once
+                const stream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
+
+                // Set up Volume Meter
+                await setupVolumeMeter(stream);
+
+                // Initialize VAD with the same stream
+                await initializeVAD(stream);
+
+                // Set up WebSocket
+                shouldReconnect = true; // Enable reconnection attempts
+                setupWebSocket();
+
+                // Start VAD
+                if (vadInstance) {
+                    vadInstance.start();
+                }
+
+                // Update UI
+                isRecording = true;
+                recordBtn.textContent = 'Stop Recording';
+                recordBtn.classList.add('recording');
+                logNotice("Recording started. Speak into your microphone.");
+            } catch (error) {
+                console.error('Error starting recording:', error);
+                logNotice("Error starting recording. Please try again.");
+            }
+        }
+
+        /**
+         * Stops the Voice Activity Detection, Volume Meter, and cleans up resources.
+         * @param {boolean} serverInitiated - Indicates if the stop was triggered by the server.
+         */
+        function stopRecording(serverInitiated = false) {
+            if (!isRecording) return;
+
+            // Stop VAD
+            if (vadInstance) {
+                vadInstance.pause();
+                vadInstance = null;
+            }
+
+            // Stop Volume Meter
+            stopVolumeMeter();
+
+            // Prevent reconnection if stopping manually
+            if (!serverInitiated) {
+                shouldReconnect = false;
+            }
+
+            // Close WebSocket if not server-initiated
+            if (!serverInitiated && ws && ws.readyState === WebSocket.OPEN) {
+                ws.send(JSON.stringify({ action: "terminate" }));
+                logNotice("Termination signal sent to server.");
+            }
+
+            // Close WebSocket
+            if (ws) {
+                ws.close();
+                ws = null;
+            }
+
+            // Reset recording state
+            isRecording = false;
+            recordBtn.textContent = 'Start Recording';
+            recordBtn.classList.remove('recording');
+            logNotice("Recording stopped.");
+        }
+
+        // Button Event Listener
+
+        /**
+         * Toggles recording state when the record button is clicked.
+         */
+        recordBtn.addEventListener('click', () => {
+            if (!isRecording) {
+                startRecording().catch(error => {
+                    console.error('Error starting recording:', error);
+                    logNotice("Error starting recording. Please try again.");
+                });
+            } else {
+                stopRecording();
+            }
+        });
+    </script>
+</body>
+</html>
 
client_with_openAI.html (added)
+++ client_with_openAI.html
@@ -0,0 +1,643 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Real-Time Audio Transcription with VAD and Volume Meter</title>
+    <style>
+        /* Page styles */
+        body {
+            --indicator-color: black;
+            background: radial-gradient(black 55%, var(--indicator-color));
+            min-height: 100vh;
+            color: white;
+            margin: 0;
+            font-family: Arial, sans-serif;
+        }
+        h1 {
+            text-align: center;
+            margin-top: 20px;
+        }
+        #controls {
+            text-align: center;
+            margin: 20px;
+        }
+        #toggle_vad_button {
+            padding: 10px 20px;
+            font-size: 16px;
+            border: none;
+            border-radius: 5px;
+            background-color: #28a745; /* Green */
+            color: white;
+            cursor: pointer;
+            transition: background-color 0.3s ease;
+        }
+        #toggle_vad_button.recording {
+            background-color: #dc3545; /* Red */
+        }
+        #indicator {
+            text-align: center;
+            margin: 10px;
+            font-size: 18px;
+        }
+        #playlist {
+            max-width: 800px;
+            margin: 0 auto;
+            padding: 20px;
+            background-color: rgba(255, 255, 255, 0.1);
+            border-radius: 8px;
+            height: 400px;
+            overflow-y: scroll;
+            list-style: none;
+            padding-left: 0;
+        }
+        #playlist li {
+            margin-bottom: 10px;
+            opacity: 0;
+            animation: fadeIn 1s forwards;
+        }
+        #playlist li.newItem {
+            border-left: 4px solid #28a745;
+            padding-left: 10px;
+        }
+        .transcription {
+            color: white;
+            font-size: 16px;
+        }
+        .notice {
+            color: #dc3545; /* Red */
+            font-style: italic;
+        }
+        @keyframes fadeIn {
+            to {
+                opacity: 1;
+            }
+        }
+    </style>
+</head>
+<body>
+    <h1>Real-Time Audio Transcription with VAD and Volume Meter</h1>
+    <div id="controls">
+        <button id="toggle_vad_button" onclick="window.toggleVAD()" disabled>START VAD</button>
+    </div>
+    <div id="indicator">VAD is <span style="color: red">LOADING</span></div>
+    <ol id="playlist" reversed></ol>
+
+    <!-- Include ONNX Runtime Web -->
+    <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js"></script>
+    <!-- Include VAD-Web -->
+    <script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js"></script>
+
+    <script type="module">
+        import { interpolateInferno } from "https://cdn.skypack.dev/d3-scale-chromatic@3";
+
+        // Elements
+        const recordBtn = document.getElementById('toggle_vad_button');
+        const transcriptionsDiv = document.getElementById('playlist');
+        const indicator = document.getElementById('indicator');
+
+        // State Variables
+        let isRecording = false;
+        let vadInstance = null;
+        let audioContext = null;
+        let analyser = null;
+        let microphoneStream = null;
+        let dataArray = null;
+        let animationId = null;
+        let isSpeaking = false;
+        let audioBuffer = [];
+        let sendAudioInterval = null;
+        const SEND_INTERVAL_MS = 1000; // 1 second
+        let ws = null;
+        let reconnectInterval = 3000; // 3 seconds
+        let shouldReconnect = false; // Flag to control reconnection attempts
+        let incomingBuffer = '';
+        let fullTranscription = ''; // To accumulate full transcription
+
+        // Configuration
+        const WS_ENDPOINT = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko"; // Whisper transcription WebSocket endpoint; adjust host, port, and language as needed
+        const BACKEND_UPLOAD_URL = "http://localhost:3000/upload-audio"; // Replace with your backend URL
+        const BACKEND_LLM_URL = "http://localhost:3000/process-transcription"; // Replace with your backend URL
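+        // These two endpoints are assumed to be provided by a separate backend:
+        // BACKEND_UPLOAD_URL receives raw WAV audio (see sendAudioToServer), and
+        // BACKEND_LLM_URL receives { transcription } JSON and returns { llmResponse }
+        // in an OpenAI chat-completion shape (see displayLLMResponse).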
+
+        // Utility Functions
+
+        /**
+         * Logs transcription text with colored words based on probability.
+         * @param {Array} words - Array of word objects with 'word' and 'probability'.
+         */
+        function logTranscription(words) {
+            const transcriptionLine = document.createElement('div');
+            transcriptionLine.classList.add('transcription');
+
+            words.forEach(wordObj => {
+                const span = document.createElement('span');
+                span.textContent = wordObj.word + ' '; // Add space after each word
+
+                // Calculate hue: 0 (red) to 240 (blue)
+                const hue = wordObj.probability * 240;
+                span.style.color = `hsl(${hue}, 100%, 50%)`;
+
+                transcriptionLine.appendChild(span);
+                fullTranscription += wordObj.word + ' ';
+            });
+
+            transcriptionsDiv.prepend(transcriptionLine);
+            transcriptionsDiv.scrollTop = 0; // New lines are prepended, so keep the view at the top
+        }
+
+        /**
+         * Logs notice messages (e.g., connection status, errors).
+         * @param {string} text - The notice text to display.
+         */
+        function logNotice(text) {
+            const p = document.createElement('p');
+            p.classList.add('notice');
+            p.textContent = text;
+            transcriptionsDiv.prepend(p);
+            transcriptionsDiv.scrollTop = 0; // New lines are prepended, so keep the view at the top
+        }
+
+        /**
+         * Converts Float32 audio data to Int16 PCM format.
+         * @param {Float32Array} buffer - The audio buffer in Float32 format.
+         * @returns {Int16Array} - The audio buffer in Int16 format.
+         */
+        function convertFloat32ToInt16(buffer) {
+            let l = buffer.length;
+            const buf = new Int16Array(l);
+            while (l--) {
+                buf[l] = Math.max(-1, Math.min(1, buffer[l])) * 0x7FFF; // Clamp to [-1, 1] before scaling to 16-bit
+            }
+            return buf;
+        }
+
+        /**
+         * Extracts JSON objects from a concatenated string.
+         * @param {string} buffer - The concatenated JSON string.
+         * @returns {Array} - An array of parsed JSON objects.
+         */
+        function extractJSONObjects(buffer) {
+            const objects = [];
+            let braceStack = 0;
+            let inString = false;
+            let escape = false;
+            let lastSplit = 0;
+
+            for (let i = 0; i < buffer.length; i++) {
+                const char = buffer[i];
+
+                if (char === '"' && !escape) {
+                    inString = !inString;
+                }
+
+                if (!inString) {
+                    if (char === '{') {
+                        braceStack++;
+                    } else if (char === '}') {
+                        braceStack--;
+                        if (braceStack === 0) {
+                            const jsonString = buffer.slice(lastSplit, i + 1);
+                            try {
+                                const jsonObj = JSON.parse(jsonString);
+                                objects.push(jsonObj);
+                            } catch (e) {
+                                console.error('Failed to parse JSON:', e);
+                            }
+                            lastSplit = i + 1;
+                        }
+                    }
+                }
+
+                // Handle escape characters
+                if (char === '\\' && !escape) {
+                    escape = true;
+                } else {
+                    escape = false;
+                }
+            }
+
+            // Return any remaining buffer that wasn't parsed
+            incomingBuffer = buffer.slice(lastSplit);
+            return objects;
+        }
+
+        // WebSocket Handlers
+
+        /**
+         * Sets up the WebSocket connection and defines event handlers.
+         */
+        function setupWebSocket() {
+            ws = new WebSocket(WS_ENDPOINT);
+            ws.binaryType = 'arraybuffer';
+
+            ws.onopen = () => {
+                console.log('WebSocket connection opened.');
+                logNotice("WebSocket connection established.");
+            };
+
+            ws.onmessage = (event) => {
+                let messageData = '';
+
+                if (typeof event.data === 'string') {
+                    messageData = event.data;
+                } else if (event.data instanceof ArrayBuffer) {
+                    const decoder = new TextDecoder('utf-8');
+                    messageData = decoder.decode(event.data);
+                } else {
+                    console.warn('Unsupported message format:', event.data);
+                    return;
+                }
+
+                // Append incoming data to buffer
+                incomingBuffer += messageData;
+
+                // Extract JSON objects
+                const jsonObjects = extractJSONObjects(incomingBuffer);
+
+                // Process each JSON object
+                jsonObjects.forEach(obj => {
+                    if (obj.task === "transcribe" && Array.isArray(obj.words)) {
+                        logTranscription(obj.words);
+                    }
+                });
+            };
+
+            ws.onclose = (event) => {
+                console.log('WebSocket connection closed:', event);
+                logNotice("WebSocket connection closed.");
+                ws = null;
+
+                if (isRecording && shouldReconnect) {
+                    logNotice("Attempting to reconnect...");
+                    setTimeout(() => {
+                        setupWebSocket();
+                    }, reconnectInterval);
+                } else if (isRecording) {
+                    logNotice("Transcription session ended.");
+                    stopRecording(true); // true indicates server-initiated stop
+                }
+            };
+
+            ws.onerror = (error) => {
+                console.error('WebSocket error:', error);
+                logNotice("WebSocket encountered an error.");
+            };
+        }
+
+        // Voice Activity Detection Setup
+
+        /**
+         * Initializes the Voice Activity Detector (VAD) using MicVAD.
+         */
+        async function initializeVAD(stream) {
+            try {
+                vadInstance = await vad.MicVAD.new({
+                    stream: stream, // Pass the existing MediaStream to avoid multiple microphone accesses
+                    onSpeechStart: () => {
+                        console.log("Speech start detected");
+                        logNotice("Speech detected...");
+
+                        isSpeaking = true;
+                        audioBuffer = []; // Reset buffer
+
+                        // Start timer to send audio every second
+                        sendAudioInterval = setInterval(sendAudio, SEND_INTERVAL_MS);
+                    },
+                    onSpeechEnd: (audio) => {
+                        console.log("Speech end detected");
+                        logNotice("Sending final speech segment to server...");
+
+                        isSpeaking = false;
+
+                        // Send any remaining audio
+                        sendAudio();
+
+                        // Stop the timer
+                        if (sendAudioInterval) {
+                            clearInterval(sendAudioInterval);
+                            sendAudioInterval = null;
+                        }
+
+                        // Optionally, send the final `audio` provided by the callback
+                        // depending on your application's needs
+                        // Example:
+                        // sendFinalAudio(audio);
+                    },
+                    onFrameProcessed: (probabilities, frame) => {
+                        const indicatorColor = interpolateInferno(probabilities.isSpeech / 2);
+                        document.body.style.setProperty("--indicator-color", indicatorColor);
+
+                        if (isSpeaking) {
+                            audioBuffer.push(frame);
+                        }
+                    },
+                });
+
+                window.vadInstance = vadInstance;
+
+                // Start VAD listening
+                vadInstance.start();
+                isRecording = true;
+                recordBtn.textContent = 'STOP VAD';
+                recordBtn.classList.add('recording');
+                indicator.innerHTML = 'VAD is <span style="color: #28a745">ON</span>';
+                logNotice("Recording started. Speak into your microphone.");
+            } catch (error) {
+                console.error('Error initializing VAD:', error);
+                logNotice("Error initializing Voice Activity Detection.");
+            }
+        }
+
+        // Volume Meter Setup (Optional, based on your requirements)
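+        // Carried over from client.html: these helpers assume a #volumeBar element, which
+        // this page does not include, and the calls to them in startRecording()/stopRecording()
+        // are commented out below.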
+
+        /**
+         * Sets up the volume meter using the Web Audio API.
+         */
+        async function setupVolumeMeter(stream) {
+            try {
+                // Initialize AudioContext
+                audioContext = new (window.AudioContext || window.webkitAudioContext)();
+
+                // Create MediaStreamSource from the existing stream
+                microphoneStream = audioContext.createMediaStreamSource(stream);
+
+                // Create AnalyserNode
+                analyser = audioContext.createAnalyser();
+                analyser.fftSize = 512;
+                const bufferLength = analyser.frequencyBinCount;
+                dataArray = new Uint8Array(bufferLength);
+
+                // Connect microphone to analyser
+                microphoneStream.connect(analyser);
+
+                // Start visualizing
+                visualize();
+            } catch (error) {
+                console.error('Error setting up volume meter:', error);
+                logNotice("Error setting up volume meter.");
+            }
+        }
+
+        /**
+         * Visualizes the volume level on the volume meter.
+         */
+        function visualize() {
+            const updateVolume = () => {
+                analyser.getByteFrequencyData(dataArray);
+                let sum = 0;
+                for (let i = 0; i < dataArray.length; i++) {
+                    sum += dataArray[i];
+                }
+                const average = sum / dataArray.length;
+                const volume = average / 255; // Normalize to [0,1]
+
+                // Update the volume bar width
+                volumeBar.style.width = `${volume * 100}%`;
+
+                // Change color based on volume level (green to red)
+                const hue = (1 - volume) * 120; // 120 (green) to 0 (red)
+                volumeBar.style.backgroundColor = `hsl(${hue}, 100%, 50%)`;
+
+                animationId = requestAnimationFrame(updateVolume);
+            };
+
+            updateVolume();
+        }
+
+        /**
+         * Stops the volume meter visualization.
+         */
+        function stopVolumeMeter() {
+            if (animationId) {
+                cancelAnimationFrame(animationId);
+                animationId = null;
+            }
+            if (volumeBar) {
+                volumeBar.style.width = '0%';
+                volumeBar.style.backgroundColor = '#28a745'; // Reset to green
+            }
+            if (analyser) {
+                analyser.disconnect();
+                analyser = null;
+            }
+            if (microphoneStream) {
+                microphoneStream.disconnect();
+                microphoneStream = null;
+            }
+            if (audioContext) {
+                audioContext.close();
+                audioContext = null;
+            }
+        }
+
+        // LLM Integration
+
+        /**
+         * Sends the transcription to the backend server for LLM processing.
+         * @param {string} transcription - The transcribed text.
+         */
+        async function sendTranscriptionToLLM(transcription) {
+            try {
+                const response = await fetch(BACKEND_LLM_URL, { // Adjust the URL if your server is hosted elsewhere
+                    method: 'POST',
+                    headers: {
+                        'Content-Type': 'application/json',
+                    },
+                    body: JSON.stringify({ transcription }),
+                });
+
+                if (!response.ok) {
+                    throw new Error(`Server error: ${response.status}`);
+                }
+
+                const data = await response.json();
+                if (data.llmResponse) {
+                    displayLLMResponse(data.llmResponse);
+                }
+            } catch (error) {
+                console.error('Error sending transcription to LLM:', error);
+                logNotice("Error processing transcription with LLM.");
+            }
+        }
+
+        /**
+         * Displays the LLM's response in the transcriptions div.
+         * @param {object} llmResponse - The response from the LLM.
+         */
+        function displayLLMResponse(llmResponse) {
+            // Adjust based on your LLM's response structure
+            const responseText = llmResponse.choices && llmResponse.choices[0] && llmResponse.choices[0].message && llmResponse.choices[0].message.content
+                ? llmResponse.choices[0].message.content
+                : 'No response from LLM.';
+
+            const responseLine = document.createElement('div');
+            responseLine.classList.add('transcription');
+
+            const span = document.createElement('span');
+            span.textContent = `LLM Response: ${responseText}`;
+            span.style.color = `hsl(200, 100%, 50%)`; // Example color
+
+            responseLine.appendChild(span);
+            transcriptionsDiv.prepend(responseLine);
+            transcriptionsDiv.scrollTop = 0; // New lines are prepended, so keep the view at the top
+        }
+
+        /**
+         * Sends the accumulated audio to the server.
+         */
+        async function sendAudioToServer(audioBuffer) {
+            try {
+                const response = await fetch(BACKEND_UPLOAD_URL, { // Replace with your backend URL
+                    method: 'POST',
+                    headers: {
+                        'Content-Type': 'application/octet-stream', // Adjust based on server expectations
+                    },
+                    body: audioBuffer,
+                });
+
+                if (!response.ok) {
+                    throw new Error(`Server responded with status ${response.status}`);
+                }
+
+                console.log('Audio sent successfully');
+            } catch (error) {
+                console.error('Error sending audio:', error);
+                logNotice("Error sending audio to server.");
+            }
+        }
+
+        /**
+         * Adds an audio element to the playlist.
+         * @param {string} audioUrl - The data URL of the audio.
+         * @returns {HTMLElement} - The created list item element.
+         */
+        function addAudio(audioUrl) {
+            const entry = document.createElement("li");
+            const audio = document.createElement("audio");
+            audio.controls = true;
+            audio.src = audioUrl;
+            entry.classList.add("newItem");
+            entry.appendChild(audio);
+            return entry;
+        }
+
+        // Recording Control Functions
+
+        /**
+         * Starts the Voice Activity Detection, Volume Meter, and WebSocket connection.
+         */
+        async function startRecording() {
+            try {
+                // Request microphone access once
+                const stream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
+
+                // Optionally, set up Volume Meter
+                // await setupVolumeMeter(stream);
+
+                // Initialize VAD with the same stream
+                await initializeVAD(stream);
+
+                // Set up WebSocket
+                shouldReconnect = true; // Enable reconnection attempts (mirrors client.html)
+                setupWebSocket();
+            } catch (error) {
+                console.error('Error starting recording:', error);
+                logNotice("Error starting recording. Please try again.");
+            }
+        }
+
+        /**
+         * Stops the Voice Activity Detection, Volume Meter, and cleans up resources.
+         * @param {boolean} serverInitiated - Indicates if the stop was triggered by the server.
+         */
+        function stopRecording(serverInitiated = false) {
+            if (!isRecording) return;
+
+            // Stop VAD
+            if (vadInstance) {
+                if (typeof vadInstance.pause === 'function') {
+                    vadInstance.pause();
+                } else {
+                    console.warn('VAD instance does not have a pause method.');
+                }
+                vadInstance = null;
+            }
+
+            // Optionally, stop Volume Meter
+            // stopVolumeMeter();
+
+            // Prevent reconnection if stopping manually
+            if (!serverInitiated) {
+                shouldReconnect = false;
+            }
+
+            // Close WebSocket if not server-initiated
+            if (!serverInitiated && ws && ws.readyState === WebSocket.OPEN) {
+                ws.send(JSON.stringify({ action: "terminate" }));
+                logNotice("Termination signal sent to server.");
+            }
+
+            // Close WebSocket
+            if (ws) {
+                ws.close();
+                ws = null;
+            }
+
+            // Reset recording state
+            isRecording = false;
+            recordBtn.textContent = 'START VAD';
+            recordBtn.classList.remove('recording');
+            indicator.innerHTML = 'VAD is <span style="color: red">OFF</span>';
+            logNotice("Recording stopped.");
+
+            // Send the full transcription to the LLM
+            if (fullTranscription.trim().length > 0) {
+                sendTranscriptionToLLM(fullTranscription.trim());
+                fullTranscription = ''; // Reset after sending
+            }
+        }
+
+        /**
+         * Sends the accumulated audio to the server periodically.
+         */
+        async function sendAudio() {
+            if (audioBuffer.length === 0) return;
+
+            // Concatenate all frames into a single Float32Array
+            const totalLength = audioBuffer.reduce((sum, frame) => sum + frame.length, 0);
+            const concatenated = new Float32Array(totalLength);
+            let offset = 0;
+            audioBuffer.forEach(frame => {
+                concatenated.set(frame, offset);
+                offset += frame.length;
+            });
+
+            // Encode to WAV format
+            const wavBuffer = vad.utils.encodeWAV(concatenated);
+
+            // Send the audio to the server
+            await sendAudioToServer(wavBuffer);
+
+            // Optionally, add the audio to the UI
+            const base64 = vad.utils.arrayBufferToBase64(wavBuffer);
+            const audioUrl = `data:audio/wav;base64,${base64}`;
+            const audioElement = addAudio(audioUrl);
+            transcriptionsDiv.prepend(audioElement);
+
+            // Reset the buffer
+            audioBuffer = [];
+        }
+
+        // Button Event Listener
+
+        /**
+         * Toggles recording state when the record button is clicked.
+         */
+        window.toggleVAD = () => {
+            console.log("ran toggle vad");
+            if (!isRecording) {
+                startRecording().catch(error => {
+                    console.error('Error starting recording:', error);
+                    logNotice("Error starting recording. Please try again.");
+                });
+            } else {
+                stopRecording();
+            }
+        };
+    </script>
+</body>
+</html>
 
get_microphone.py (added)
+++ get_microphone.py
@@ -0,0 +1,96 @@
+import pyaudio
+import wave
+
+pa = pyaudio.PyAudio()
+
+def get_microphone():
+    """
+    Shows a CLI prompt listing every available input device, waits for the user's
+    choice, and returns the values needed to open that microphone with PyAudio.
+    A dict is returned so callers do not have to remember positional ordering.
+
+    :return: {
+        "device_num": device_num,
+        "name": device_name,
+        "microphone_channel_num": microphone_channel_num,
+        "microphone_sample_rate": microphone_sample_rate,
+    }
+    """
+
+    def get_valid_integer(prompt, min_value, max_value):
+        """
+        Prompt the user for an integer input within a specified range.
+        Sanitizes non-integer input and values outside the range.
+
+        :param prompt: The input prompt message
+        :param min_value: Minimum acceptable integer value (inclusive)
+        :param max_value: Maximum acceptable integer value (inclusive)
+        :return: A valid integer within the range [min_value, max_value]
+        """
+        while True:
+            try:
+                user_input = input(prompt)
+
+                value = int(user_input)
+
+                if min_value <= value <= max_value:
+                    return value
+                else:
+                    print(f"Error: Please enter an integer between {min_value} and {max_value}.")
+            except ValueError:
+                print("Error: Invalid input. Please enter an integer.")
+
+    # List all devices to see their indexes
+    audio_list = []
+    for i in range(pa.get_device_count()):
+        dev = pa.get_device_info_by_index(i)
+        print(
+            i,
+            dev['name'],
+            dev['maxInputChannels'],
+            dev['defaultSampleRate']
+        )
+        audio_list.append(dev)
+
+    mesg = "Which device is the microphone you are using? "
+    device_num = get_valid_integer(mesg, 0, len(audio_list) - 1)
+
+    microphone_channel_num = audio_list[device_num]['maxInputChannels']
+    # must pass int type, pyaudio only accepts int type
+    microphone_sample_rate = int(audio_list[device_num]['defaultSampleRate'])
+    try:
+        stream = pa.open(
+            format=pyaudio.paInt16,
+            channels=microphone_channel_num,
+            rate=microphone_sample_rate,
+            input=True,
+            frames_per_buffer=1024,
+            input_device_index=device_num,
+        )
+        # Save to a valid WAV file
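+        # Quick smoke test: capture about one second from the chosen device and write it
+        # to test.wav so the user can verify the microphone actually works.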
+
+        frames = []
+        for _ in range(0, int(microphone_sample_rate / 1024 * 1)):  # Record for about 1 second
+            data = stream.read(1024)
+            frames.append(data)
+        print("Recording finished.")
+
+        output_filename = "test.wav"
+        with wave.open(output_filename, "wb") as wf:
+            wf.setnchannels(microphone_channel_num)
+            wf.setsampwidth(pa.get_sample_size(pyaudio.paInt16))
+            wf.setframerate(microphone_sample_rate)
+            wf.writeframes(b"".join(frames))
+        print("Recorded some data:", len(data))
+        print("Looks Good To Me.")
+        stream.close()
+        pa.terminate()
+    except Exception as e:
+        print("Something went wrong. Can not open the audio device")
+        print(e)
+
+    return {
+        "device_num": device_num,
+        "microphone_channel_num": microphone_channel_num,
+        "microphone_sample_rate": microphone_sample_rate,
+    }
+
+if __name__ == "__main__":
+    print(get_microphone())
\ No newline at end of file
 
websocket_client.py (added)
+++ websocket_client.py
@@ -0,0 +1,295 @@
+import threading
+import subprocess
+import platform
+import websocket
+import json
+
+from tkinter import Tk, Button, Label, Text, Scrollbar, END
+
+# Import the microphone selection helper
+from get_microphone import get_microphone
+
+# 1) Server configuration
+SERVER_URL = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko"
+
+# 2) Audio configuration
+TARGET_RATE = 16000  # Resample to 16 kHz for the server
+CHANNELS = 1  # Mono
+FORMAT = 's16le'  # 16-bit PCM little endian
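+# Whisper-based transcription servers generally expect 16 kHz mono 16-bit PCM,
+# which is why the microphone input is resampled before streaming.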
+
+
+# 3) FFmpeg configuration
+def get_ffmpeg_command(device_info):
+    """
+    Constructs the FFmpeg command based on the operating system and selected device.
+
+    :param device_info: Dictionary containing device information from get_microphone()
+    :return: List of FFmpeg command arguments
+    """
+    os_name = platform.system()
+
+    if os_name == "Windows":
+        # For Windows, FFmpeg uses 'dshow' as the input device.
+        # device_info["name"] comes from get_microphone() (the PyAudio device name); if
+        # FFmpeg does not recognize it, list the dshow device names with:
+        #   ffmpeg -list_devices true -f dshow -i dummy
+        device_name = device_info.get("name", "default")
+        # Example device name: "Microphone (Realtek High Definition Audio)"
+        cmd = [
+            "ffmpeg",
+            "-f", "dshow",
+            "-i", f"audio={device_name}",
+            "-ar", str(TARGET_RATE),
+            "-ac", str(CHANNELS),
+            "-f", FORMAT,
+            "pipe:1"
+        ]
+    elif os_name == "Darwin":
+        # For macOS, FFmpeg uses 'avfoundation'.
+        # get_microphone() does not return an avfoundation index, so the default audio
+        # device (":0") is used unless device_info supplies a 'device_index'.
+        device_index = device_info.get("device_index", "0")
+        # Example device index: "0" for default
+        cmd = [
+            "ffmpeg",
+            "-f", "avfoundation",
+            "-i", f":{device_index}",
+            "-ar", str(TARGET_RATE),
+            "-ac", str(CHANNELS),
+            "-f", FORMAT,
+            "pipe:1"
+        ]
+    elif os_name == "Linux":
+        # For Linux, FFmpeg uses 'alsa'.
+        # device_info["name"] comes from get_microphone() (the PyAudio device name); an
+        # ALSA device string such as "default" or "hw:1,0" may be needed instead.
+        device_name = device_info.get("name", "default")
+        cmd = [
+            "ffmpeg",
+            "-f", "alsa",
+            "-i", device_name,
+            "-ar", str(TARGET_RATE),
+            "-ac", str(CHANNELS),
+            "-f", FORMAT,
+            "pipe:1"
+        ]
+    else:
+        raise ValueError(f"Unsupported OS: {os_name}")
+
+    return cmd
+
+
+class SpeechToTextClient:
+    """
+    A client that:
+    - Uses FFmpeg to capture and process audio
+    - Initializes a WebSocket connection
+    - Streams raw 16-bit PCM over the WebSocket
+    - Displays transcriptions from the server in the GUI
+    """
+
+    def __init__(self, gui):
+        """
+        :param gui: An instance of the SpeechToTextGUI class for UI callbacks
+        """
+        self.gui = gui
+        self.ws = None
+        self.ffmpeg_process = None
+        self.streaming_thread = None
+        self.running = False
+
+        # Ask user to pick a device
+        mic_info = get_microphone()  # Returns device_num, name, microphone_channel_num, microphone_sample_rate
+        self.device_info = mic_info
+
+        # Prepare the FFmpeg command
+        self.ffmpeg_cmd = get_ffmpeg_command(self.device_info)
+
+    def start_recording(self):
+        """Starts FFmpeg, initializes the WebSocket connection, and begins streaming audio."""
+        if self.running:
+            print("Already recording.")
+            return
+
+        self.running = True
+
+        # 1) Start FFmpeg subprocess
+        try:
+            self.ffmpeg_process = subprocess.Popen(
+                self.ffmpeg_cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.DEVNULL,  # Suppress FFmpeg stderr; remove if debugging
+                bufsize=10 ** 8
+            )
+            print("FFmpeg started.")
+        except Exception as e:
+            print(f"Failed to start FFmpeg: {e}")
+            self.running = False
+            return
+
+        # 2) Initialize the WebSocket connection
+        self.ws = websocket.WebSocketApp(
+            SERVER_URL,
+            on_message=self.on_message,
+            on_error=self.on_error,
+            on_close=self.on_close
+        )
+        # Run WebSocket in a background thread
+        ws_thread = threading.Thread(target=self.ws.run_forever, daemon=True)
+        ws_thread.start()
+        print("WebSocket connection initiated.")
+
+        # 3) Start audio streaming loop in a separate thread
+        self.streaming_thread = threading.Thread(target=self.audio_stream, daemon=True)
+        self.streaming_thread.start()
+
+        self.gui.update_status("Recording started...")
+
+    def stop_recording(self):
+        """Stops audio streaming, terminates FFmpeg, and closes the WebSocket."""
+        if not self.running:
+            print("Not currently recording.")
+            return
+
+        self.running = False
+
+        # 1) Terminate FFmpeg subprocess
+        if self.ffmpeg_process:
+            self.ffmpeg_process.terminate()
+            self.ffmpeg_process = None
+            print("FFmpeg terminated.")
+
+        # 2) Close WebSocket connection
+        if self.ws:
+            self.ws.close()
+            self.ws = None
+            print("WebSocket connection closed.")
+
+        self.gui.update_status("Recording stopped...")
+
+    def audio_stream(self):
+        """
+        Continuously reads audio data from FFmpeg's stdout and sends it over WebSocket.
+        """
+        try:
+            while self.running:
+                # Read a chunk of data
+                data = self.ffmpeg_process.stdout.read(4096)  # Adjust chunk size as needed
+                if not data:
+                    print("No more data from FFmpeg.")
+                    break
+
+                # Send audio frames over WebSocket (binary)
+                if self.ws and self.ws.sock and self.ws.sock.connected:
+                    try:
+                        self.ws.send(data, opcode=websocket.ABNF.OPCODE_BINARY)
+                    except Exception as e:
+                        print(f"Error sending data over WebSocket: {e}")
+                        break
+                else:
+                    print("WebSocket is not connected.")
+                    break
+
+        except Exception as e:
+            print(f"Error during audio streaming: {e}")
+        finally:
+            # Do not clear self.running here: stop_recording() returns early when the
+            # flag is already False and would then skip FFmpeg/WebSocket cleanup.
+            self.stop_recording()
+
+    # ---------------------
+    # WebSocket Callbacks
+    # ---------------------
+    def on_message(self, ws, message):
+        """Handle transcriptions (or other messages) from the server."""
+        print("Received from server:", message)
+        try:
+            data = json.loads(message)
+            transcription = data.get("text", "")
+            if transcription:
+                self.gui.display_transcription(transcription)
+        except json.JSONDecodeError:
+            print("Error: Received invalid JSON:", message)
+
+    def on_error(self, ws, error):
+        """Handle any WebSocket errors."""
+        print("WebSocket Error:", error)
+
+    def on_close(self, ws, close_status_code, close_msg):
+        """Called when the WebSocket connection is closed."""
+        print("WebSocket Closed")
+
+
+class SpeechToTextGUI:
+    """
+    The GUI class for user interaction:
+    - Start/Stop buttons
+    - Status updates
+    - Displays transcriptions
+    - Ties everything together with SpeechToTextClient
+    """
+
+    def __init__(self):
+        self.client = SpeechToTextClient(self)
+
+        # Main window setup
+        self.root = Tk()
+        self.root.title("Speech-to-Text Client")
+
+        # Status label
+        self.status_label = Label(self.root, text="Click 'Start Recording' to begin.", anchor="w")
+        self.status_label.pack(fill="x", padx=10, pady=5)
+
+        # Text area for transcriptions
+        self.text_display = Text(self.root, wrap="word", height=20)
+        self.text_display.pack(fill="both", expand=True, padx=10, pady=5)
+
+        # Scrollbar for transcription area
+        scrollbar = Scrollbar(self.text_display)
+        scrollbar.pack(side="right", fill="y")
+        self.text_display.config(yscrollcommand=scrollbar.set)
+        scrollbar.config(command=self.text_display.yview)
+
+        # Start/Stop Buttons
+        start_button = Button(
+            self.root,
+            text="Start Recording",
+            command=self.client.start_recording,
+            bg="green",
+            fg="white"
+        )
+        start_button.pack(side="left", padx=10, pady=10)
+
+        stop_button = Button(
+            self.root,
+            text="Stop Recording",
+            command=self.client.stop_recording,
+            bg="red",
+            fg="white"
+        )
+        stop_button.pack(side="right", padx=10, pady=10)
+
+        # Handle window close event to ensure subprocesses are terminated
+        self.root.protocol("WM_DELETE_WINDOW", self.on_close)
+
+    def update_status(self, message):
+        """Updates the status label."""
+        self.status_label.config(text=message)
+
+    def display_transcription(self, transcription):
+        """Appends transcriptions to the text box and scrolls to the end."""
+        if transcription:
+            self.text_display.insert(END, transcription + "\n")
+            self.text_display.see(END)  # Auto-scroll
+
+    def on_close(self):
+        """Handle the window close event."""
+        self.client.stop_recording()
+        self.root.destroy()
+
+    def run(self):
+        """Start the Tkinter event loop."""
+        self.root.mainloop()
+
+
+if __name__ == "__main__":
+    gui = SpeechToTextGUI()
+    gui.run()