
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Real-Time Audio Transcription with VAD and Volume Meter</title>
<style>
/* Page styles */
body {
--indicator-color: black;
background: radial-gradient(black 55%, var(--indicator-color));
min-height: 100vh;
color: white;
margin: 0;
font-family: Arial, sans-serif;
}
h1 {
text-align: center;
margin-top: 20px;
}
#controls {
text-align: center;
margin: 20px;
}
#toggle_vad_button {
padding: 10px 20px;
font-size: 16px;
border: none;
border-radius: 5px;
background-color: #28a745; /* Green */
color: white;
cursor: pointer;
transition: background-color 0.3s ease;
}
#toggle_vad_button.recording {
background-color: #dc3545; /* Red */
}
#indicator {
text-align: center;
margin: 10px;
font-size: 18px;
}
#playlist {
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: rgba(255, 255, 255, 0.1);
border-radius: 8px;
height: 400px;
overflow-y: scroll;
list-style: none;
padding-left: 0;
}
#playlist li {
margin-bottom: 10px;
opacity: 0;
animation: fadeIn 1s forwards;
}
#playlist li.newItem {
border-left: 4px solid #28a745;
padding-left: 10px;
}
.transcription {
color: white;
font-size: 16px;
}
.notice {
color: #dc3545; /* Red */
font-style: italic;
}
@keyframes fadeIn {
to {
opacity: 1;
}
}
</style>
</head>
<body>
<h1>Real-Time Audio Transcription with VAD and Volume Meter</h1>
<div id="controls">
<button id="toggle_vad_button" onclick="window.toggleVAD()" disabled>START VAD</button>
</div>
<div id="indicator">VAD is <span style="color: red">LOADING</span></div>
<ol id="playlist" reversed></ol>
<!-- Include ONNX Runtime Web -->
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js"></script>
<!-- Include VAD-Web -->
<script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js"></script>
<script type="module">
import { interpolateInferno } from "https://cdn.skypack.dev/d3-scale-chromatic@3";
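// Overall flow (as implemented below): the microphone stream is shared by the VAD and the
// optional volume meter; while speech is detected, audio frames are buffered and, roughly
// once per second, encoded to WAV and POSTed to BACKEND_UPLOAD_URL; transcription JSON is
// expected to arrive over the WebSocket at WS_ENDPOINT and is rendered word-by-word with
// probability-based coloring; when recording stops, the accumulated transcription is sent
// to BACKEND_LLM_URL and the LLM response is displayed.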
// Elements
const recordBtn = document.getElementById('toggle_vad_button');
const transcriptionsDiv = document.getElementById('playlist');
const indicator = document.getElementById('indicator');
const volumeBar = document.getElementById('volume_bar'); // Optional volume meter bar; null unless a #volume_bar element is added to the markup
// State Variables
let isRecording = false;
let vadInstance = null;
let audioContext = null;
let analyser = null;
let microphoneStream = null;
let dataArray = null;
let animationId = null;
let isSpeaking = false;
let audioBuffer = [];
let sendAudioInterval = null;
const SEND_INTERVAL_MS = 1000; // 1 second
let ws = null;
let shouldReconnect = true; // Whether onclose should attempt to reconnect
const reconnectInterval = 3000; // Delay (ms) before attempting to reconnect the WebSocket
let incomingBuffer = '';
let fullTranscription = ''; // To accumulate full transcription
// Configuration
const WS_ENDPOINT = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko"; // Ensure this is correct
const BACKEND_UPLOAD_URL = "http://localhost:3000/upload-audio"; // Replace with your backend URL
const BACKEND_LLM_URL = "http://localhost:3000/process-transcription"; // Replace with your backend URL
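// NOTE: The two BACKEND_* endpoints are assumed to be provided by a separate server that is
// not part of this file. A minimal Express sketch of the assumed contract (framework, paths,
// and response shapes are illustrative, not a reference implementation):
//
//   app.post('/upload-audio', express.raw({ type: 'application/octet-stream', limit: '25mb' }), (req, res) => {
//     // req.body: Buffer containing the WAV data sent by sendAudioToServer()
//     res.sendStatus(200);
//   });
//   app.post('/process-transcription', express.json(), (req, res) => {
//     // req.body.transcription: accumulated text; respond with { llmResponse: <LLM API response> }
//     res.json({ llmResponse: { choices: [{ message: { content: "..." } }] } });
//   });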
// Utility Functions
/**
* Logs transcription text with colored words based on probability.
* @param {Array} words - Array of word objects with 'word' and 'probability'.
*/
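// Example word object in the assumed server payload: { word: "hello", probability: 0.92 }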
function logTranscription(words) {
const transcriptionLine = document.createElement('div');
transcriptionLine.classList.add('transcription');
words.forEach(wordObj => {
const span = document.createElement('span');
span.textContent = wordObj.word + ' '; // Add space after each word
// Calculate hue: 0 (red) to 240 (blue)
const hue = wordObj.probability * 240;
span.style.color = `hsl(${hue}, 100%, 50%)`;
transcriptionLine.appendChild(span);
fullTranscription += wordObj.word + ' ';
});
transcriptionsDiv.prepend(transcriptionLine);
transcriptionsDiv.scrollTop = 0; // New lines are prepended, so keep the view at the top
}
/**
* Logs notice messages (e.g., connection status, errors).
* @param {string} text - The notice text to display.
*/
function logNotice(text) {
const p = document.createElement('p');
p.classList.add('notice');
p.textContent = text;
transcriptionsDiv.prepend(p);
transcriptionsDiv.scrollTop = 0; // New lines are prepended, so keep the view at the top
}
/**
* Converts Float32 audio data to Int16 PCM format.
* @param {Float32Array} buffer - The audio buffer in Float32 format.
* @returns {Int16Array} - The audio buffer in Int16 format.
*/
function convertFloat32ToInt16(buffer) {
let l = buffer.length;
const buf = new Int16Array(l);
while (l--) {
// Clamp to [-1, 1] before scaling to avoid integer overflow on loud samples
buf[l] = Math.max(-1, Math.min(1, buffer[l])) * 0x7FFF;
}
return buf;
}
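// Note: this helper is currently unused (audio is sent as WAV via vad.utils.encodeWAV in
// sendAudio()); it is kept for backends that expect raw 16-bit PCM instead.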
/**
* Extracts JSON objects from a concatenated string.
* @param {string} buffer - The concatenated JSON string.
* @returns {Array} - An array of parsed JSON objects.
*/
function extractJSONObjects(buffer) {
const objects = [];
let braceStack = 0;
let inString = false;
let escape = false;
let lastSplit = 0;
for (let i = 0; i < buffer.length; i++) {
const char = buffer[i];
if (char === '"' && !escape) {
inString = !inString;
}
if (!inString) {
if (char === '{') {
braceStack++;
} else if (char === '}') {
braceStack--;
if (braceStack === 0) {
const jsonString = buffer.slice(lastSplit, i + 1);
try {
const jsonObj = JSON.parse(jsonString);
objects.push(jsonObj);
} catch (e) {
console.error('Failed to parse JSON:', e);
}
lastSplit = i + 1;
}
}
}
// Handle escape characters
if (char === '\\' && !escape) {
escape = true;
} else {
escape = false;
}
}
// Return any remaining buffer that wasn't parsed
incomingBuffer = buffer.slice(lastSplit);
return objects;
}
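// Example: extractJSONObjects('{"a":1}{"b":2}{"c"') returns [{ a: 1 }, { b: 2 }] and leaves
// the incomplete trailing object ('{"c"') in incomingBuffer until more data arrives.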
// WebSocket Handlers
/**
* Sets up the WebSocket connection and defines event handlers.
*/
function setupWebSocket() {
ws = new WebSocket(WS_ENDPOINT);
ws.binaryType = 'arraybuffer';
ws.onopen = () => {
console.log('WebSocket connection opened.');
logNotice("WebSocket connection established.");
};
ws.onmessage = (event) => {
let messageData = '';
if (typeof event.data === 'string') {
messageData = event.data;
} else if (event.data instanceof ArrayBuffer) {
const decoder = new TextDecoder('utf-8');
messageData = decoder.decode(event.data);
} else {
console.warn('Unsupported message format:', event.data);
return;
}
// Append incoming data to buffer
incomingBuffer += messageData;
// Extract JSON objects
const jsonObjects = extractJSONObjects(incomingBuffer);
// Process each JSON object
jsonObjects.forEach(obj => {
if (obj.task === "transcribe" && Array.isArray(obj.words)) {
logTranscription(obj.words);
}
});
};
ws.onclose = (event) => {
console.log('WebSocket connection closed:', event);
logNotice("WebSocket connection closed.");
ws = null;
if (isRecording && shouldReconnect) {
logNotice("Attempting to reconnect...");
setTimeout(() => {
setupWebSocket();
}, reconnectInterval);
} else if (isRecording) {
logNotice("Transcription session ended.");
stopRecording(true); // true indicates server-initiated stop
}
};
ws.onerror = (error) => {
console.error('WebSocket error:', error);
logNotice("WebSocket encountered an error.");
};
}
// Voice Activity Detection Setup
/**
* Initializes the Voice Activity Detector (VAD) using MicVAD.
*/
async function initializeVAD(stream) {
try {
vadInstance = await vad.MicVAD.new({
stream: stream, // Pass the existing MediaStream to avoid multiple microphone accesses
onSpeechStart: () => {
console.log("Speech start detected");
logNotice("Speech detected...");
isSpeaking = true;
audioBuffer = []; // Reset buffer
// Start timer to send audio every second
sendAudioInterval = setInterval(sendAudio, SEND_INTERVAL_MS);
},
onSpeechEnd: (audio) => {
console.log("Speech end detected");
logNotice("Sending final speech segment to server...");
isSpeaking = false;
// Send any remaining audio
sendAudio();
// Stop the timer
if (sendAudioInterval) {
clearInterval(sendAudioInterval);
sendAudioInterval = null;
}
// Optionally, send the final `audio` provided by the callback
// depending on your application's needs
// Example:
// sendFinalAudio(audio);
},
onFrameProcessed: (probabilities, frame) => {
const indicatorColor = interpolateInferno(probabilities.isSpeech / 2);
document.body.style.setProperty("--indicator-color", indicatorColor);
if (isSpeaking) {
audioBuffer.push(frame);
}
},
});
window.vadInstance = vadInstance;
// Start VAD listening
vadInstance.start();
isRecording = true;
recordBtn.textContent = 'STOP VAD';
recordBtn.classList.add('recording');
indicator.innerHTML = 'VAD is <span style="color: #28a745">RUNNING</span>';
logNotice("Recording started. Speak into your microphone.");
} catch (error) {
console.error('Error initializing VAD:', error);
logNotice("Error initializing Voice Activity Detection.");
}
}
// Volume Meter Setup (Optional, based on your requirements)
/**
* Sets up the volume meter using the Web Audio API.
*/
async function setupVolumeMeter(stream) {
try {
// Initialize AudioContext
audioContext = new (window.AudioContext || window.webkitAudioContext)();
// Create MediaStreamSource from the existing stream
microphoneStream = audioContext.createMediaStreamSource(stream);
// Create AnalyserNode
analyser = audioContext.createAnalyser();
analyser.fftSize = 512;
const bufferLength = analyser.frequencyBinCount;
dataArray = new Uint8Array(bufferLength);
// Connect microphone to analyser
microphoneStream.connect(analyser);
// Start visualizing
visualize();
} catch (error) {
console.error('Error setting up volume meter:', error);
logNotice("Error setting up volume meter.");
}
}
/**
* Visualizes the volume level on the volume meter.
*/
function visualize() {
const updateVolume = () => {
analyser.getByteFrequencyData(dataArray);
let sum = 0;
for (let i = 0; i < dataArray.length; i++) {
sum += dataArray[i];
}
const average = sum / dataArray.length;
const volume = average / 255; // Normalize to [0,1]
// Update the volume bar width and color (green to red), if a #volume_bar element exists
if (volumeBar) {
volumeBar.style.width = `${volume * 100}%`;
const hue = (1 - volume) * 120; // 120 (green) to 0 (red)
volumeBar.style.backgroundColor = `hsl(${hue}, 100%, 50%)`;
}
animationId = requestAnimationFrame(updateVolume);
};
updateVolume();
}
/**
* Stops the volume meter visualization.
*/
function stopVolumeMeter() {
if (animationId) {
cancelAnimationFrame(animationId);
animationId = null;
}
if (volumeBar) {
volumeBar.style.width = '0%';
volumeBar.style.backgroundColor = '#28a745'; // Reset to green
}
if (analyser) {
analyser.disconnect();
analyser = null;
}
if (microphoneStream) {
microphoneStream.disconnect();
microphoneStream = null;
}
if (audioContext) {
audioContext.close();
audioContext = null;
}
}
// LLM Integration
/**
* Sends the transcription to the backend server for LLM processing.
* @param {string} transcription - The transcribed text.
*/
async function sendTranscriptionToLLM(transcription) {
try {
const response = await fetch(BACKEND_LLM_URL, { // Adjust the URL if your server is hosted elsewhere
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ transcription }),
});
if (!response.ok) {
throw new Error(`Server error: ${response.status}`);
}
const data = await response.json();
if (data.llmResponse) {
displayLLMResponse(data.llmResponse);
}
} catch (error) {
console.error('Error sending transcription to LLM:', error);
logNotice("Error processing transcription with LLM.");
}
}
/**
* Displays the LLM's response in the transcriptions div.
* @param {object} llmResponse - The response from the LLM.
*/
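// Assumed (OpenAI-style) response shape: { choices: [{ message: { content: "..." } }] }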
function displayLLMResponse(llmResponse) {
// Adjust based on your LLM's response structure
const responseText = llmResponse?.choices?.[0]?.message?.content ?? 'No response from LLM.';
const responseLine = document.createElement('div');
responseLine.classList.add('transcription');
const span = document.createElement('span');
span.textContent = `LLM Response: ${responseText}`;
span.style.color = `hsl(200, 100%, 50%)`; // Example color
responseLine.appendChild(span);
transcriptionsDiv.prepend(responseLine);
transcriptionsDiv.scrollTop = 0; // New lines are prepended, so keep the view at the top
}
/**
* Sends the accumulated audio to the server.
*/
async function sendAudioToServer(audioBuffer) {
try {
const response = await fetch(BACKEND_UPLOAD_URL, { // Replace with your backend URL
method: 'POST',
headers: {
'Content-Type': 'application/octet-stream', // Adjust based on server expectations
},
body: audioBuffer,
});
if (!response.ok) {
throw new Error(`Server responded with status ${response.status}`);
}
console.log('Audio sent successfully');
} catch (error) {
console.error('Error sending audio:', error);
logNotice("Error sending audio to server.");
}
}
/**
* Adds an audio element to the playlist.
* @param {string} audioUrl - The data URL of the audio.
* @returns {HTMLElement} - The created list item element.
*/
function addAudio(audioUrl) {
const entry = document.createElement("li");
const audio = document.createElement("audio");
audio.controls = true;
audio.src = audioUrl;
entry.classList.add("newItem");
entry.appendChild(audio);
return entry;
}
// Recording Control Functions
/**
* Starts the Voice Activity Detection, Volume Meter, and WebSocket connection.
*/
async function startRecording() {
try {
// Request microphone access once
const stream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
// Optionally, set up Volume Meter
// await setupVolumeMeter(stream);
// Initialize VAD with the same stream
await initializeVAD(stream);
// Set up WebSocket (allow automatic reconnection for this session)
shouldReconnect = true;
setupWebSocket();
} catch (error) {
console.error('Error starting recording:', error);
logNotice("Error starting recording. Please try again.");
}
}
/**
* Stops the Voice Activity Detection, Volume Meter, and cleans up resources.
* @param {boolean} serverInitiated - Indicates if the stop was triggered by the server.
*/
function stopRecording(serverInitiated = false) {
if (!isRecording) return;
// Stop VAD
if (vadInstance) {
if (typeof vadInstance.pause === 'function') {
vadInstance.pause();
} else {
console.warn('VAD instance does not have a pause method.');
}
vadInstance = null;
}
// Optionally, stop Volume Meter
// stopVolumeMeter();
// Prevent reconnection if stopping manually
if (!serverInitiated) {
shouldReconnect = false;
}
// Close WebSocket if not server-initiated
if (!serverInitiated && ws && ws.readyState === WebSocket.OPEN) {
ws.send(JSON.stringify({ action: "terminate" }));
logNotice("Termination signal sent to server.");
}
// Close WebSocket
if (ws) {
ws.close();
ws = null;
}
// Reset recording state
isRecording = false;
recordBtn.textContent = 'START VAD';
recordBtn.classList.remove('recording');
indicator.innerHTML = 'VAD is <span style="color: red">STOPPED</span>';
logNotice("Recording stopped.");
// Send the full transcription to the LLM
if (fullTranscription.trim().length > 0) {
sendTranscriptionToLLM(fullTranscription.trim());
fullTranscription = ''; // Reset after sending
}
}
/**
* Sends the accumulated audio to the server periodically.
*/
async function sendAudio() {
if (audioBuffer.length === 0) return;
// Concatenate all frames into a single Float32Array
const totalLength = audioBuffer.reduce((sum, frame) => sum + frame.length, 0);
const concatenated = new Float32Array(totalLength);
let offset = 0;
audioBuffer.forEach(frame => {
concatenated.set(frame, offset);
offset += frame.length;
});
// Encode to WAV format
const wavBuffer = vad.utils.encodeWAV(concatenated);
// Send the audio to the server
await sendAudioToServer(wavBuffer);
// Optionally, add the audio to the UI
const base64 = vad.utils.arrayBufferToBase64(wavBuffer);
const audioUrl = `data:audio/wav;base64,${base64}`;
const audioElement = addAudio(audioUrl);
transcriptionsDiv.prepend(audioElement);
// Reset the buffer
audioBuffer = [];
}
// Button Event Listener
/**
* Toggles recording state when the record button is clicked.
*/
window.toggleVAD = () => {
console.log("ran toggle vad");
if (!isRecording) {
startRecording().catch(error => {
console.error('Error starting recording:', error);
logNotice("Error starting recording. Please try again.");
});
} else {
stopRecording();
}
};
// The button is disabled in the markup until the scripts have loaded; enable it now.
recordBtn.disabled = false;
indicator.innerHTML = 'VAD is <span style="color: red">STOPPED</span>';
</script>
</body>
</html>