
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Real-Time Audio Transcription with VAD and Volume Meter</title>
<style>
/* Page styles */
body {
--indicator-color: black;
background: radial-gradient(black 55%, var(--indicator-color));
min-height: 100vh;
color: white;
margin: 0;
font-family: Arial, sans-serif;
}
h1 {
text-align: center;
margin-top: 20px;
}
#controls {
text-align: center;
margin: 20px;
}
#toggle_vad_button {
padding: 10px 20px;
font-size: 16px;
border: none;
border-radius: 5px;
background-color: #28a745; /* Green */
color: white;
cursor: pointer;
transition: background-color 0.3s ease;
}
#toggle_vad_button.recording {
background-color: #dc3545; /* Red */
}
#indicator {
text-align: center;
margin: 10px;
font-size: 18px;
}
#playlist {
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: rgba(255, 255, 255, 0.1);
border-radius: 8px;
height: 400px;
overflow-y: scroll;
list-style: none;
padding-left: 0;
}
#playlist li {
margin-bottom: 10px;
opacity: 0;
animation: fadeIn 1s forwards;
}
#playlist li.newItem {
border-left: 4px solid #28a745;
padding-left: 10px;
}
.transcription {
color: white;
font-size: 16px;
}
.notice {
color: #dc3545; /* Red */
font-style: italic;
}
@keyframes fadeIn {
to {
opacity: 1;
}
}
</style>
</head>
<body>
<h1>Real-Time Audio Transcription with VAD and Volume Meter</h1>
<div id="controls">
<button id="toggle_vad_button" onclick="window.toggleVAD()" disabled>START VAD</button>
</div>
<div id="indicator">VAD is <span style="color: red">LOADING</span></div>
<ol id="playlist" reversed></ol>
<!-- Include ONNX Runtime Web -->
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js"></script>
<!-- Include VAD-Web -->
<script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js"></script>
<script type="module">
import { interpolateInferno } from "https://cdn.skypack.dev/d3-scale-chromatic@3";
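// Overall flow (as implemented below): the microphone stream is shared by the VAD and the
// optional volume meter; while speech is detected, audio frames are buffered and, roughly
// once per second, encoded to WAV and POSTed to BACKEND_UPLOAD_URL; transcription JSON is
// expected to arrive over the WebSocket at WS_ENDPOINT and is rendered word-by-word with
// probability-based coloring; when recording stops, the accumulated transcription is sent
// to BACKEND_LLM_URL and the LLM response is displayed.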
// Elements
const recordBtn = document.getElementById('toggle_vad_button');
const transcriptionsDiv = document.getElementById('playlist');
const indicator = document.getElementById('indicator');
const volumeBar = document.getElementById('volume_bar'); // Optional volume meter bar; null unless a #volume_bar element is added to the markup
// State Variables
let isRecording = false;
let vadInstance = null;
let audioContext = null;
let analyser = null;
let microphoneStream = null;
let dataArray = null;
let animationId = null;
let isSpeaking = false;
let audioBuffer = [];
let sendAudioInterval = null;
const SEND_INTERVAL_MS = 1000; // 1 second
let ws = null;
let shouldReconnect = true; // Whether onclose should attempt to reconnect
const reconnectInterval = 3000; // Delay (ms) before attempting to reconnect the WebSocket
let incomingBuffer = '';
let fullTranscription = ''; // To accumulate full transcription
// Configuration
const WS_ENDPOINT = "ws://takensofttesting.iptime.org:54127/v1/audio/transcriptions?language=ko"; // Ensure this is correct
const BACKEND_UPLOAD_URL = "http://localhost:3000/upload-audio"; // Replace with your backend URL
const BACKEND_LLM_URL = "http://localhost:3000/process-transcription"; // Replace with your backend URL
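// NOTE: The two BACKEND_* endpoints are assumed to be provided by a separate server that is
// not part of this file. A minimal Express sketch of the assumed contract (framework, paths,
// and response shapes are illustrative, not a reference implementation):
//
//   app.post('/upload-audio', express.raw({ type: 'application/octet-stream', limit: '25mb' }), (req, res) => {
//     // req.body: Buffer containing the WAV data sent by sendAudioToServer()
//     res.sendStatus(200);
//   });
//   app.post('/process-transcription', express.json(), (req, res) => {
//     // req.body.transcription: accumulated text; respond with { llmResponse: <LLM API response> }
//     res.json({ llmResponse: { choices: [{ message: { content: "..." } }] } });
//   });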
// Utility Functions
/**
* Logs transcription text with colored words based on probability.
* @param {Array} words - Array of word objects with 'word' and 'probability'.
*/
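// Example word object in the assumed server payload: { word: "hello", probability: 0.92 }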
function logTranscription(words) {
const transcriptionLine = document.createElement('div');
transcriptionLine.classList.add('transcription');
words.forEach(wordObj => {
const span = document.createElement('span');
span.textContent = wordObj.word + ' '; // Add space after each word
// Calculate hue: 0 (red) to 240 (blue)
const hue = wordObj.probability * 240;
span.style.color = `hsl(${hue}, 100%, 50%)`;
transcriptionLine.appendChild(span);
fullTranscription += wordObj.word + ' ';
});
transcriptionsDiv.prepend(transcriptionLine);
transcriptionsDiv.scrollTop = 0; // New lines are prepended, so keep the view at the top
}
/**
* Logs notice messages (e.g., connection status, errors).
* @param {string} text - The notice text to display.
*/
function logNotice(text) {
const p = document.createElement('p');
p.classList.add('notice');
p.textContent = text;
transcriptionsDiv.prepend(p);
transcriptionsDiv.scrollTop = 0; // New lines are prepended, so keep the view at the top
}
/**
* Converts Float32 audio data to Int16 PCM format.
* @param {Float32Array} buffer - The audio buffer in Float32 format.
* @returns {Int16Array} - The audio buffer in Int16 format.
*/
function convertFloat32ToInt16(buffer) {
let l = buffer.length;
const buf = new Int16Array(l);
while (l--) {
// Clamp to [-1, 1] before scaling to avoid integer overflow on loud samples
buf[l] = Math.max(-1, Math.min(1, buffer[l])) * 0x7FFF;
}
return buf;
}
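// Note: this helper is currently unused (audio is sent as WAV via vad.utils.encodeWAV in
// sendAudio()); it is kept for backends that expect raw 16-bit PCM instead.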
/**
* Extracts JSON objects from a concatenated string.
* @param {string} buffer - The concatenated JSON string.
* @returns {Array} - An array of parsed JSON objects.
*/
function extractJSONObjects(buffer) {
const objects = [];
let braceStack = 0;
let inString = false;
let escape = false;
let lastSplit = 0;
for (let i = 0; i < buffer.length; i++) {
const char = buffer[i];
if (char === '"' && !escape) {
inString = !inString;
}
if (!inString) {
if (char === '{') {
braceStack++;
} else if (char === '}') {
braceStack--;
if (braceStack === 0) {
const jsonString = buffer.slice(lastSplit, i + 1);
try {
const jsonObj = JSON.parse(jsonString);
objects.push(jsonObj);
} catch (e) {
console.error('Failed to parse JSON:', e);
}
lastSplit = i + 1;
}
}
}
// Handle escape characters
if (char === '\\' && !escape) {
escape = true;
} else {
escape = false;
}
}
// Return any remaining buffer that wasn't parsed
incomingBuffer = buffer.slice(lastSplit);
return objects;
}
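// Example: extractJSONObjects('{"a":1}{"b":2}{"c"') returns [{ a: 1 }, { b: 2 }] and leaves
// the incomplete trailing object ('{"c"') in incomingBuffer until more data arrives.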
// WebSocket Handlers
/**
* Sets up the WebSocket connection and defines event handlers.
*/
function setupWebSocket() {
ws = new WebSocket(WS_ENDPOINT);
ws.binaryType = 'arraybuffer';
ws.onopen = () => {
console.log('WebSocket connection opened.');
logNotice("WebSocket connection established.");
};
ws.onmessage = (event) => {
let messageData = '';
if (typeof event.data === 'string') {
messageData = event.data;
} else if (event.data instanceof ArrayBuffer) {
const decoder = new TextDecoder('utf-8');
messageData = decoder.decode(event.data);
} else {
console.warn('Unsupported message format:', event.data);
return;
}
// Append incoming data to buffer
incomingBuffer += messageData;
// Extract JSON objects
const jsonObjects = extractJSONObjects(incomingBuffer);
// Process each JSON object
jsonObjects.forEach(obj => {
if (obj.task === "transcribe" && Array.isArray(obj.words)) {
logTranscription(obj.words);
}
});
};
ws.onclose = (event) => {
console.log('WebSocket connection closed:', event);
logNotice("WebSocket connection closed.");
ws = null;
if (isRecording && shouldReconnect) {
logNotice("Attempting to reconnect...");
setTimeout(() => {
setupWebSocket();
}, reconnectInterval);
} else if (isRecording) {
logNotice("Transcription session ended.");
stopRecording(true); // true indicates server-initiated stop
}
};
ws.onerror = (error) => {
console.error('WebSocket error:', error);
logNotice("WebSocket encountered an error.");
};
}
// Voice Activity Detection Setup
/**
* Initializes the Voice Activity Detector (VAD) using MicVAD.
*/
async function initializeVAD(stream) {
try {
vadInstance = await vad.MicVAD.new({
stream: stream, // Pass the existing MediaStream to avoid multiple microphone accesses
onSpeechStart: () => {
console.log("Speech start detected");
logNotice("Speech detected...");
isSpeaking = true;
audioBuffer = []; // Reset buffer
// Start timer to send audio every second
sendAudioInterval = setInterval(sendAudio, SEND_INTERVAL_MS);
},
onSpeechEnd: (audio) => {
console.log("Speech end detected");
logNotice("Sending final speech segment to server...");
isSpeaking = false;
// Send any remaining audio
sendAudio();
// Stop the timer
if (sendAudioInterval) {
clearInterval(sendAudioInterval);
sendAudioInterval = null;
}
// Optionally, send the final `audio` provided by the callback
// depending on your application's needs
// Example:
// sendFinalAudio(audio);
},
onFrameProcessed: (probabilities, frame) => {
const indicatorColor = interpolateInferno(probabilities.isSpeech / 2);
document.body.style.setProperty("--indicator-color", indicatorColor);
if (isSpeaking) {
audioBuffer.push(frame);
}
},
});
window.vadInstance = vadInstance;
// Start VAD listening
vadInstance.start();
isRecording = true;
recordBtn.textContent = 'STOP VAD';
recordBtn.classList.add('recording');
indicator.innerHTML = 'VAD is <span style="color: #28a745">RUNNING</span>';
logNotice("Recording started. Speak into your microphone.");
} catch (error) {
console.error('Error initializing VAD:', error);
logNotice("Error initializing Voice Activity Detection.");
}
}
// Volume Meter Setup (Optional, based on your requirements)
/**
* Sets up the volume meter using the Web Audio API.
*/
async function setupVolumeMeter(stream) {
try {
// Initialize AudioContext
audioContext = new (window.AudioContext || window.webkitAudioContext)();
// Create MediaStreamSource from the existing stream
microphoneStream = audioContext.createMediaStreamSource(stream);
// Create AnalyserNode
analyser = audioContext.createAnalyser();
analyser.fftSize = 512;
const bufferLength = analyser.frequencyBinCount;
dataArray = new Uint8Array(bufferLength);
// Connect microphone to analyser
microphoneStream.connect(analyser);
// Start visualizing
visualize();
} catch (error) {
console.error('Error setting up volume meter:', error);
logNotice("Error setting up volume meter.");
}
}
/**
* Visualizes the volume level on the volume meter.
*/
function visualize() {
const updateVolume = () => {
analyser.getByteFrequencyData(dataArray);
let sum = 0;
for (let i = 0; i < dataArray.length; i++) {
sum += dataArray[i];
}
const average = sum / dataArray.length;
const volume = average / 255; // Normalize to [0,1]
// Update the volume bar width and color (green to red), if a #volume_bar element exists
if (volumeBar) {
volumeBar.style.width = `${volume * 100}%`;
const hue = (1 - volume) * 120; // 120 (green) to 0 (red)
volumeBar.style.backgroundColor = `hsl(${hue}, 100%, 50%)`;
}
animationId = requestAnimationFrame(updateVolume);
};
updateVolume();
}
/**
* Stops the volume meter visualization.
*/
function stopVolumeMeter() {
if (animationId) {
cancelAnimationFrame(animationId);
animationId = null;
}
if (volumeBar) {
volumeBar.style.width = '0%';
volumeBar.style.backgroundColor = '#28a745'; // Reset to green
}
if (analyser) {
analyser.disconnect();
analyser = null;
}
if (microphoneStream) {
microphoneStream.disconnect();
microphoneStream = null;
}
if (audioContext) {
audioContext.close();
audioContext = null;
}
}
// LLM Integration
/**
* Sends the transcription to the backend server for LLM processing.
* @param {string} transcription - The transcribed text.
*/
async function sendTranscriptionToLLM(transcription) {
try {
const response = await fetch(BACKEND_LLM_URL, { // Adjust the URL if your server is hosted elsewhere
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ transcription }),
});
if (!response.ok) {
throw new Error(`Server error: ${response.status}`);
}
const data = await response.json();
if (data.llmResponse) {
displayLLMResponse(data.llmResponse);
}
} catch (error) {
console.error('Error sending transcription to LLM:', error);
logNotice("Error processing transcription with LLM.");
}
}
/**
* Displays the LLM's response in the transcriptions div.
* @param {object} llmResponse - The response from the LLM.
*/
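// Assumed (OpenAI-style) response shape: { choices: [{ message: { content: "..." } }] }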
function displayLLMResponse(llmResponse) {
// Adjust based on your LLM's response structure
const responseText = llmResponse?.choices?.[0]?.message?.content ?? 'No response from LLM.';
const responseLine = document.createElement('div');
responseLine.classList.add('transcription');
const span = document.createElement('span');
span.textContent = `LLM Response: ${responseText}`;
span.style.color = `hsl(200, 100%, 50%)`; // Example color
responseLine.appendChild(span);
transcriptionsDiv.prepend(responseLine);
transcriptionsDiv.scrollTop = 0; // New lines are prepended, so keep the view at the top
}
/**
* Sends the accumulated audio to the server.
*/
async function sendAudioToServer(audioBuffer) {
try {
const response = await fetch(BACKEND_UPLOAD_URL, { // Replace with your backend URL
method: 'POST',
headers: {
'Content-Type': 'application/octet-stream', // Adjust based on server expectations
},
body: audioBuffer,
});
if (!response.ok) {
throw new Error(`Server responded with status ${response.status}`);
}
console.log('Audio sent successfully');
} catch (error) {
console.error('Error sending audio:', error);
logNotice("Error sending audio to server.");
}
}
/**
* Adds an audio element to the playlist.
* @param {string} audioUrl - The data URL of the audio.
* @returns {HTMLElement} - The created list item element.
*/
function addAudio(audioUrl) {
const entry = document.createElement("li");
const audio = document.createElement("audio");
audio.controls = true;
audio.src = audioUrl;
entry.classList.add("newItem");
entry.appendChild(audio);
return entry;
}
// Recording Control Functions
/**
* Starts the Voice Activity Detection, Volume Meter, and WebSocket connection.
*/
async function startRecording() {
try {
// Request microphone access once
const stream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
// Optionally, set up Volume Meter
// await setupVolumeMeter(stream);
// Initialize VAD with the same stream
await initializeVAD(stream);
// Set up WebSocket (allow automatic reconnection for this session)
shouldReconnect = true;
setupWebSocket();
} catch (error) {
console.error('Error starting recording:', error);
logNotice("Error starting recording. Please try again.");
}
}
/**
* Stops the Voice Activity Detection, Volume Meter, and cleans up resources.
* @param {boolean} serverInitiated - Indicates if the stop was triggered by the server.
*/
function stopRecording(serverInitiated = false) {
if (!isRecording) return;
// Stop VAD
if (vadInstance) {
if (typeof vadInstance.pause === 'function') {
vadInstance.pause();
} else {
console.warn('VAD instance does not have a pause method.');
}
vadInstance = null;
}
// Optionally, stop Volume Meter
// stopVolumeMeter();
// Prevent reconnection if stopping manually
if (!serverInitiated) {
shouldReconnect = false;
}
// Close WebSocket if not server-initiated
if (!serverInitiated && ws && ws.readyState === WebSocket.OPEN) {
ws.send(JSON.stringify({ action: "terminate" }));
logNotice("Termination signal sent to server.");
}
// Close WebSocket
if (ws) {
ws.close();
ws = null;
}
// Reset recording state
isRecording = false;
recordBtn.textContent = 'START VAD';
recordBtn.classList.remove('recording');
indicator.innerHTML = 'VAD is <span style="color: red">STOPPED</span>';
logNotice("Recording stopped.");
// Send the full transcription to the LLM
if (fullTranscription.trim().length > 0) {
sendTranscriptionToLLM(fullTranscription.trim());
fullTranscription = ''; // Reset after sending
}
}
/**
* Sends the accumulated audio to the server periodically.
*/
async function sendAudio() {
if (audioBuffer.length === 0) return;
// Concatenate all frames into a single Float32Array
const totalLength = audioBuffer.reduce((sum, frame) => sum + frame.length, 0);
const concatenated = new Float32Array(totalLength);
let offset = 0;
audioBuffer.forEach(frame => {
concatenated.set(frame, offset);
offset += frame.length;
});
// Encode to WAV format
const wavBuffer = vad.utils.encodeWAV(concatenated);
// Send the audio to the server
await sendAudioToServer(wavBuffer);
// Optionally, add the audio to the UI
const base64 = vad.utils.arrayBufferToBase64(wavBuffer);
const audioUrl = `data:audio/wav;base64,${base64}`;
const audioElement = addAudio(audioUrl);
transcriptionsDiv.prepend(audioElement);
// Reset the buffer
audioBuffer = [];
}
// Button Event Listener
/**
* Toggles recording state when the record button is clicked.
*/
window.toggleVAD = () => {
console.log("ran toggle vad");
if (!isRecording) {
startRecording().catch(error => {
console.error('Error starting recording:', error);
logNotice("Error starting recording. Please try again.");
});
} else {
stopRecording();
}
};
// The button is disabled in the markup until the scripts have loaded; enable it now.
recordBtn.disabled = false;
indicator.innerHTML = 'VAD is <span style="color: red">STOPPED</span>';
</script>
</body>
</html>