<!-- Speech to text -->

<!-- Rate this post -->
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>High-Accuracy Transcription (Whisper Base)</title>
    <style>
        /* ---- Design tokens ---- */
        :root {
            --primary: #2563eb;
            --bg: #f8fafc;
            --text: #1e293b;
        }

        /* ---- Page shell: the card is centred horizontally ---- */
        body {
            font-family: system-ui, -apple-system, sans-serif;
            background: var(--bg);
            color: var(--text);
            display: flex;
            justify-content: center;
            padding: 40px 20px;
        }

        .container {
            background: white;
            width: 100%;
            max-width: 850px;
            padding: 30px;
            border-radius: 16px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.08);
        }

        h1 {
            margin: 0 0 10px 0;
            font-size: 24px;
        }

        /* Small green pill shown above the heading. */
        .badge {
            display: inline-block;
            padding: 4px 12px;
            background: #dcfce7;
            color: #166534;
            border-radius: 20px;
            font-size: 12px;
            font-weight: bold;
            margin-bottom: 20px;
        }

        /* ---- File picker card ---- */
        .upload-card {
            border: 2px dashed #cbd5e1;
            padding: 40px;
            border-radius: 12px;
            text-align: center;
            margin-bottom: 20px;
            cursor: pointer;
            transition: 0.2s;
        }

        .upload-card:hover {
            border-color: var(--primary);
            background: #eff6ff;
        }

        /* ---- Transcript text area ---- */
        #output {
            width: 100%;
            height: 350px;
            margin-top: 20px;
            padding: 15px;
            border: 1px solid #e2e8f0;
            border-radius: 8px;
            font-size: 16px;
            line-height: 1.6;
            resize: vertical;
            box-sizing: border-box;
        }

        /* ---- Action button ---- */
        button {
            width: 100%;
            padding: 16px;
            background: var(--primary);
            color: white;
            border: none;
            border-radius: 8px;
            font-weight: 600;
            cursor: pointer;
            font-size: 16px;
        }

        button:disabled {
            background: #94a3b8;
            cursor: not-allowed;
        }

        /* ---- Status line and progress bar (bar is hidden until the script shows it) ---- */
        .status-area {
            margin-top: 15px;
        }

        .status-text {
            font-size: 14px;
            color: #64748b;
            margin-bottom: 8px;
            display: block;
        }

        .progress-bar {
            width: 100%;
            height: 6px;
            background: #e2e8f0;
            border-radius: 10px;
            overflow: hidden;
            display: none;
        }

        #progressFill {
            width: 0%;
            height: 100%;
            background: var(--primary);
            transition: width 0.2s;
        }
    </style>
</head>
<body>

<!-- Single-page UI. The element ids below are looked up by the module script; keep them stable. -->
<main class="container">
    <div class="badge">WHISPER BASE - HIGH ACCURACY</div>
    <h1>Speech to Text</h1>
    <p style="color: #64748b; margin-top: -10px;">English Only (Local Model)</p>

    <!-- Clicking anywhere on the card forwards the click to the hidden file input. -->
    <div class="upload-card" onclick="document.getElementById('audioFile').click()">
        <input type="file" id="audioFile" accept="audio/*" style="display:none">
        <span id="fileName">Click to upload or drag audio file here</span>
    </div>

    <!-- role="status" turns the script-driven status text into a polite live region,
         so assistive technology announces loading / transcription progress. -->
    <div class="status-area">
        <span id="statusLabel" class="status-text" role="status">Initializing AI...</span>
        <div class="progress-bar" id="progressContainer">
            <div id="progressFill"></div>
        </div>
    </div>

    <!-- Starts disabled; the script enables it once the model has loaded. -->
    <button id="btnTranscribe" type="button" disabled>Loading Model...</button>

    <!-- The layout has no visible label for the transcript, so aria-label supplies the
         accessible name (a placeholder alone is not a label). -->
    <textarea id="output" placeholder="High-accuracy text will appear here..." aria-label="Transcription output"></textarea>
</main>

<script type="module">
    // Transformers.js v3 — required for the `device` / `dtype` pipeline options used below.
    // NOTE(review): the package segment of this URL had been mangled by e-mail obfuscation
    // ("transformers@<version>" replaced with an "email protected" placeholder), so the
    // module could not be resolved. The 3.0.0 pin is a reconstruction — confirm the
    // version this page was originally written against.
    import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0/dist/transformers.min.js';

    // --- LOCAL MODEL SETUP ---
    // Remote model loading is switched off; model ids are looked up under ./models/
    // (e.g. ./models/whisper-base.en/).
    env.allowRemoteModels = false;
    env.allowLocalModels = true;
    env.localModelPath = './models/';
    // -------------------------

    // ASR pipeline instance; stays undefined until loadModel() has finished.
    let transcriber;

    // Cached references to the elements this script reads or updates.
    const btn = document.getElementById('btnTranscribe');
    const statusLabel = document.getElementById('statusLabel');
    const progressFill = document.getElementById('progressFill');
    const progressContainer = document.getElementById('progressContainer');
    const output = document.getElementById('output');
    const fileInput = document.getElementById('audioFile');
    const fileNameLabel = document.getElementById('fileName');
    const uploadCard = fileInput.closest('.upload-card');

    // Prompt text taken from the markup, restored whenever the selection is cleared.
    const defaultFilePrompt = fileNameLabel.textContent;

    /** Mirrors the current file selection (or the default prompt) in the upload card. */
    function showSelectedFileName() {
        const [file] = fileInput.files;
        fileNameLabel.textContent = file ? file.name : defaultFilePrompt;
    }

    // Update file name on selection (also fires when the selection is cleared).
    fileInput.onchange = showSelectedFileName;

    if (uploadCard) {
        // The card is a click-only <div> in the markup; expose it to keyboard users too.
        uploadCard.setAttribute('role', 'button');
        uploadCard.tabIndex = 0;
        uploadCard.addEventListener('keydown', (event) => {
            if (event.target !== uploadCard) return;
            if (event.key === 'Enter' || event.key === ' ') {
                event.preventDefault(); // stop Space from scrolling the page
                fileInput.click();
            }
        });

        // The prompt advertises drag & drop, so accept dropped files. Without
        // preventDefault() on dragover/drop the browser would navigate to the file.
        for (const type of ['dragenter', 'dragover']) {
            uploadCard.addEventListener(type, (event) => event.preventDefault());
        }
        uploadCard.addEventListener('drop', (event) => {
            event.preventDefault();
            const [file] = event.dataTransfer.files;
            if (!file) return;

            // The input is single-file, so hand it only the first dropped file.
            const transfer = new DataTransfer();
            transfer.items.add(file);
            fileInput.files = transfer.files;

            // Assigning .files does not dispatch "change"; refresh the label manually.
            showSelectedFileName();
        });
    }

    // 1. Initialize Whisper Base
    /**
     * Loads the 'whisper-base.en' speech-recognition pipeline into `transcriber`
     * and enables the transcribe button on success.
     *
     * Shows the progress bar while loading and always hides it afterwards.
     * Load failures are logged and reported in the status label; nothing is
     * rethrown, and the button stays disabled.
     *
     * @returns {Promise<void>}
     */
    async function loadModel() {
        progressFill.style.width = '0%';
        progressContainer.style.display = 'block';

        // Prefer WebGPU, but only when the browser exposes it AND an adapter is
        // actually available; otherwise use the WASM backend instead of failing.
        // NOTE(review): assumes the local q4 weights also load on the WASM backend — confirm.
        let device = 'wasm';
        try {
            if (navigator.gpu && await navigator.gpu.requestAdapter()) device = 'webgpu';
        } catch (probeErr) {
            console.warn('WebGPU probe failed; using the WASM backend.', probeErr);
        }

        try {
            // Loading 'whisper-base.en' from local /models/ folder
            transcriber = await pipeline('automatic-speech-recognition', 'whisper-base.en', {
                device,
                dtype: 'q4', // 4-bit quantized weights
                // Transformers.js reports per-file loading progress; `progress` is
                // expected to be a percentage (0-100) on "progress" events.
                progress_callback: (info) => {
                    if (info && info.status === 'progress' && Number.isFinite(info.progress)) {
                        const pct = Math.min(100, Math.max(0, info.progress));
                        progressFill.style.width = `${pct}%`;
                    }
                },
            });

            statusLabel.innerText = "Model Loaded Successfully.";
            btn.innerText = "Start High-Accuracy Transcription";
            btn.disabled = false;
        } catch (err) {
            console.error(err);
            statusLabel.innerText = "Error: Check if files are in /models/whisper-base.en/";
            // Do not leave the disabled button claiming that loading is still in progress.
            btn.innerText = "Model Unavailable";
        } finally {
            progressContainer.style.display = 'none';
        }
    }

    // 2. Transcription Logic

    /**
     * Averages every channel of a decoded AudioBuffer into one mono Float32Array,
     * so audio that exists only in a non-first channel is not silently dropped.
     *
     * @param {AudioBuffer} buffer Decoded audio.
     * @returns {Float32Array} Mono samples with the same length as `buffer`.
     */
    function mixToMono(buffer) {
        const channelCount = buffer.numberOfChannels;
        if (channelCount === 1) return buffer.getChannelData(0);

        const mono = new Float32Array(buffer.length);
        for (let ch = 0; ch < channelCount; ch++) {
            const samples = buffer.getChannelData(ch);
            for (let i = 0; i < samples.length; i++) mono[i] += samples[i];
        }
        for (let i = 0; i < mono.length; i++) mono[i] /= channelCount;
        return mono;
    }

    /**
     * Click handler: decodes the selected file to 16 kHz mono samples, runs the
     * pipeline over it, and writes the text into the output area. Errors are
     * logged and shown in the status label; the button is always re-enabled.
     */
    btn.onclick = async () => {
        const [file] = fileInput.files;
        if (!file) return alert("Select a file first.");

        btn.disabled = true;
        btn.innerText = "Processing...";
        statusLabel.innerText = "Decoding audio data...";
        output.value = "";

        try {
            // Creating the context at 16 kHz makes decodeAudioData resample to that rate.
            const audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });
            let audioData;
            try {
                const arrayBuffer = await file.arrayBuffer();
                const decoded = await audioCtx.decodeAudioData(arrayBuffer);
                audioData = mixToMono(decoded);
            } finally {
                // The context is only needed for decoding. Browsers cap the number of
                // live AudioContexts, so release it instead of leaking one per click.
                await audioCtx.close();
            }

            statusLabel.innerText = "AI is transcribing... (Processing in 30s chunks)";

            const start = performance.now();

            // Run AI (No 'language' or 'task' used because it's an .en model)
            const result = await transcriber(audioData, {
                chunk_length_s: 30,
                stride_length_s: 5,
                return_timestamps: false,
                // Beam width 1 = plain greedy decoding (this does not add accuracy).
                num_beams: 1,
            });

            const end = performance.now();
            output.value = result.text.trim();
            statusLabel.innerText = `Transcription finished in ${((end - start) / 1000).toFixed(1)}s`;

        } catch (err) {
            console.error(err);
            statusLabel.innerText = "Transcription Error: " + err.message;
        } finally {
            btn.disabled = false;
            btn.innerText = "Start High-Accuracy Transcription";
        }
    };

    // Kick off model loading as soon as the module runs.
    loadModel();
</script>

</body>
</html>