当前位置：首页 > news >正文

JavaScript实时语音转录：Web端SenseVoice-Small集成方案

news 2026/3/27 0:05:52

JavaScript实时语音转录：Web端SenseVoice-Small集成方案

1. 引言

想象一下这样的场景：用户在你的网站上通过麦克风说话，文字实时出现在屏幕上，无需任何服务器交互，完全在浏览器中完成。这就是Web端实时语音转录的魅力所在。

SenseVoice-Small作为一个轻量级多语言语音识别模型，结合Web Audio API和现代JavaScript技术，为前端开发者提供了在浏览器中实现高质量语音转录的可能。无论是在线会议转录、语音笔记应用，还是无障碍访问功能，这种技术都能为用户带来前所未有的体验。

本文将带你一步步了解如何在Web前端集成SenseVoice-Small，实现真正意义上的实时语音转录。

2. 技术架构概述

2.1 核心组件

Web端语音转录涉及几个关键技术的协同工作：

- Web Audio API：负责从麦克风捕获音频流并进行初步处理
- Audio Worklet：在后台线程中处理音频数据，避免阻塞主线程
- ONNX Runtime Web：在浏览器中运行预训练的SenseVoice-Small模型
- Streaming处理：实现真正的实时转录，而不是等待整个录音结束

2.2 工作流程

整个系统的工作流程可以概括为：麦克风输入 → 音频预处理 → 特征提取 → 模型推理 → 文本输出。每个环节都需要精细的优化才能达到实时性能。

3. Web Audio API实战

3.1 麦克风访问与配置

首先需要获取用户的麦克风访问权限：

/**
 * Asks the browser for microphone access and resolves with the captured stream.
 *
 * The snippet was previously collapsed onto one line, so its first inline
 * `//` comment swallowed the rest of the function; it is laid out on separate
 * lines here so that it parses.
 *
 * @returns {Promise<MediaStream>} The stream returned by `getUserMedia`.
 * @throws Rethrows whatever `getUserMedia` rejects with, after logging it.
 */
async function setupMicrophone() {
  try {
    const stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1, // mono
        sampleRate: 16000, // 16 kHz sample rate
        echoCancellation: true, // echo cancellation
        noiseSuppression: true, // noise suppression
      },
    });
    return stream;
  } catch (error) {
    console.error('麦克风访问失败:', error);
    throw error;
  }
}

3.2 音频数据处理

使用AudioWorklet在后台处理音频数据：

// audio-processor.js (AudioWorklet processor)

/**
 * AudioWorklet processor that forwards the first channel of its first input
 * to the main thread through the processor's message port.
 *
 * The snippet was previously collapsed onto one line, so the leading `//`
 * comment commented out the whole class; it is laid out on separate lines
 * here so that it parses.
 */
class AudioProcessor extends AudioWorkletProcessor {
  /**
   * @param {Float32Array[][]} inputs - Per-input, per-channel sample blocks.
   * @param {Float32Array[][]} outputs - Unused; this processor writes no output.
   * @param {Object} parameters - Unused.
   * @returns {boolean} Always `true`.
   */
  process(inputs, outputs, parameters) {
    const input = inputs[0];
    if (input && input.length > 0) {
      const audioData = input[0];
      // Hand the samples to the main thread for further processing.
      this.port.postMessage(audioData);
    }
    return true;
  }
}

registerProcessor('audio-processor', AudioProcessor);

4. SenseVoice-Small集成

4.1 模型加载与初始化

使用ONNX Runtime Web加载预训练的SenseVoice-Small模型：

/**
 * Creates an ONNX Runtime Web inference session for the model and runs one
 * warm-up inference on silent audio.
 *
 * Changes from the original snippet: the code is laid out on separate lines
 * (the collapsed one-liner was commented out by its first `//`), `wasm` is
 * listed as a fallback execution provider so the session can still be created
 * when WebGL is unavailable, the redundant `.fill(0)` on a fresh
 * `Float32Array` is gone, and the model path is an optional parameter.
 *
 * NOTE(review): the feed name `audio` and the `[1, 16000]` shape are taken
 * from the original snippet — confirm they match the exported model.
 *
 * @param {string} [modelPath='./models/sensevoice-small.onnx'] - Model URL.
 * @returns {Promise<Object>} The created inference session.
 */
async function loadModel(modelPath = './models/sensevoice-small.onnx') {
  // Create the ONNX Runtime session.
  const session = await ort.InferenceSession.create(modelPath, {
    // Providers are tried in order: WebGL first, WebAssembly as fallback.
    executionProviders: ['webgl', 'wasm'],
    graphOptimizationLevel: 'all', // enable all graph optimizations
  });

  // Warm the model up; a new Float32Array is already zero-filled.
  const warmupInput = new ort.Tensor(
    'float32',
    new Float32Array(16000),
    [1, 16000],
  );
  await session.run({ audio: warmupInput });

  return session;
}

4.2 实时推理优化

为了实现实时性能，需要优化推理过程：

/**
 * Accumulates incoming audio samples and runs the model once a full window
 * (`bufferSize` samples) is available.
 *
 * Fixes over the original snippet:
 * - The window is removed from the buffer BEFORE awaiting inference. The
 *   original trimmed the buffer after the `await`, so overlapping
 *   `processChunk` calls re-ran the same window and then dropped samples.
 * - `decodeOutput` was called but never defined; a default is provided and a
 *   custom decoder can be injected.
 * - The code is laid out on separate lines (the collapsed one-liner was
 *   commented out by its first `//`).
 */
class StreamingRecognizer {
  /**
   * @param {Object} modelSession - Object exposing `run(feeds)`.
   * @param {Object} [options]
   * @param {Function} [options.decodeOutput] - Maps the `run` result to text;
   *   replaces the default `decodeOutput` when supplied.
   */
  constructor(modelSession, options = {}) {
    this.session = modelSession;
    this.buffer = new Float32Array(0);
    this.bufferSize = 16000; // one second of audio at 16 kHz
    if (typeof options.decodeOutput === 'function') {
      this.decodeOutput = options.decodeOutput;
    }
  }

  /**
   * Appends a chunk and, if a full window is buffered, runs inference on it.
   *
   * At most one window is consumed per call. If inference rejects, the
   * consumed window is not put back.
   *
   * @param {Float32Array} audioChunk - New samples.
   * @returns {Promise<*>} Decoded output, or `null` when more audio is needed.
   */
  async processChunk(audioChunk) {
    // Append the new samples to the buffer.
    this.buffer = this.concatArrays(this.buffer, audioChunk);
    if (this.buffer.length < this.bufferSize) {
      return null;
    }

    // Claim the window synchronously so a concurrent call cannot see it again.
    const frame = this.buffer.slice(0, this.bufferSize);
    this.buffer = this.buffer.slice(this.bufferSize);

    const inputTensor = new ort.Tensor('float32', frame, [1, this.bufferSize]);
    const results = await this.session.run({ audio: inputTensor });
    return this.decodeOutput(results);
  }

  /**
   * Default decoder: returns the first output's `data` when it is a string
   * or an array of strings (joined without separator).
   *
   * NOTE(review): the real model's output format is not visible here; supply
   * `options.decodeOutput` if the model emits token ids or logits.
   *
   * @param {Object} results - Value resolved by `session.run`.
   * @returns {string} Decoded text.
   * @throws {Error} When the output is not string data.
   */
  decodeOutput(results) {
    const [firstOutput] = Object.values(results ?? {});
    const data = firstOutput?.data;
    if (typeof data === 'string') {
      return data;
    }
    if (Array.isArray(data) && data.every((item) => typeof item === 'string')) {
      return data.join('');
    }
    throw new Error(
      'StreamingRecognizer: cannot decode model output; pass options.decodeOutput to the constructor',
    );
  }

  /**
   * Returns a new Float32Array holding `a` followed by `b`.
   *
   * @param {Float32Array} a
   * @param {Float32Array} b
   * @returns {Float32Array}
   */
  concatArrays(a, b) {
    const result = new Float32Array(a.length + b.length);
    result.set(a);
    result.set(b, a.length);
    return result;
  }
}

5. 流式处理与性能优化

5.1 双缓冲技术

使用双缓冲避免音频数据丢失：

/**
 * Two-buffer queue: samples that arrive while a batch is being processed go
 * to a back buffer and are processed right after the current batch.
 *
 * Fixes over the original snippet:
 * - `concatArrays` was called but not defined on this class.
 * - `isProcessing` is reset in `finally`; previously one failed batch left
 *   the instance permanently stuck.
 * - Follow-up batches are awaited in a loop instead of a floating recursive
 *   call, so their text is returned rather than discarded.
 * - The recognizer can be injected; when omitted, the global `recognizer`
 *   is used as before.
 * - The code is laid out on separate lines (the collapsed one-liner was
 *   commented out by its first `//`).
 */
class DoubleBuffer {
  /**
   * @param {Object} [recognizerInstance] - Object exposing
   *   `processChunk(samples)`. Falls back to a global named `recognizer`.
   */
  constructor(recognizerInstance) {
    this.frontBuffer = new Float32Array(0);
    this.backBuffer = new Float32Array(0);
    this.isProcessing = false;
    this.recognizer = recognizerInstance;
  }

  /**
   * Queues samples: into the back buffer while a batch is in flight,
   * otherwise into the front buffer.
   *
   * @param {Float32Array} data - New samples.
   */
  addData(data) {
    if (this.isProcessing) {
      this.backBuffer = this.concatArrays(this.backBuffer, data);
    } else {
      this.frontBuffer = this.concatArrays(this.frontBuffer, data);
    }
  }

  /**
   * Processes the front buffer, then keeps swapping in and processing
   * whatever accumulated in the back buffer until both are empty.
   *
   * @returns {Promise<string|null|undefined>} `undefined` when already
   *   processing or nothing is queued; otherwise the truthy results of all
   *   processed batches joined together, or `null` if there were none.
   * @throws Rethrows a `processChunk` failure. Samples queued in the back
   *   buffer are kept for the next call; the failed batch is not re-queued.
   */
  async process() {
    if (this.isProcessing || this.frontBuffer.length === 0) {
      return undefined;
    }

    const target =
      this.recognizer ??
      (typeof recognizer !== 'undefined' ? recognizer : undefined);
    if (!target) {
      throw new Error('DoubleBuffer: no recognizer available');
    }

    this.isProcessing = true;
    const texts = [];
    try {
      while (this.frontBuffer.length > 0) {
        const pending = this.frontBuffer;
        this.frontBuffer = new Float32Array(0);

        const text = await target.processChunk(pending);
        if (text) {
          texts.push(text);
        }

        // Swap buffers: data that arrived meanwhile becomes the next batch.
        this.frontBuffer = this.backBuffer;
        this.backBuffer = new Float32Array(0);
      }
    } finally {
      // After a failure, keep the data that arrived during processing.
      if (this.backBuffer.length > 0) {
        this.frontBuffer = this.concatArrays(this.frontBuffer, this.backBuffer);
        this.backBuffer = new Float32Array(0);
      }
      this.isProcessing = false;
    }

    return texts.length > 0 ? texts.join('') : null;
  }

  /**
   * Returns a new Float32Array holding `a` followed by `b`.
   *
   * @param {Float32Array} a
   * @param {Float32Array} b
   * @returns {Float32Array}
   */
  concatArrays(a, b) {
    const joined = new Float32Array(a.length + b.length);
    joined.set(a);
    joined.set(b, a.length);
    return joined;
  }
}

5.2 内存管理优化

避免频繁的内存分配：

/**
 * Pool of equally sized Float32Array buffers, reused to avoid repeated
 * allocation.
 *
 * Fixes over the original snippet:
 * - `acquire()` no longer reads `this.pool[0].length`, which threw when the
 *   pool was created with `poolSize` 0.
 * - Releasing the same buffer twice no longer puts it in the free list twice
 *   (which would hand one buffer to two owners).
 * - Buffers of a different length are zeroed but not added to the pool.
 * - The code is laid out on separate lines (the collapsed one-liner was
 *   commented out by its first `//`).
 */
class AudioBufferPool {
  /**
   * @param {number} poolSize - Number of buffers allocated up front.
   * @param {number} bufferSize - Length of every buffer, in samples.
   */
  constructor(poolSize, bufferSize) {
    this.bufferSize = bufferSize;
    this.pool = Array.from(
      { length: poolSize },
      () => new Float32Array(bufferSize),
    );
    this.available = [...this.pool];
  }

  /**
   * Takes a buffer from the free list, or allocates a new one when the pool
   * is exhausted.
   *
   * @returns {Float32Array} A buffer of `bufferSize` samples.
   */
  acquire() {
    return this.available.pop() ?? new Float32Array(this.bufferSize);
  }

  /**
   * Zeroes the buffer and returns it to the free list.
   *
   * @param {Float32Array} buffer - Buffer that is no longer in use.
   */
  release(buffer) {
    // Reset the contents.
    buffer.fill(0);
    const isForeignSize = buffer.length !== this.bufferSize;
    if (isForeignSize || this.available.includes(buffer)) {
      return;
    }
    this.available.push(buffer);
  }
}

6. 跨浏览器兼容性处理

6.1 特性检测与降级方案

确保在各种浏览器中都能正常工作：

/**
 * Detects the browser features this pipeline relies on.
 *
 * Microphone access and WebGL remain hard requirements, as in the original.
 * The detected flags — including `audioWorklet` and `wasm`, which were
 * computed and then discarded — are now returned as `features` so callers
 * can choose a fallback path. The code is also laid out on separate lines.
 *
 * @returns {{supported: boolean, reason?: string, features: Object}}
 *   `supported` is false with a `reason` when a hard requirement is missing.
 */
function checkBrowserCompatibility() {
  const features = {
    mediaDevices: Boolean(navigator.mediaDevices),
    getUserMedia: Boolean(navigator.mediaDevices?.getUserMedia),
    audioWorklet: Boolean(window.AudioWorkletNode),
    webGL: Boolean(document.createElement('canvas').getContext('webgl')),
    wasm: typeof WebAssembly === 'object',
  };

  if (!features.mediaDevices || !features.getUserMedia) {
    return {
      supported: false,
      reason: '浏览器不支持麦克风访问API',
      features,
    };
  }

  if (!features.webGL) {
    return {
      supported: false,
      reason: '浏览器不支持WebGL，无法加速模型推理',
      features,
    };
  }

  return { supported: true, features };
}

6.2 备用方案实现

为不支持的浏览器提供备用方案：

/**
 * Fallback capture path using the legacy ScriptProcessorNode.
 *
 * Fixes over the original snippet:
 * - The samples are copied before being handed on. A ScriptProcessorNode's
 *   `inputBuffer` is only valid inside the `audioprocess` handler, so passing
 *   the live array to asynchronous consumers can expose overwritten data.
 * - The consumer can be passed as `onAudioData`; when omitted, the global
 *   `processAudioData` is used as before.
 * - The code is laid out on separate lines (the collapsed one-liner was
 *   commented out by its first `//`).
 *
 * @param {MediaStream} stream - Microphone stream to read from.
 * @param {Function} [onAudioData] - Receives a Float32Array copy per block.
 * @returns {Promise<{audioContext: AudioContext, processor: ScriptProcessorNode}>}
 */
async function setupFallback(stream, onAudioData) {
  const audioContext = new AudioContext({ sampleRate: 16000 });
  const source = audioContext.createMediaStreamSource(stream);
  const processor = audioContext.createScriptProcessor(4096, 1, 1);

  processor.onaudioprocess = (event) => {
    // Copy: the underlying buffer must not be used after this handler returns.
    const samples = new Float32Array(event.inputBuffer.getChannelData(0));
    const deliver = onAudioData ?? processAudioData;
    deliver(samples);
  };

  source.connect(processor);
  processor.connect(audioContext.destination);

  return { audioContext, processor };
}

7. 完整实现示例

7.1 主控制类

/**
 * Top-level controller: loads the model, captures microphone audio through
 * an AudioWorklet and reports transcripts via overridable callbacks.
 *
 * Fixes over the original snippet:
 * - `stopRecording()` now stops the microphone tracks and awaits
 *   `AudioContext.close()`; previously the microphone stayed open.
 * - A failed `startRecording()` releases the stream and context it opened.
 * - Errors raised while handling a worklet message are caught and reported
 *   through `onError` instead of becoming unhandled rejections.
 * - `startRecording()` rejects early when `initialize()` has not succeeded.
 * - The code is laid out on separate lines (the collapsed one-liner was
 *   commented out by its first `//`).
 */
class SpeechRecognizer {
  constructor() {
    this.isRecording = false;
    this.audioContext = null;
    this.workletNode = null;
    this.modelSession = null;
    this.recognizer = null;
    this.stream = null;
    this.sourceNode = null;
  }

  /**
   * Checks browser support and loads the model.
   *
   * @returns {Promise<boolean>} `true` on success, `false` after logging the
   *   failure.
   */
  async initialize() {
    try {
      // Check browser compatibility.
      const compatibility = checkBrowserCompatibility();
      if (!compatibility.supported) {
        throw new Error(compatibility.reason);
      }

      // Load the model.
      this.modelSession = await loadModel();
      this.recognizer = new StreamingRecognizer(this.modelSession);
      return true;
    } catch (error) {
      console.error('初始化失败:', error);
      return false;
    }
  }

  /**
   * Opens the microphone and starts streaming audio to the recognizer.
   * Does nothing when already recording.
   *
   * @throws Rethrows any failure after logging it and releasing resources.
   */
  async startRecording() {
    if (this.isRecording) {
      return;
    }

    try {
      if (!this.recognizer) {
        throw new Error('SpeechRecognizer: call initialize() successfully before startRecording()');
      }

      this.stream = await setupMicrophone();
      this.audioContext = new AudioContext({ sampleRate: 16000 });

      // Wire up audio processing.
      await this.setupAudioProcessing(this.stream);

      this.isRecording = true;
      this.onStatusChange?.(true);
    } catch (error) {
      console.error('开始录音失败:', error);
      this.isRecording = false;
      await this.releaseAudioResources();
      throw error;
    }
  }

  /**
   * Stops recording and releases the microphone and audio graph.
   * Does nothing when not recording.
   */
  async stopRecording() {
    if (!this.isRecording) {
      return;
    }

    this.isRecording = false;
    await this.releaseAudioResources();
    this.onStatusChange?.(false);
  }

  /**
   * Loads the worklet module and connects source -> worklet -> destination.
   *
   * @param {MediaStream} stream - Microphone stream to process.
   */
  async setupAudioProcessing(stream) {
    // Register the AudioWorklet module.
    await this.audioContext.audioWorklet.addModule('audio-processor.js');

    this.sourceNode = this.audioContext.createMediaStreamSource(stream);
    this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');

    // Feed each block from the worklet to the recognizer.
    this.workletNode.port.onmessage = async (event) => {
      try {
        const text = await this.recognizer.processChunk(event.data);
        if (text) {
          this.onTranscript?.(text);
        }
      } catch (error) {
        console.error('识别失败:', error);
        this.onError?.(error);
      }
    };

    this.sourceNode.connect(this.workletNode);
    this.workletNode.connect(this.audioContext.destination);
  }

  /**
   * Detaches the audio graph, stops the microphone tracks and closes the
   * audio context. Safe to call when some or all of them were never created.
   */
  async releaseAudioResources() {
    if (this.workletNode) {
      this.workletNode.port.onmessage = null;
      this.workletNode.disconnect();
    }
    this.sourceNode?.disconnect();
    // Stopping the tracks is what actually releases the microphone.
    this.stream?.getTracks().forEach((track) => track.stop());

    const context = this.audioContext;
    this.workletNode = null;
    this.sourceNode = null;
    this.stream = null;
    this.audioContext = null;

    if (context && context.state !== 'closed') {
      try {
        await context.close();
      } catch (closeError) {
        // Do not let a close failure mask the error that triggered cleanup.
        console.warn('AudioContext close failed:', closeError);
      }
    }
  }

  // Event callbacks; assign replacements on the instance.
  onTranscript(text) {}

  onStatusChange(recording) {}

  onError(error) {}
}

7.2 使用示例

// Create the speech recognizer.
const recognizer = new SpeechRecognizer();

// Register callbacks.
recognizer.onTranscript = (text) => {
  console.log('识别结果:', text);
  // Append the recognized text to the transcript element.
  document.getElementById('transcript').textContent += `${text} `;
};

recognizer.onStatusChange = (recording) => {
  console.log('录音状态:', recording ? '开始' : '停止');
};

/**
 * Initializes the recognizer and, on success, binds the start/stop buttons.
 *
 * The promises returned by `startRecording()` / `stopRecording()` and by
 * `setup()` itself were previously left floating, so a rejected start (for
 * example a denied microphone permission) surfaced as an unhandled rejection.
 * They now end in a `catch`. The snippet is also laid out on separate lines;
 * the collapsed one-liner was commented out by its leading `//`.
 */
async function setup() {
  const initialized = await recognizer.initialize();
  if (!initialized) {
    return;
  }

  document.getElementById('startBtn').addEventListener('click', () => {
    recognizer.startRecording().catch((error) => {
      console.error('录音控制失败:', error);
    });
  });

  document.getElementById('stopBtn').addEventListener('click', () => {
    recognizer.stopRecording().catch((error) => {
      console.error('录音控制失败:', error);
    });
  });
}

// Start.
setup().catch((error) => {
  console.error('初始化失败:', error);
});

8. 实际应用建议

8.1 性能监控与调优

实时监控系统性能并及时调整：

/**
 * Keeps a rolling window of inference timings and asks for a smaller buffer
 * when the average gets too high.
 *
 * The snippet was previously collapsed onto one line, so its first inline
 * `//` comment swallowed the rest of the class; it is laid out on separate
 * lines here, and `reduce` is given an explicit initial value.
 */
class PerformanceMonitor {
  constructor() {
    this.stats = {
      inferenceTime: [],
      audioBufferSize: [],
      memoryUsage: [],
    };
    this.startTime = performance.now();
  }

  /**
   * Records one inference duration and triggers a buffer-size decrease when
   * the rolling average exceeds 100.
   *
   * @param {number} time - Inference duration (the 100 threshold is treated
   *   as milliseconds by the original comment).
   */
  recordInferenceTime(time) {
    const samples = this.stats.inferenceTime;
    samples.push(time);

    // Keep only the 100 most recent records.
    if (samples.length > 100) {
      samples.shift();
    }

    // If the average inference time is too long, shrink the buffer.
    const total = samples.reduce((sum, value) => sum + value, 0);
    const avgTime = total / samples.length;
    if (avgTime > 100) {
      this.adjustBufferSize('decrease');
    }
  }

  /**
   * Hook for adapting the buffer size; intentionally empty in the source.
   *
   * @param {string} direction - Requested change, e.g. 'decrease'.
   */
  adjustBufferSize(direction) {
    // Adjust the buffer size according to measured performance.
  }
}

8.2 错误处理与恢复

完善的错误处理机制：

/**
 * Retry and error-dispatch helpers.
 *
 * Fixes over the original snippet:
 * - `withRetry` with `maxRetries` of 0 or less never ran the operation and
 *   ended in `throw undefined`; the operation now always runs at least once.
 * - `handleRecognitionError` crashed on errors without a `message` (for
 *   example a thrown string or `null`); those are now handled.
 * - The back-off base delay is an optional third parameter (default 1000 ms,
 *   as before).
 * - The code is laid out on separate lines (the collapsed one-liner was
 *   commented out by its first `//`).
 */
class ErrorHandler {
  /**
   * Runs `operation`, retrying with exponential back-off on failure.
   *
   * @param {Function} operation - Async function to run.
   * @param {number} [maxRetries=3] - Maximum number of attempts (minimum 1).
   * @param {number} [baseDelayMs=1000] - Delay unit; the wait after attempt
   *   `k` is `2^k * baseDelayMs`.
   * @returns {Promise<*>} The operation's result.
   * @throws The error of the last failed attempt.
   */
  static async withRetry(operation, maxRetries = 3, baseDelayMs = 1000) {
    const attempts = Math.max(1, maxRetries);
    let lastError;

    for (let attempt = 1; attempt <= attempts; attempt++) {
      try {
        return await operation();
      } catch (error) {
        lastError = error;
        console.warn(`操作失败，尝试 ${attempt}/${maxRetries}:`, error);

        if (attempt < attempts) {
          // Exponential back-off.
          const delayMs = 2 ** attempt * baseDelayMs;
          await new Promise((resolve) => setTimeout(resolve, delayMs));
        }
      }
    }

    throw lastError;
  }

  /**
   * Dispatches a recognition error to the matching UI reaction.
   *
   * @param {*} error - The caught value; may lack `name` or `message`.
   */
  static handleRecognitionError(error) {
    const message =
      typeof error === 'string' ? error : String(error?.message ?? '');

    if (error?.name === 'NotAllowedError') {
      // The user denied microphone permission.
      showPermissionPrompt();
    } else if (message.includes('network')) {
      // Network error.
      showNetworkError();
    } else {
      // Anything else.
      console.error('识别错误:', error);
    }
  }
}