From 631cdafdb5fd167626199f184d4bc9489950d77f Mon Sep 17 00:00:00 2001 From: Till JS Date: Sun, 12 Apr 2026 16:17:56 +0200 Subject: [PATCH] feat(voice): route STT through local Whisper when model is loaded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit transcribeAudio() now checks localSTT.isReady before falling back to the server-side mana-stt proxy. When local STT is active, audio blobs are decoded to Float32Array via AudioContext.decodeAudioData() and transcribed entirely on-device. The returned model field shows "Whisper Tiny (lokal)" or similar so every module (dreams, memoro, habits) displays which backend was used — no module code changed. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../mana/apps/web/src/lib/voice/transcribe.ts | 70 +++++++++++++++++-- 1 file changed, 64 insertions(+), 6 deletions(-) diff --git a/apps/mana/apps/web/src/lib/voice/transcribe.ts b/apps/mana/apps/web/src/lib/voice/transcribe.ts index 99290f65b..9a8228f1e 100644 --- a/apps/mana/apps/web/src/lib/voice/transcribe.ts +++ b/apps/mana/apps/web/src/lib/voice/transcribe.ts @@ -2,25 +2,83 @@ * Shared voice transcription helper. * * All modules that need speech-to-text use this single function instead - * of inlining the fetch call. It posts the audio blob to the SvelteKit - * proxy at /api/v1/voice/transcribe, which forwards to mana-stt. + * of inlining the fetch call. It routes to either: + * + * 1. **Local STT** (@mana/local-stt, Whisper in browser via WebGPU) + * when the model is loaded and ready — fully on-device, no network. + * 2. **Server STT** (mana-stt via /api/v1/voice/transcribe) as + * fallback when local STT is not available/loaded. + * + * The returned `model` field tells the caller which backend was used + * (e.g. "whisper-tiny (lokal)" vs "whisperx-large-v3"). 
*/ +import { localSTT } from '@mana/local-stt'; + export interface TranscribeResult { text: string; language: string | null; durationSeconds: number | null; - /** STT backend/model identifier returned by mana-stt (e.g. "whisperx-large-v3"). */ + /** STT backend/model identifier (e.g. "whisper-tiny (lokal)" or "whisperx-large-v3"). */ model: string | null; } /** - * Transcribe an audio blob via the server-side STT proxy. + * Transcribe an audio blob. Routes to local STT if available, + * otherwise falls back to the server-side STT proxy. * - * @throws on HTTP errors or network failures — callers are expected to - * handle errors in a module-specific way (update status, show toast, etc.). + * @throws on errors — callers handle them in a module-specific way. */ export async function transcribeAudio(blob: Blob, language?: string): Promise<TranscribeResult> { + // Prefer local STT when the model is loaded and ready + if (localSTT.isReady) { + return transcribeLocal(blob, language); + } + return transcribeServer(blob, language); +} + +// ─── Local STT (Whisper in browser) ──────────────────────────── + +async function transcribeLocal(blob: Blob, language?: string): Promise<TranscribeResult> { + const audio = await blobToFloat32(blob); + const durationSeconds = audio.length / 16000; + + const result = await localSTT.transcribe({ + audio, + language, + }); + + const modelName = localSTT.modelConfig?.displayName ?? 'Whisper'; + + return { + text: (result.text ?? '').trim(), + language: result.language ?? language ?? null, + durationSeconds, + model: `${modelName} (lokal)`, + }; +} + +/** + * Decode an audio Blob (webm/opus, mp4, etc.) into Float32Array at 16 kHz mono + * using the browser's built-in AudioContext decoder. This avoids needing + * ffmpeg or any external library. 
*/ +async function blobToFloat32(blob: Blob): Promise<Float32Array> { + const arrayBuffer = await blob.arrayBuffer(); + const audioContext = new AudioContext({ sampleRate: 16000 }); + + try { + const decoded = await audioContext.decodeAudioData(arrayBuffer); + // Take the first channel (mono) + return decoded.getChannelData(0); + } finally { + await audioContext.close(); + } +} + +// ─── Server STT (mana-stt proxy) ────────────────────────────── + +async function transcribeServer(blob: Blob, language?: string): Promise<TranscribeResult> { + const form = new FormData(); + const ext = blob.type.includes('webm') ? '.webm' : blob.type.includes('mp4') ? '.m4a' : '.audio'; + form.append('file', blob, `voice${ext}`);