feat(voice): route STT through local Whisper when model is loaded

transcribeAudio() now checks localSTT.isReady before falling back to
the server-side mana-stt proxy. When local STT is active, audio blobs
are decoded to Float32Array via AudioContext.decodeAudioData() and
transcribed entirely on-device. The returned model field shows
"Whisper Tiny (lokal)" or similar, so every module (dreams, memoro,
habits) displays which backend was used; no module code changed.
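
A minimal sketch of how a calling module consumes the helper and surfaces
the backend. The import path, language code, and console logging below are
illustrative placeholders, not part of this change:

    import { transcribeAudio } from '$lib/voice/transcribe';

    async function onRecordingStopped(blob: Blob) {
      try {
        const result = await transcribeAudio(blob, 'de');
        // result.model is e.g. "Whisper Tiny (lokal)" or "whisperx-large-v3"
        console.log(result.text, result.model);
      } catch (err) {
        // each module handles failures its own way (status text, toast, ...)
        console.error('transcription failed', err);
      }
    }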

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Till JS 2026-04-12 16:17:56 +02:00
parent 14d11272c9
commit 631cdafdb5


@@ -2,25 +2,83 @@
 * Shared voice transcription helper.
 *
 * All modules that need speech-to-text use this single function instead
 * of inlining the fetch call. It routes to either:
 *
 * 1. **Local STT** (@mana/local-stt, Whisper in the browser via WebGPU)
 *    when the model is loaded and ready; fully on-device, no network.
 * 2. **Server STT** (mana-stt via /api/v1/voice/transcribe) as a
 *    fallback when local STT is not available/loaded.
 *
 * The returned `model` field tells the caller which backend was used
 * (e.g. "whisper-tiny (lokal)" vs "whisperx-large-v3").
 */
import { localSTT } from '@mana/local-stt';

export interface TranscribeResult {
  text: string;
  language: string | null;
  durationSeconds: number | null;
  /** STT backend/model identifier (e.g. "whisper-tiny (lokal)" or "whisperx-large-v3"). */
  model: string | null;
}

/**
 * Transcribe an audio blob. Routes to local STT if available,
 * otherwise falls back to the server-side STT proxy.
 *
 * @throws on errors; callers handle them in a module-specific way.
 */
export async function transcribeAudio(blob: Blob, language?: string): Promise<TranscribeResult> {
  // Prefer local STT when the model is loaded and ready
  if (localSTT.isReady) {
    return transcribeLocal(blob, language);
  }
  return transcribeServer(blob, language);
}

// ─── Local STT (Whisper in browser) ────────────────────────────

async function transcribeLocal(blob: Blob, language?: string): Promise<TranscribeResult> {
  const audio = await blobToFloat32(blob);
  // blobToFloat32 resamples to 16 kHz, so sample count / 16000 = duration in seconds
  const durationSeconds = audio.length / 16000;
  const result = await localSTT.transcribe({
    audio,
    language,
  });
  const modelName = localSTT.modelConfig?.displayName ?? 'Whisper';
  return {
    text: (result.text ?? '').trim(),
    language: result.language ?? language ?? null,
    durationSeconds,
    model: `${modelName} (lokal)`,
  };
}

/**
 * Decode an audio Blob (webm/opus, mp4, etc.) into a mono 16 kHz Float32Array
 * using the browser's built-in AudioContext decoder. This avoids needing
 * ffmpeg or any external library.
 */
async function blobToFloat32(blob: Blob): Promise<Float32Array> {
  const arrayBuffer = await blob.arrayBuffer();
  // decodeAudioData resamples the decoded audio to the context's sample rate (16 kHz here)
  const audioContext = new AudioContext({ sampleRate: 16000 });
  try {
    const decoded = await audioContext.decodeAudioData(arrayBuffer);
    // Take the first channel (mono)
    return decoded.getChannelData(0);
  } finally {
    await audioContext.close();
  }
}

// ─── Server STT (mana-stt proxy) ──────────────────────────────

async function transcribeServer(blob: Blob, language?: string): Promise<TranscribeResult> {
  const form = new FormData();
  const ext = blob.type.includes('webm') ? '.webm' : blob.type.includes('mp4') ? '.m4a' : '.audio';
  form.append('file', blob, `voice${ext}`);