From 631cdafdb5fd167626199f184d4bc9489950d77f Mon Sep 17 00:00:00 2001 From: Till JS Date: Sun, 12 Apr 2026 16:17:56 +0200 Subject: [PATCH] feat(voice): route STT through local Whisper when model is loaded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit transcribeAudio() now checks localSTT.isReady before falling back to the server-side mana-stt proxy. When local STT is active, audio blobs are decoded to Float32Array via AudioContext.decodeAudioData() and transcribed entirely on-device. The returned model field shows "Whisper Tiny (lokal)" or similar so every module (dreams, memoro, habits) displays which backend was used — no module code changed. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../mana/apps/web/src/lib/voice/transcribe.ts | 70 +++++++++++++++++-- 1 file changed, 64 insertions(+), 6 deletions(-) diff --git a/apps/mana/apps/web/src/lib/voice/transcribe.ts b/apps/mana/apps/web/src/lib/voice/transcribe.ts index 99290f65b..9a8228f1e 100644 --- a/apps/mana/apps/web/src/lib/voice/transcribe.ts +++ b/apps/mana/apps/web/src/lib/voice/transcribe.ts @@ -2,25 +2,83 @@ * Shared voice transcription helper. * * All modules that need speech-to-text use this single function instead - * of inlining the fetch call. It posts the audio blob to the SvelteKit - * proxy at /api/v1/voice/transcribe, which forwards to mana-stt. + * of inlining the fetch call. It routes to either: + * + * 1. **Local STT** (@mana/local-stt, Whisper in browser via WebGPU) + * when the model is loaded and ready — fully on-device, no network. + * 2. **Server STT** (mana-stt via /api/v1/voice/transcribe) as + * fallback when local STT is not available/loaded. + * + * The returned `model` field tells the caller which backend was used + * (e.g. "whisper-tiny (lokal)" vs "whisperx-large-v3"). 
*/ +import { localSTT } from '@mana/local-stt'; + export interface TranscribeResult { text: string; language: string | null; durationSeconds: number | null; - /** STT backend/model identifier returned by mana-stt (e.g. "whisperx-large-v3"). */ + /** STT backend/model identifier (e.g. "whisper-tiny (lokal)" or "whisperx-large-v3"). */ model: string | null; } /** - * Transcribe an audio blob via the server-side STT proxy. + * Transcribe an audio blob. Routes to local STT if available, + * otherwise falls back to the server-side STT proxy. * - * @throws on HTTP errors or network failures — callers are expected to - * handle errors in a module-specific way (update status, show toast, etc.). + * @throws on errors — callers handle them in a module-specific way. */ export async function transcribeAudio(blob: Blob, language?: string): Promise<TranscribeResult> { + // Prefer local STT when the model is loaded and ready + if (localSTT.isReady) { + return transcribeLocal(blob, language); + } + return transcribeServer(blob, language); +} + +// ─── Local STT (Whisper in browser) ──────────────────────────── + +async function transcribeLocal(blob: Blob, language?: string): Promise<TranscribeResult> { + const audio = await blobToFloat32(blob); + const durationSeconds = audio.length / 16000; + + const result = await localSTT.transcribe({ + audio, + language, + }); + + const modelName = localSTT.modelConfig?.displayName ?? 'Whisper'; + + return { + text: (result.text ?? '').trim(), + language: result.language ?? language ?? null, + durationSeconds, + model: `${modelName} (lokal)`, + }; +} + +/** + * Decode an audio Blob (webm/opus, mp4, etc.) into Float32Array at 16 kHz mono + * using the browser's built-in AudioContext decoder. This avoids needing + * ffmpeg or any external library. 
*/ +async function blobToFloat32(blob: Blob): Promise<Float32Array> { + const arrayBuffer = await blob.arrayBuffer(); + const audioContext = new AudioContext({ sampleRate: 16000 }); + + try { + const decoded = await audioContext.decodeAudioData(arrayBuffer); + // Take the first channel (mono) + return decoded.getChannelData(0); + } finally { + await audioContext.close(); + } +} + +// ─── Server STT (mana-stt proxy) ────────────────────────────── + +async function transcribeServer(blob: Blob, language?: string): Promise<TranscribeResult> { + const form = new FormData(); + const ext = blob.type.includes('webm') ? '.webm' : blob.type.includes('mp4') ? '.m4a' : '.audio'; + form.append('file', blob, `voice${ext}`);