mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 19:41:09 +02:00
feat(voice): route STT through local Whisper when model is loaded
transcribeAudio() now checks localSTT.isReady before falling back to the server-side mana-stt proxy. When local STT is active, audio blobs are decoded to Float32Array via AudioContext.decodeAudioData() and transcribed entirely on-device. The returned model field shows "Whisper Tiny (lokal)" or similar so every module (dreams, memoro, habits) displays which backend was used — no module code changed. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
14d11272c9
commit
631cdafdb5
1 changed files with 64 additions and 6 deletions
|
|
@ -2,25 +2,83 @@
|
|||
* Shared voice transcription helper.
|
||||
*
|
||||
* All modules that need speech-to-text use this single function instead
|
||||
* of inlining the fetch call. It posts the audio blob to the SvelteKit
|
||||
* proxy at /api/v1/voice/transcribe, which forwards to mana-stt.
|
||||
* of inlining the fetch call. It routes to either:
|
||||
*
|
||||
* 1. **Local STT** (@mana/local-stt, Whisper in browser via WebGPU)
|
||||
* when the model is loaded and ready — fully on-device, no network.
|
||||
* 2. **Server STT** (mana-stt via /api/v1/voice/transcribe) as
|
||||
* fallback when local STT is not available/loaded.
|
||||
*
|
||||
* The returned `model` field tells the caller which backend was used
|
||||
* (e.g. "whisper-tiny (lokal)" vs "whisperx-large-v3").
|
||||
*/
|
||||
|
||||
import { localSTT } from '@mana/local-stt';
|
||||
|
||||
export interface TranscribeResult {
|
||||
text: string;
|
||||
language: string | null;
|
||||
durationSeconds: number | null;
|
||||
/** STT backend/model identifier returned by mana-stt (e.g. "whisperx-large-v3"). */
|
||||
/** STT backend/model identifier (e.g. "whisper-tiny (lokal)" or "whisperx-large-v3"). */
|
||||
model: string | null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transcribe an audio blob via the server-side STT proxy.
|
||||
* Transcribe an audio blob. Routes to local STT if available,
|
||||
* otherwise falls back to the server-side STT proxy.
|
||||
*
|
||||
* @throws on HTTP errors or network failures — callers are expected to
|
||||
* handle errors in a module-specific way (update status, show toast, etc.).
|
||||
* @throws on errors — callers handle them in a module-specific way.
|
||||
*/
|
||||
export async function transcribeAudio(blob: Blob, language?: string): Promise<TranscribeResult> {
|
||||
// Prefer local STT when the model is loaded and ready
|
||||
if (localSTT.isReady) {
|
||||
return transcribeLocal(blob, language);
|
||||
}
|
||||
return transcribeServer(blob, language);
|
||||
}
|
||||
|
||||
// ─── Local STT (Whisper in browser) ────────────────────────────
|
||||
|
||||
async function transcribeLocal(blob: Blob, language?: string): Promise<TranscribeResult> {
|
||||
const audio = await blobToFloat32(blob);
|
||||
const durationSeconds = audio.length / 16000;
|
||||
|
||||
const result = await localSTT.transcribe({
|
||||
audio,
|
||||
language,
|
||||
});
|
||||
|
||||
const modelName = localSTT.modelConfig?.displayName ?? 'Whisper';
|
||||
|
||||
return {
|
||||
text: (result.text ?? '').trim(),
|
||||
language: result.language ?? language ?? null,
|
||||
durationSeconds,
|
||||
model: `${modelName} (lokal)`,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode an audio Blob (webm/opus, mp4, etc.) into Float32Array at 16 kHz mono
|
||||
* using the browser's built-in AudioContext decoder. This avoids needing
|
||||
* ffmpeg or any external library.
|
||||
*/
|
||||
async function blobToFloat32(blob: Blob): Promise<Float32Array> {
|
||||
const arrayBuffer = await blob.arrayBuffer();
|
||||
const audioContext = new AudioContext({ sampleRate: 16000 });
|
||||
|
||||
try {
|
||||
const decoded = await audioContext.decodeAudioData(arrayBuffer);
|
||||
// Take the first channel (mono)
|
||||
return decoded.getChannelData(0);
|
||||
} finally {
|
||||
await audioContext.close();
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Server STT (mana-stt proxy) ──────────────────────────────
|
||||
|
||||
async function transcribeServer(blob: Blob, language?: string): Promise<TranscribeResult> {
|
||||
const form = new FormData();
|
||||
const ext = blob.type.includes('webm') ? '.webm' : blob.type.includes('mp4') ? '.m4a' : '.audio';
|
||||
form.append('file', blob, `voice${ext}`);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue