feat(memoro): voice recording → mana-stt transcription pipeline

Adds end-to-end browser voice capture for the Memoro module, mirroring the
existing dreams pattern: MediaRecorder → SvelteKit server proxy → mana-stt
on the Windows GPU box via Cloudflare tunnel.

Recording UI lives in /memoro page header (mic button + live timer + cancel +
sticky-permission retry). Server proxy at /api/v1/memoro/transcribe forwards
the blob with the server-held X-API-Key. memosStore.createFromVoice creates a
placeholder memo with processingStatus='processing' and fires transcribeBlob
in the background, which writes the transcript and flips status on completion
(or 'failed' with error in metadata).

Also corrects the mana-stt hostname across the repo: stt-api.mana.how (which
never existed in DNS) → gpu-stt.mana.how (the actual Cloudflare tunnel route
to the Windows GPU box). Adds an ENVIRONMENT_VARIABLES.md section explaining
how to obtain MANA_STT_API_KEY and where the tunnel terminates. Adds tunnel
health probes to the mac-mini health-check script so we catch tunnel-side
breakage in addition to LAN-side.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-07 18:48:41 +02:00
parent 4d9bf78f41
commit c5aeaf5e7f
9 changed files with 568 additions and 8 deletions

View file

@ -288,10 +288,11 @@ CALENDAR_BACKEND_URL=http://localhost:3014
CALENDAR_DATABASE_URL=postgresql://mana:devpassword@localhost:5432/mana_platform
# Speech-to-Text Service (mana-stt)
# Production: https://stt-api.mana.how
# Local dev: http://localhost:3020
STT_URL=https://stt-api.mana.how
# API key for mana-stt (set in your local .env, never commit a real key)
# Production: https://gpu-stt.mana.how (Cloudflare tunnel → Windows GPU box)
# Local dev: http://localhost:3020 (or http://192.168.178.11:3020 from LAN)
STT_URL=https://gpu-stt.mana.how
# API key for mana-stt — DO NOT COMMIT a real key.
# See docs/ENVIRONMENT_VARIABLES.md for where to obtain it.
MANA_STT_API_KEY=
# ============================================

View file

@ -0,0 +1,245 @@
/**
* Browser audio recorder for the Memoro voice-capture feature.
*
* Uses MediaRecorder under the hood. Exposes a small reactive state object
* that components can read to render the mic button state and elapsed time.
*/
export type RecorderStatus = 'idle' | 'requesting' | 'recording' | 'stopping';
export interface RecordingResult {
blob: Blob;
durationMs: number;
mimeType: string;
}
class MemoRecorder {
// Reactive UI state (Svelte 5 runes): button state, error banner, live timer.
status = $state<RecorderStatus>('idle');
error = $state<string | null>(null);
elapsedMs = $state(0);
// Internal (non-reactive) recording machinery.
#mediaRecorder: MediaRecorder | null = null;
#stream: MediaStream | null = null;
#chunks: Blob[] = [];
#startedAt = 0;
#tickHandle: ReturnType<typeof setInterval> | null = null;
// Promise callbacks for the in-flight stop() call, if any.
#resolve: ((result: RecordingResult) => void) | null = null;
#reject: ((reason: Error) => void) | null = null;
// True when this environment exposes getUserMedia and MediaRecorder
// (false during SSR, where navigator/MediaRecorder are undefined).
get isAvailable(): boolean {
return (
typeof navigator !== 'undefined' &&
!!navigator.mediaDevices?.getUserMedia &&
typeof MediaRecorder !== 'undefined'
);
}
// getUserMedia is only available in secure contexts (https:// or localhost).
get isSecureContext(): boolean {
return typeof window !== 'undefined' && window.isSecureContext === true;
}
/**
 * Request the microphone and begin recording.
 *
 * Never throws: every failure path sets `error` to a user-facing (German)
 * message and leaves status at 'idle'. Pass `force: true` to skip the
 * sticky-permission pre-check so the browser's own getUserMedia error
 * surfaces (used by the "Trotzdem versuchen" retry button).
 */
async start(options: { force?: boolean } = {}): Promise<void> {
if (this.status !== 'idle') return;
if (!this.isSecureContext) {
const host = typeof window !== 'undefined' ? window.location.host : '';
this.error = `Mikrofon-Zugriff braucht eine sichere Verbindung. Öffne die App über https:// oder http://localhost statt http://${host}.`;
return;
}
if (!this.isAvailable) {
this.error = 'Audio-Aufnahme wird in diesem Browser nicht unterstützt.';
return;
}
// Detect a sticky "denied" permission up front so we can show actionable
// help instead of an instant, silent getUserMedia rejection.
if (!options.force) {
const stickyDenied = await this.#checkStickyDeny();
if (stickyDenied) {
this.error = this.#stickyDenyMessage();
return;
}
}
this.error = null;
this.status = 'requesting';
try {
this.#stream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
},
});
} catch (e) {
this.error = this.#explainError(e);
this.status = 'idle';
return;
}
// Prefer an explicitly supported container/codec; empty options fall back
// to the browser's default recording format.
const mimeType = pickSupportedMimeType();
try {
this.#mediaRecorder = new MediaRecorder(this.#stream, mimeType ? { mimeType } : {});
} catch (e) {
const msg = e instanceof Error ? e.message : String(e);
this.error = `MediaRecorder konnte nicht gestartet werden: ${msg}`;
this.#cleanupStream();
this.status = 'idle';
return;
}
this.#chunks = [];
this.#mediaRecorder.ondataavailable = (event) => {
if (event.data && event.data.size > 0) this.#chunks.push(event.data);
};
this.#mediaRecorder.onerror = (event: Event) => {
const err = (event as Event & { error?: Error }).error;
this.#failWith(err ?? new Error('MediaRecorder error'));
};
// onstop fires after stop(): assemble the final Blob from the buffered
// chunks and resolve the promise handed out by stop().
this.#mediaRecorder.onstop = () => {
const durationMs = this.elapsedMs;
const type = this.#mediaRecorder?.mimeType || mimeType || 'audio/webm';
const blob = new Blob(this.#chunks, { type });
this.#cleanupStream();
this.#cleanupTimer();
this.status = 'idle';
this.elapsedMs = 0;
const resolve = this.#resolve;
this.#resolve = null;
this.#reject = null;
resolve?.({ blob, durationMs, mimeType: type });
};
this.#startedAt = Date.now();
this.elapsedMs = 0;
// 100 ms tick keeps the visible m:ss timer smooth without burning CPU.
this.#tickHandle = setInterval(() => {
this.elapsedMs = Date.now() - this.#startedAt;
}, 100);
this.#mediaRecorder.start();
this.status = 'recording';
}
/**
 * Finish the recording. Resolves with the assembled audio Blob once the
 * MediaRecorder's onstop callback has run; rejects with Error('cancelled')
 * if cancel() is called first, or with the recorder error on failure.
 */
stop(): Promise<RecordingResult> {
if (this.status !== 'recording' || !this.#mediaRecorder) {
return Promise.reject(new Error('Not recording'));
}
this.status = 'stopping';
return new Promise<RecordingResult>((resolve, reject) => {
this.#resolve = resolve;
this.#reject = reject;
this.#mediaRecorder?.stop();
});
}
/**
 * Abort the recording and discard any captured audio. A pending stop()
 * promise is rejected with Error('cancelled').
 */
cancel(): void {
if (this.status === 'idle') return;
this.#cleanupStream();
this.#cleanupTimer();
this.#mediaRecorder = null;
this.#chunks = [];
this.elapsedMs = 0;
this.status = 'idle';
const reject = this.#reject;
this.#resolve = null;
this.#reject = null;
reject?.(new Error('cancelled'));
}
// Shared failure path: surface the message, release resources, reset state,
// and reject a pending stop() promise if one exists.
#failWith(err: Error) {
this.error = err.message;
this.#cleanupStream();
this.#cleanupTimer();
this.status = 'idle';
this.elapsedMs = 0;
const reject = this.#reject;
this.#resolve = null;
this.#reject = null;
reject?.(err);
}
// Multi-line help text for a sticky permission denial, with macOS/iOS
// specific steps when navigator.platform looks like an Apple device.
#stickyDenyMessage(): string {
const isMac =
typeof navigator !== 'undefined' && /Mac|iPhone|iPad/i.test(navigator.platform || '');
if (isMac) {
return [
'Mikrofon-Zugriff blockiert. Auf macOS hat das fast immer eine von zwei Ursachen:',
'1) System-Einstellungen → Datenschutz & Sicherheit → Mikrofon: dein Browser muss in der Liste aktiviert sein. Wenn er fehlt oder deaktiviert ist, schalte ihn ein und starte den Browser komplett neu (Cmd+Q, nicht nur Tab schließen).',
'2) Browser-Einstellung: chrome://settings/content/microphone (Chrome) oder about:preferences#privacy (Firefox) → "localhost" darf nicht in der Block-Liste stehen.',
'Tipp: Klicke auf "Trotzdem versuchen" um den exakten Browser-Fehler zu sehen.',
].join('\n');
}
return [
'Mikrofon-Zugriff blockiert. Mögliche Ursachen:',
'1) Browser-Einstellungen → Mikrofon → "localhost" darf nicht blockiert sein.',
'2) System-Einstellungen → Datenschutz → Mikrofon → Browser muss erlaubt sein.',
'Tipp: Klicke auf "Trotzdem versuchen" um den exakten Browser-Fehler zu sehen.',
].join('\n');
}
// Returns true when the Permissions API reports microphone access as
// 'denied'. Conservative: a missing API or a query failure returns false,
// so we fall through to a real getUserMedia attempt.
async #checkStickyDeny(): Promise<boolean> {
try {
const perms = (
navigator as Navigator & {
permissions?: {
query: (descriptor: { name: string }) => Promise<{ state: string }>;
};
}
).permissions;
if (!perms?.query) return false;
const status = await perms.query({ name: 'microphone' });
return status.state === 'denied';
} catch {
return false;
}
}
// Translate a getUserMedia failure into a user-facing German message:
// matches the DOMException name first, message text as a fallback.
#explainError(e: unknown): string {
const err = e instanceof Error ? e : new Error(String(e));
const name = err.name || '';
const msg = err.message || '';
if (name === 'NotAllowedError' || /denied|permission/i.test(msg)) {
return 'Mikrofon-Zugriff wurde verweigert. Klicke in der Adressleiste auf das Schloss-Symbol und erlaube den Zugriff.';
}
if (name === 'NotFoundError' || /not.?found|no.?device/i.test(msg)) {
return 'Kein Mikrofon gefunden. Schließe ein Mikrofon an oder prüfe deine System-Einstellungen.';
}
if (name === 'NotReadableError' || /in use|busy/i.test(msg)) {
return 'Mikrofon ist gerade von einer anderen Anwendung belegt.';
}
if (name === 'SecurityError') {
return 'Mikrofon-Zugriff vom Browser blockiert (Sicherheitsrichtlinie).';
}
return `Mikrofon konnte nicht geöffnet werden: ${msg || name || 'Unbekannter Fehler'}`;
}
// Stop all tracks so the browser's "mic in use" indicator turns off.
#cleanupStream() {
this.#stream?.getTracks().forEach((t) => t.stop());
this.#stream = null;
}
#cleanupTimer() {
if (this.#tickHandle !== null) {
clearInterval(this.#tickHandle);
this.#tickHandle = null;
}
}
}
/**
 * Choose the first audio container/codec this browser's MediaRecorder can
 * produce, in order of preference. Returns null when MediaRecorder is not
 * available (e.g. SSR) or none of the preferred types is supported, in which
 * case the caller lets the browser pick its default format.
 */
function pickSupportedMimeType(): string | null {
if (typeof MediaRecorder === 'undefined') return null;
const preferred: readonly string[] = [
  'audio/webm;codecs=opus',
  'audio/webm',
  'audio/ogg;codecs=opus',
  'audio/mp4',
];
return preferred.find((type) => MediaRecorder.isTypeSupported(type)) ?? null;
}
export const memoRecorder = new MemoRecorder();
/**
 * Format a millisecond duration as an m:ss timer string (e.g. 65000 → "1:05").
 * Minutes are not capped, so long recordings render as "12:07", "60:00", etc.
 */
export function formatElapsed(ms: number): string {
const seconds = Math.floor(ms / 1000);
const mm = Math.floor(seconds / 60);
const ss = String(seconds % 60).padStart(2, '0');
return `${mm}:${ss}`;
}

View file

@ -23,14 +23,16 @@ export const memosStore = {
transcript?: string;
language?: string;
blueprintId?: string;
audioDurationMs?: number;
processingStatus?: LocalMemo['processingStatus'];
}) {
const newLocal: LocalMemo = {
id: crypto.randomUUID(),
title: data.title ?? null,
intro: null,
transcript: data.transcript ?? null,
audioDurationMs: null,
processingStatus: data.transcript ? 'completed' : 'pending',
audioDurationMs: data.audioDurationMs ?? null,
processingStatus: data.processingStatus ?? (data.transcript ? 'completed' : 'pending'),
isArchived: false,
isPinned: false,
isPublic: false,
@ -42,6 +44,73 @@ export const memosStore = {
return toMemo(newLocal);
},
/**
 * Create a placeholder memo for a just-finished voice recording and kick off
 * transcription in the background. The memo is returned immediately (with
 * processingStatus 'processing') so the UI can navigate without waiting.
 */
async createFromVoice(blob: Blob, durationMs: number, language?: string) {
  const placeholder = await this.create({
    audioDurationMs: durationMs,
    language,
    processingStatus: 'processing',
  });
  // Deliberately not awaited — transcribeBlob writes its result back itself.
  void this.transcribeBlob(placeholder.id, blob, language);
  return placeholder;
},
/**
 * Upload an audio blob to /api/v1/memoro/transcribe and write the result
 * back into the memo: 'completed' with the transcript on success, 'failed'
 * with the error message in metadata on any error.
 *
 * Safe to fire-and-forget: every failure path (including a storage failure
 * while marking the memo 'failed') is contained here, so callers using
 * `void this.transcribeBlob(...)` never see an unhandled rejection.
 */
async transcribeBlob(memoId: string, blob: Blob, language?: string): Promise<void> {
  try {
    const form = new FormData();
    // Give the upload a filename extension so the proxy / mana-stt can pick
    // a decoder; '.audio' is a neutral fallback for unknown types.
    const ext = blob.type.includes('webm')
      ? '.webm'
      : blob.type.includes('mp4')
        ? '.m4a'
        : '.audio';
    form.append('file', blob, `memo${ext}`);
    if (language) form.append('language', language);
    const response = await fetch('/api/v1/memoro/transcribe', {
      method: 'POST',
      body: form,
    });
    if (!response.ok) {
      const text = await response.text();
      throw new Error(text || `HTTP ${response.status}`);
    }
    const result = (await response.json()) as {
      text: string;
      language: string | null;
      durationSeconds: number | null;
    };
    const transcript = (result.text ?? '').trim();
    const existing = await memoTable.get(memoId);
    if (!existing) return; // memo was deleted while transcription was running
    await memoTable.update(memoId, {
      transcript,
      // Keep a language the user already set; otherwise adopt the detected one.
      language: existing.language ?? result.language ?? null,
      processingStatus: 'completed',
      updatedAt: new Date().toISOString(),
    });
  } catch (e) {
    const msg = e instanceof Error ? e.message : String(e);
    // Best-effort failure marking, guarded so a storage error here cannot
    // escape this fire-and-forgotten method as an unhandled rejection.
    try {
      const existing = await memoTable.get(memoId);
      if (!existing) return;
      await memoTable.update(memoId, {
        processingStatus: 'failed',
        metadata: { ...((existing.metadata as object) ?? {}), error: msg },
        updatedAt: new Date().toISOString(),
      });
    } catch {
      // Memo store unavailable — nothing more a background job can do.
    }
  }
},
/** Update a memo's fields. */
async update(
id: string,

View file

@ -275,7 +275,7 @@
<div>
<p class="text-sm font-medium mb-2">Speech-to-Text (STT)</p>
<pre class="bg-muted p-3 rounded-lg text-sm overflow-x-auto"><code
>curl -X POST https://stt-api.mana.how/transcribe \
>curl -X POST https://gpu-stt.mana.how/transcribe \
-H "X-API-Key: sk_live_your_key_here" \
-F "audio=@audio.mp3"</code
></pre>

View file

@ -2,6 +2,7 @@
import { goto } from '$app/navigation';
import { getContext } from 'svelte';
import { memosStore } from '$lib/modules/memoro/stores/memos.svelte';
import { memoRecorder, formatElapsed } from '$lib/modules/memoro/recorder.svelte';
import {
filterBySearch,
filterByTag,
@ -43,6 +44,44 @@
goto(`/memoro/${memo.id}`);
}
// ── Voice capture ─────────────────────────────────────────
let recError = $state<string | null>(null);
// Mic button toggles between starting a recording (idle) and finishing one
// (recording). The transient 'requesting'/'stopping' states are no-ops here —
// the button is disabled while they last.
async function handleMicClick() {
  recError = null;
  if (memoRecorder.status === 'idle') {
    await memoRecorder.start();
    if (memoRecorder.error) recError = memoRecorder.error;
    return;
  }
  if (memoRecorder.status !== 'recording') return;
  try {
    const result = await memoRecorder.stop();
    // Discard accidental taps shorter than half a second.
    if (result.durationMs < 500) {
      recError = 'Aufnahme war zu kurz.';
      return;
    }
    const memo = await memosStore.createFromVoice(result.blob, result.durationMs, 'de');
    goto(`/memoro/${memo.id}`);
  } catch (e) {
    const msg = e instanceof Error ? e.message : String(e);
    // A user-initiated cancel rejects with 'cancelled' — not an error to show.
    if (msg !== 'cancelled') recError = msg;
  }
}
// "Trotzdem versuchen": retry with force=true to bypass the sticky-permission
// pre-check, so the browser's own getUserMedia error surfaces for diagnosis.
async function forceRetryMic() {
  recError = null;
  await memoRecorder.start({ force: true });
  if (memoRecorder.error) recError = memoRecorder.error;
}
// Discard the in-progress recording without creating a memo.
function cancelRecording() {
memoRecorder.cancel();
}
async function handlePin(e: Event, id: string, isPinned: boolean) {
e.stopPropagation();
if (isPinned) {
@ -94,6 +133,41 @@
<TagIcon size={16} />
Tags
</a>
<button
onclick={handleMicClick}
disabled={memoRecorder.status === 'requesting' || memoRecorder.status === 'stopping'}
aria-label={memoRecorder.status === 'recording' ? 'Aufnahme beenden' : 'Aufnahme starten'}
class="flex items-center gap-2 rounded-lg px-4 py-2 text-sm font-medium transition-colors disabled:opacity-60"
class:recording={memoRecorder.status === 'recording'}
style:background-color={memoRecorder.status === 'recording'
? '#ef4444'
: 'hsl(var(--muted))'}
style:color={memoRecorder.status === 'recording' ? 'white' : 'hsl(var(--foreground))'}
>
{#if memoRecorder.status === 'recording'}
<span class="rec-dot"></span>
{formatElapsed(memoRecorder.elapsedMs)}
{:else if memoRecorder.status === 'requesting'}
<Microphone size={18} />
Mikro öffnen…
{:else if memoRecorder.status === 'stopping'}
<Microphone size={18} />
Verarbeite…
{:else}
<Microphone size={18} />
Aufnehmen
{/if}
</button>
{#if memoRecorder.status === 'recording'}
<button
onclick={cancelRecording}
title="Aufnahme verwerfen"
aria-label="Aufnahme verwerfen"
class="rounded-lg border border-[hsl(var(--border))] px-3 py-2 text-sm text-[hsl(var(--muted-foreground))] hover:bg-[hsl(var(--muted))]"
>
×
</button>
{/if}
<button
onclick={handleNewMemo}
class="flex items-center gap-2 rounded-lg bg-[hsl(var(--primary))] px-4 py-2 text-sm font-medium text-[hsl(var(--primary-foreground))] transition-colors hover:opacity-90"
@ -104,6 +178,17 @@
</div>
</div>
{#if recError}
<div
class="rounded-lg border border-red-500/30 bg-red-500/10 p-3 text-sm text-red-600 dark:text-red-300"
>
<p class="whitespace-pre-line">{recError}</p>
<button onclick={forceRetryMic} class="mt-2 text-xs font-medium underline hover:no-underline">
Trotzdem versuchen
</button>
</div>
{/if}
<!-- Search -->
<div class="relative">
<MagnifyingGlass
@ -260,3 +345,23 @@
</a>
</div>
</div>
<style>
/* Pulsing white dot shown inside the mic button while recording. */
.rec-dot {
display: inline-block;
width: 0.625rem;
height: 0.625rem;
border-radius: 9999px;
background: white;
animation: pulse 1s ease-in-out infinite;
}
/* Soft opacity pulse (1 → 0.4 → 1) once per second. */
@keyframes pulse {
0%,
100% {
opacity: 1;
}
50% {
opacity: 0.4;
}
}
</style>

View file

@ -0,0 +1,102 @@
/**
* POST /api/v1/memoro/transcribe
*
* Server-side proxy to mana-stt for the Memoro module's voice capture.
* The browser uploads an audio Blob; we forward it to mana-stt with the
* server-held API key and return the transcript JSON.
*
* Request: multipart/form-data with `file` (audio blob) and optional `language`
* Response: { text: string, language: string | null, durationSeconds: number | null }
*/
import { error, json } from '@sveltejs/kit';
import { env } from '$env/dynamic/private';
import type { RequestHandler } from './$types';
const MAX_BYTES = 25 * 1024 * 1024; // 25 MB
/**
 * Accept only plausible audio uploads. m4a recordings often report
 * `video/mp4`, and some browsers send no type at all (or a generic
 * application/octet-stream) — those are tolerated and validated upstream.
 */
function isAcceptableType(mime: string): boolean {
if (!mime || mime === 'application/octet-stream') return true;
return /^(audio|video)\//.test(mime);
}
/**
 * Handle the multipart upload: validate the blob, forward it to mana-stt with
 * the server-held API key, and normalize the upstream JSON shape.
 *
 * Errors (SvelteKit `error()`):
 *   503 — MANA_STT_URL not configured
 *   400/413/415 — client-side validation failures
 *   502 — mana-stt unreachable, returned an out-of-range status, or returned
 *         a body that is not valid JSON (e.g. a tunnel HTML error page)
 */
export const POST: RequestHandler = async ({ request }) => {
  const sttUrl = env.MANA_STT_URL;
  const apiKey = env.MANA_STT_API_KEY;
  if (!sttUrl) {
    throw error(503, 'mana-stt is not configured (MANA_STT_URL missing)');
  }
  let incoming: FormData;
  try {
    incoming = await request.formData();
  } catch {
    throw error(400, 'Expected multipart/form-data with a file field');
  }
  const file = incoming.get('file');
  const language = (incoming.get('language') as string | null) ?? null;
  if (!(file instanceof Blob)) {
    throw error(400, 'Missing file');
  }
  if (file.size === 0) {
    throw error(400, 'Empty audio');
  }
  if (file.size > MAX_BYTES) {
    throw error(413, `Audio too large (max ${MAX_BYTES / 1024 / 1024} MB)`);
  }
  if (!isAcceptableType(file.type)) {
    throw error(415, `Unsupported audio type: ${file.type}`);
  }
  // Re-wrap the upload with a filename so mana-stt can infer the container.
  const ext = mimeToExtension(file.type);
  const filename = `memo${ext}`;
  const upstream = new FormData();
  upstream.append('file', file, filename);
  if (language) upstream.append('language', language);
  const headers: Record<string, string> = { Accept: 'application/json' };
  if (apiKey) headers['X-API-Key'] = apiKey;
  let response: Response;
  try {
    response = await fetch(`${sttUrl.replace(/\/$/, '')}/transcribe`, {
      method: 'POST',
      headers,
      body: upstream,
    });
  } catch (e) {
    const msg = e instanceof Error ? e.message : String(e);
    throw error(502, `Could not reach mana-stt: ${msg}`);
  }
  if (!response.ok) {
    const text = await response.text();
    // SvelteKit's error() only accepts 400-599; map anything else (e.g. a
    // stray 3xx from a misrouted tunnel) to 502 instead of crashing.
    const status = response.status >= 400 && response.status <= 599 ? response.status : 502;
    throw error(status, `mana-stt error: ${text || response.statusText}`);
  }
  let result: { text?: string; language?: string; duration_seconds?: number };
  try {
    result = (await response.json()) as {
      text?: string;
      language?: string;
      duration_seconds?: number;
    };
  } catch {
    // A 200 with a non-JSON body (tunnel/proxy HTML page) is an upstream fault,
    // not an internal server error — report it as 502 with a clear message.
    throw error(502, 'mana-stt returned a non-JSON response');
  }
  return json({
    text: result.text ?? '',
    language: result.language ?? null,
    durationSeconds: result.duration_seconds ?? null,
  });
};
/**
 * Map a MIME type to a filename extension mana-stt recognizes.
 * Falls back to '.webm', the most common MediaRecorder output.
 */
function mimeToExtension(mime: string): string {
const table: ReadonlyArray<[string, string]> = [
  ['webm', '.webm'],
  ['ogg', '.ogg'],
  ['mp4', '.m4a'],
  ['m4a', '.m4a'],
  ['mpeg', '.mp3'],
  ['wav', '.wav'],
  ['flac', '.flac'],
];
for (const [needle, ext] of table) {
  if (mime.includes(needle)) return ext;
}
return '.webm';
}

View file

@ -111,6 +111,34 @@ The generator reads `.env.development` and creates app-specific `.env` files wit
| `CARDS_SUPABASE_URL` | Supabase project URL | - |
| `CARDS_SUPABASE_ANON_KEY` | Supabase anonymous key | - |
### Speech-to-Text (mana-stt)
Used by the unified Mana web app's voice features (Memoro recording, Dreams voice capture, etc).
The browser never talks to mana-stt directly — requests go through the SvelteKit server-side proxy
(`/api/v1/memoro/transcribe`, `/api/v1/dreams/transcribe`) which attaches the API key from
`MANA_STT_API_KEY`. Keep that key out of the browser bundle.
| Variable | Description | Default |
|----------|-------------|---------|
| `STT_URL` | Public mana-stt URL — generates `MANA_STT_URL` for the web app | `https://gpu-stt.mana.how` |
| `MANA_STT_API_KEY` | API key for mana-stt. **Never commit a real value.** | _(empty)_ |
**Where to obtain a key:**
- Production deploy: set `MANA_STT_API_KEY` in the Mac Mini's `.env` (sourced by
`docker-compose.macmini.yml`, line ~1076). The key lives on the Windows GPU box in
`services/mana-stt/.env` under `API_KEYS=<key>:<name>` and is the source of truth.
- Local dev: paste the dev key into your local `apps/mana/apps/web/.env` after running
`pnpm setup:env` (the generator only writes an empty placeholder). Ask in `#mana-dev` or
pull from the team's password manager under `mana-stt → web-key`.
- New dev key: SSH to the Windows GPU box (`ssh mana-gpu`), append a new entry to
`C:\mana\services\mana-stt\.env` `API_KEYS` (format: `<random>:<name>`), restart the
`ManaSTT` scheduled task. Use a fresh key per consumer (`mana-web`, `chat-server`, etc.)
so we can revoke individually.
**Endpoint:** `https://gpu-stt.mana.how` — Cloudflare Tunnel `mana-gpu-server` → the
Windows GPU box (`192.168.178.11:3020`). Health: `curl https://gpu-stt.mana.how/health`.
## Adding New Variables
### Step 1: Add to `.env.development`

View file

@ -266,6 +266,16 @@ check_service "GPU TTS" "http://192.168.178.11:3022/health" 3
check_service "GPU Image Gen" "http://192.168.178.11:3023/health" 3
check_service "GPU Video Gen" "http://192.168.178.11:3026/health" 3
echo ""
echo "GPU Server (Cloudflare Tunnel):"
# These probes go through the public Cloudflare tunnel rather than the LAN.
# They catch tunnel-side breakage (cloudflared down on Windows, DNS misroute,
# Public Hostname missing) that LAN probes above wouldn't see.
check_service "GPU STT (tunnel)" "https://gpu-stt.mana.how/health" 8
check_service "GPU LLM (tunnel)" "https://gpu-llm.mana.how/health" 8
check_service "GPU TTS (tunnel)" "https://gpu-tts.mana.how/health" 8
check_service "GPU Image Gen (tunnel)" "https://gpu-img.mana.how/health" 8
echo ""
echo "Matrix:"
check_service "Synapse" "http://localhost:4000/health"

View file

@ -176,7 +176,7 @@ const { text } = await response.json();
const formData = new FormData();
formData.append('file', audioBlob, 'recording.webm');
const response = await fetch('https://stt-api.mana.how/transcribe', {
const response = await fetch('https://gpu-stt.mana.how/transcribe', {
method: 'POST',
body: formData,
});