feat(notes): voice capture in workbench ListView via shared <VoiceCaptureBar>

Drop a mic into Notes — record, transcribe through the new generic
/api/v1/voice/transcribe proxy (mana-stt), then write the result back
into the placeholder note. The first transcript line becomes the title
when it fits within 80 characters; otherwise the note keeps the generic 'Sprachnotiz' title.

The inline editor refreshes from the live note while the placeholder
'…' content is still on screen, so a transcript that arrives a moment
after the editor opens shows up automatically without overwriting
anything the user has typed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-08 15:59:42 +02:00
parent e0e801956a
commit 9b3d7c7325
3 changed files with 193 additions and 1 deletions

View file

@ -9,6 +9,7 @@
import type { ViewProps } from '$lib/app-registry';
import { ContextMenu, type ContextMenuItem } from '@mana/shared-ui';
import { PencilSimple, Trash, PushPin } from '@mana/shared-icons';
import VoiceCaptureBar from '$lib/components/voice/VoiceCaptureBar.svelte';
let { navigate, goBack, params }: ViewProps = $props();
@ -31,6 +32,11 @@
startEdit(note);
}
// Called by <VoiceCaptureBar> once recording stops: create the placeholder
// note (the transcript is filled in asynchronously) and jump straight into
// the inline editor for it.
async function handleVoiceComplete(blob: Blob, durationMs: number) {
  const voiceNote = await notesStore.createFromVoice(blob, durationMs, 'de');
  startEdit(voiceNote);
}
function startEdit(note: Note) {
if (editingId && editingId !== note.id) saveEdit();
editingId = note.id;
@ -38,6 +44,21 @@
editContent = note.content;
}
// A voice note's transcript lands in the Dexie row after the inline editor
// has already opened on the "…" placeholder. Pull the fresh title/content
// into the editor's local state — but ONLY while the editor still shows the
// placeholder, so text the user has already typed is never clobbered.
$effect(() => {
  if (!editingId) return;
  const liveNote = notes.find((n) => n.id === editingId);
  const transcriptArrived =
    liveNote !== undefined && editContent === '…' && liveNote.content !== '…';
  if (transcriptArrived) {
    editTitle = liveNote.title;
    editContent = liveNote.content;
  }
});
async function saveEdit() {
if (!editingId) return;
await notesStore.updateNote(editingId, {
@ -105,6 +126,14 @@
</script>
<div class="app-view">
<!-- Voice capture -->
<VoiceCaptureBar
idleLabel="Notiz sprechen"
feature="notes-voice-capture"
reason="Notizen werden verschlüsselt gespeichert. Dafür brauchst du ein Mana-Konto."
onComplete={handleVoiceComplete}
/>
<!-- Quick create -->
<form onsubmit={(e) => e.preventDefault()} class="quick-add">
<span class="add-icon">+</span>

View file

@ -16,7 +16,7 @@
import { noteTable } from '../collections';
import { toNote } from '../queries';
import type { LocalNote } from '../types';
import type { LocalNote, Note } from '../types';
import { encryptRecord } from '$lib/data/crypto';
export const notesStore = {
@ -38,6 +38,63 @@ export const notesStore = {
return plaintextSnapshot;
},
/**
 * Create a note from a voice recording. Returns the placeholder note
 * immediately so the UI can navigate into it; the transcript is written
 * in asynchronously once mana-stt responds. The placeholder title
 * 'Sprachnotiz' is intentionally generic — once the transcript exists,
 * the user can rename the note inline like any other.
 */
async createFromVoice(blob: Blob, _durationMs: number, language = 'de'): Promise<Note> {
  const placeholder = await this.createNote({ title: 'Sprachnotiz', content: '…' });
  // Deliberately not awaited: the caller has already navigated into edit
  // mode, and transcribeIntoNote handles its own failures.
  void this.transcribeIntoNote(placeholder.id, blob, language);
  return placeholder;
},
/**
 * Upload an audio blob to /api/v1/voice/transcribe and write the
 * transcript into an existing note. On failure, the error is surfaced
 * as the note's content so the user isn't left staring at an empty
 * placeholder.
 */
async transcribeIntoNote(noteId: string, blob: Blob, language?: string): Promise<void> {
  try {
    // Derive a filename extension from the blob's MIME type so the
    // server side can recognize the container format; fall back to a
    // neutral suffix for anything else.
    let ext = '.audio';
    if (blob.type.includes('webm')) ext = '.webm';
    else if (blob.type.includes('mp4')) ext = '.m4a';

    const form = new FormData();
    form.append('file', blob, `note${ext}`);
    if (language) form.append('language', language);

    const response = await fetch('/api/v1/voice/transcribe', {
      method: 'POST',
      body: form,
    });
    if (!response.ok) {
      const body = await response.text();
      throw new Error(body || `HTTP ${response.status}`);
    }

    const payload = (await response.json()) as { text: string };
    const transcript = (payload.text ?? '').trim();
    // A short first line doubles as the title — keeps the list browseable
    // without forcing the user to rename the note.
    const firstLine = transcript.split('\n')[0]?.trim() ?? '';
    const useFirstLine = firstLine.length > 0 && firstLine.length <= 80;
    await this.updateNote(noteId, {
      title: useFirstLine ? firstLine : 'Sprachnotiz',
      content: transcript,
    });
  } catch (e) {
    const msg = e instanceof Error ? e.message : String(e);
    await this.updateNote(noteId, {
      title: 'Sprachnotiz (Fehler)',
      content: `Transkription fehlgeschlagen: ${msg}`,
    });
  }
},
async updateNote(
id: string,
data: Partial<Pick<LocalNote, 'title' | 'content' | 'color' | 'isPinned' | 'isArchived'>>

View file

@ -0,0 +1,106 @@
/**
* POST /api/v1/voice/transcribe
*
* Generic server-side proxy to mana-stt for any module that needs voice
* transcription. The browser uploads an audio Blob; we forward it to
* mana-stt with the server-held API key and return the transcript JSON.
*
* Use this from new modules instead of cloning the per-module endpoints
 * (memoro, dreams) — those exist for historical reasons and will be
* migrated when convenient.
*
* Request: multipart/form-data with `file` (audio blob) and optional `language`
* Response: { text: string, language: string | null, durationSeconds: number | null }
*/
import { error, json } from '@sveltejs/kit';
import { env } from '$env/dynamic/private';
import type { RequestHandler } from './$types';
const MAX_BYTES = 25 * 1024 * 1024; // 25 MB
// Accept only MIME types that plausibly carry audio. An empty or generic
// binary type is tolerated — upstream mana-stt does the real validation.
// video/* is allowed because m4a recordings often report video/mp4.
function isAcceptableType(mime: string): boolean {
  if (mime === '' || mime === 'application/octet-stream') return true;
  return ['audio/', 'video/'].some((prefix) => mime.startsWith(prefix));
}
/**
 * Proxy a browser-recorded audio blob to mana-stt and return the transcript.
 *
 * Validates the multipart upload (file presence, size, MIME type), forwards
 * it with the server-held API key, and normalizes the upstream snake_case
 * JSON to the camelCase contract documented at the top of this file.
 *
 * Errors: 503 when MANA_STT_URL is unset; 400/413/415 on bad input;
 * 502 when mana-stt is unreachable or replies with a non-ok status outside
 * the 4xx/5xx range.
 */
export const POST: RequestHandler = async ({ request }) => {
  const sttUrl = env.MANA_STT_URL;
  const apiKey = env.MANA_STT_API_KEY;
  if (!sttUrl) {
    throw error(503, 'mana-stt is not configured (MANA_STT_URL missing)');
  }
  let incoming: FormData;
  try {
    incoming = await request.formData();
  } catch {
    throw error(400, 'Expected multipart/form-data with a file field');
  }
  const file = incoming.get('file');
  const language = (incoming.get('language') as string | null) ?? null;
  if (!(file instanceof Blob)) {
    throw error(400, 'Missing file');
  }
  if (file.size === 0) {
    throw error(400, 'Empty audio');
  }
  if (file.size > MAX_BYTES) {
    throw error(413, `Audio too large (max ${MAX_BYTES / 1024 / 1024} MB)`);
  }
  if (!isAcceptableType(file.type)) {
    throw error(415, `Unsupported audio type: ${file.type}`);
  }
  // Re-wrap the upload so mana-stt sees a filename whose extension matches
  // the MIME type.
  const ext = mimeToExtension(file.type);
  const filename = `voice${ext}`;
  const upstream = new FormData();
  upstream.append('file', file, filename);
  if (language) upstream.append('language', language);
  const headers: Record<string, string> = { Accept: 'application/json' };
  if (apiKey) headers['X-API-Key'] = apiKey;
  let response: Response;
  try {
    response = await fetch(`${sttUrl.replace(/\/$/, '')}/transcribe`, {
      method: 'POST',
      headers,
      body: upstream,
    });
  } catch (e) {
    const msg = e instanceof Error ? e.message : String(e);
    throw error(502, `Could not reach mana-stt: ${msg}`);
  }
  if (!response.ok) {
    const text = await response.text();
    // SvelteKit's error() only accepts 4xx/5xx statuses. A non-ok upstream
    // status outside that range (e.g. a stray 3xx with manual redirect
    // handling) would otherwise crash this handler with an opaque 500, so
    // map anything else to 502 Bad Gateway.
    const status = response.status >= 400 && response.status <= 599 ? response.status : 502;
    throw error(status, `mana-stt error: ${text || response.statusText}`);
  }
  const result = (await response.json()) as {
    text: string;
    language?: string;
    duration_seconds?: number;
  };
  return json({
    text: result.text ?? '',
    language: result.language ?? null,
    durationSeconds: result.duration_seconds ?? null,
  });
};
// Map a MIME type to a filename extension mana-stt can sniff. Checked in
// priority order; unknown types default to .webm (MediaRecorder's usual
// output on Chromium/Firefox).
function mimeToExtension(mime: string): string {
  const mapping: ReadonlyArray<[needle: string, ext: string]> = [
    ['webm', '.webm'],
    ['ogg', '.ogg'],
    ['mp4', '.m4a'], // m4a audio frequently reports video/mp4 or audio/mp4
    ['m4a', '.m4a'],
    ['mpeg', '.mp3'],
    ['wav', '.wav'],
    ['flac', '.flac'],
  ];
  for (const [needle, ext] of mapping) {
    if (mime.includes(needle)) return ext;
  }
  return '.webm';
}