feat(todo): LLM-parse spoken tasks into title + dueDate + priority

The previous voice quick-add dumped the whole transcript into the task title — fine for "Steuererklärung" but useless for "Steuererklärung morgen 14 Uhr hoch", which should land as title="Steuererklärung", dueDate=tomorrow, priority="high". New endpoint /api/v1/voice/parse-task posts the transcript to mana-llm (gemma3:4b, temperature 0) with a tight system prompt that asks for strict JSON: { title, dueDate, priority, labels }. The endpoint coerces the response back into the typed shape and falls through to { title: transcript, … } whenever anything goes wrong — mana-llm down, JSON garbled, network timeout. Voice quick-add must never fail harder than typed quick-add, so the fallback path is the rule, not the exception. Labels come back from the LLM as free-text topic hints and don't yet map to the workspace's tag IDs — fuzzy matching against existing tags is a follow-up. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-15 22:19:40 +02:00 · 2026-04-08 16:08:09 +02:00 · 2026-04-08 16:08:09 +02:00 · c32a5a57de
commit c32a5a57de
parent b48c9ff80f
2 changed files with 193 additions and 17 deletions
--- a/apps/mana/apps/web/src/lib/modules/todo/stores/tasks.svelte.ts
+++ b/apps/mana/apps/web/src/lib/modules/todo/stores/tasks.svelte.ts
@ -86,25 +86,29 @@ export const tasksStore = {
 	/**
 	 * Create a task from a voice recording. Inserts a placeholder task
 	 * immediately so the user sees instant feedback in the list, then
-	 * fills in the real title once mana-stt returns the transcript.
-	 *
-	 * No date/priority parsing yet — that needs an LLM pass and is its
-	 * own follow-up. The user can edit the task inline like any other.
+	 * runs transcription + LLM parsing in the background and updates
+	 * the task with the structured result (title, due date, priority,
+	 * labels). If the LLM step fails or mana-llm is unavailable, the
+	 * server returns the raw transcript as the title and the user gets
+	 * a usable task either way — see /api/v1/voice/parse-task.
 	 */
 	async createFromVoice(blob: Blob, _durationMs: number, language = 'de') {
 		const placeholder = await this.createTask({ title: 'Sprachaufgabe wird transkribiert…' });
-		void this.transcribeIntoTask(placeholder.id, blob, language);
+		void this.transcribeAndParseIntoTask(placeholder.id, blob, language);
 		return placeholder;
 	},

 	/**
-	 * Upload an audio blob to /api/v1/voice/transcribe and write the
-	 * transcript into an existing task as the new title. On failure,
-	 * surfaces the error inline so the user isn't left with the
-	 * "wird transkribiert…" placeholder forever.
+	 * Two-step pipeline: STT → LLM parse → updateTask. Both steps go
+	 * through server-side proxies (/api/v1/voice/transcribe and
+	 * /api/v1/voice/parse-task) so the browser never sees STT or LLM
+	 * credentials. Failures at either step surface inline as the task
+	 * title so the user isn't left with the "wird transkribiert…"
+	 * placeholder forever.
 	 */
-	async transcribeIntoTask(taskId: string, blob: Blob, language?: string): Promise<void> {
+	async transcribeAndParseIntoTask(taskId: string, blob: Blob, language?: string): Promise<void> {
 		try {
+			// Step 1: speech to text
 			const form = new FormData();
 			const ext = blob.type.includes('webm')
 				? '.webm'
@ -114,17 +118,46 @@ export const tasksStore = {
 			form.append('file', blob, `task${ext}`);
 			if (language) form.append('language', language);

-			const response = await fetch('/api/v1/voice/transcribe', {
+			const sttResponse = await fetch('/api/v1/voice/transcribe', {
 				method: 'POST',
 				body: form,
 			});
-			if (!response.ok) {
-				const text = await response.text();
-				throw new Error(text || `HTTP ${response.status}`);
+			if (!sttResponse.ok) {
+				const text = await sttResponse.text();
+				throw new Error(text || `HTTP ${sttResponse.status}`);
 			}
-			const result = (await response.json()) as { text: string };
-			const transcript = (result.text ?? '').trim() || 'Sprachaufgabe';
-			await this.updateTask(taskId, { title: transcript });
+			const sttResult = (await sttResponse.json()) as { text: string };
+			const transcript = (sttResult.text ?? '').trim();
+			if (!transcript) {
+				await this.updateTask(taskId, { title: 'Sprachaufgabe' });
+				return;
+			}
+
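+			// Step 2: structured extraction. parse-task itself falls back to
+			// { title: transcript, dueDate: null, ... } when mana-llm is
+			// unreachable, and a non-OK response is handled the same way
+			// just below, so this step has no try/catch of its own. Note
+			// that a network error reaching parse-task still lands in the
+			// outer catch, which overwrites the title with an error message.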
+			const parseResponse = await fetch('/api/v1/voice/parse-task', {
+				method: 'POST',
+				headers: { 'Content-Type': 'application/json' },
+				body: JSON.stringify({ transcript, language }),
+			});
+			const parsed = parseResponse.ok
+				? ((await parseResponse.json()) as {
+						title: string;
+						dueDate: string | null;
+						priority: 'low' | 'medium' | 'high' | null;
+						labels: string[];
+					})
+				: { title: transcript, dueDate: null, priority: null as null, labels: [] as string[] };
+
+			const update: Record<string, unknown> = { title: parsed.title };
+			if (parsed.dueDate) update.dueDate = parsed.dueDate;
+			if (parsed.priority) update.priority = parsed.priority;
+			// labels are free-text topic hints from the LLM and don't yet
+			// map to the workspace's tag IDs — leave label wiring to a
+			// follow-up that does fuzzy matching against existing tags.
+
+			await this.updateTask(taskId, update);
 		} catch (e) {
 			const msg = e instanceof Error ? e.message : String(e);
 			await this.updateTask(taskId, { title: `Sprachaufgabe (Fehler: ${msg})` });
--- a/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts
+++ b/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts
@ -0,0 +1,143 @@
+/**
+ * POST /api/v1/voice/parse-task
+ *
+ * Turn a spoken-task transcript into structured task data via mana-llm.
+ * Used by the Todo voice quick-add flow: the user speaks a task like
+ * "Steuererklärung morgen 14 Uhr hoch" and we extract title, due date,
+ * priority and topic labels.
+ *
+ * Graceful degradation is the rule here, not the exception. If mana-llm
+ * is unreachable, mis-configured, or returns garbage JSON, fall back to
+ * { title: transcript } with no error — the user still gets a usable
+ * task and can edit it inline. The goal is "voice quick-add never fails
+ * harder than a typed quick-add", not "voice quick-add only works when
+ * the LLM is happy".
+ *
+ * Request:  { transcript: string, language?: string }
+ * Response: { title, dueDate, priority, labels } — same shape regardless
+ *           of whether the LLM ran or we fell through to the fallback.
+ */
+
+import { json } from '@sveltejs/kit';
+import { env } from '$env/dynamic/private';
+import type { RequestHandler } from './$types';
+
+/**
+ * Response shape of this endpoint. All four keys are always present,
+ * whether the values came from the LLM or from the transcript-only fallback.
+ */
+interface ParseResult {
+	/** Task title; the trimmed transcript is used when the LLM supplies none. */
+	title: string;
+	dueDate: string | null; // ISO date (YYYY-MM-DD) or full ISO timestamp
+	/** One of the three accepted levels, or null when none was extracted. */
+	priority: 'low' | 'medium' | 'high' | null;
+	/** Free-text topic hints from the LLM; at most three are kept. */
+	labels: string[];
+}
+
+/** Maximum number of transcript characters forwarded to the LLM; longer input is truncated. */
+const MAX_TRANSCRIPT_CHARS = 1000;
+/** Milliseconds to wait on the mana-llm request before it is aborted and the fallback is returned. */
+const LLM_TIMEOUT_MS = 8000;
+/** Model identifier sent in the chat-completions request body. */
+const DEFAULT_MODEL = 'ollama/gemma3:4b';
+
+/**
+ * Build the degraded result used whenever structured parsing is unavailable.
+ *
+ * @param transcript Raw transcript text; surrounding whitespace is removed.
+ * @returns A result whose title is the trimmed transcript (or the generic
+ *          'Sprachaufgabe' when nothing remains) and whose other fields are empty.
+ */
+function fallback(transcript: string): ParseResult {
+	const cleaned = transcript.trim();
+	const title = cleaned === '' ? 'Sprachaufgabe' : cleaned;
+	return {
+		title,
+		dueDate: null,
+		priority: null,
+		labels: [],
+	};
+}
+
+/** Display names for the language codes the prompt spells out explicitly. */
+const PROMPT_LANGUAGE_NAMES = new Map<string, string>([
+	['de', 'German'],
+	['en', 'English'],
+]);
+
+/**
+ * Shape an unrecognised language value must have before it is echoed into the
+ * prompt: a short run of letters, digits, spaces, '_' or '-'. Anything else
+ * (newlines, punctuation, long text) is replaced by a neutral phrase so a
+ * client-supplied `language` cannot smuggle extra instructions into the prompt.
+ */
+const PROMPT_SAFE_LANGUAGE = /^[A-Za-z][A-Za-z0-9 _-]{0,31}$/;
+
+/**
+ * Build the user message sent to the LLM for one transcript.
+ *
+ * @param transcript Spoken task text; embedded as a JSON string literal so
+ *                   quotes and newlines inside it cannot break out of the prompt.
+ * @param language   Language code or name from the request. 'de' and 'en' are
+ *                   spelled out; other values are used verbatim only when they
+ *                   match PROMPT_SAFE_LANGUAGE.
+ * @param now        Reference instant for "today"; defaults to the current time.
+ *                   Injectable so the prompt is deterministic under test.
+ * @returns The complete prompt text, lines joined with '\n'.
+ */
+function buildPrompt(transcript: string, language: string, now: Date = new Date()): string {
+	// NOTE: toISOString() yields the UTC calendar date. Around midnight in
+	// other time zones this can differ from the speaker's local date, which
+	// shifts how "morgen"/"tomorrow" resolve.
+	const today = now.toISOString().slice(0, 10);
+	const langName =
+		PROMPT_LANGUAGE_NAMES.get(language) ??
+		(PROMPT_SAFE_LANGUAGE.test(language) ? language : 'an unspecified language');
+	return [
+		`You are a task parser. The user spoke a task in ${langName}.`,
+		`Today is ${today}.`,
+		'',
+		'Extract the following fields and return ONLY a JSON object with these exact keys:',
+		'  - title: short imperative title without filler words (string, required)',
+		'  - dueDate: ISO date YYYY-MM-DD or null if no date is mentioned',
+		'  - priority: "low" | "medium" | "high" | null',
+		'  - labels: array of short topic labels (max 3, lowercase, may be empty)',
+		'',
+		'Rules:',
+		'- Resolve relative dates ("morgen", "tomorrow", "nächsten Montag") against today.',
+		'- If only a time is mentioned, assume today.',
+		'- Never invent details. If unsure, use null / empty array.',
+		'- Output JSON only, no markdown, no commentary, no code fences.',
+		'',
+		`Transcript: ${JSON.stringify(transcript)}`,
+	].join('\n');
+}
+
+/**
+ * Accepted due-date spellings: a bare calendar date (YYYY-MM-DD), optionally
+ * followed by a time (HH:MM, optional seconds and fraction) and an optional
+ * zone designator. Anchored at both ends so trailing text is rejected.
+ */
+const DUE_DATE_PATTERN =
+	/^(\d{4})-(\d{2})-(\d{2})(?:[T ](\d{2}):(\d{2})(?::(\d{2})(?:\.\d+)?)?(?:Z|[+-]\d{2}:?\d{2})?)?$/;
+
+/** True when `value` matches DUE_DATE_PATTERN and names a real calendar date and clock time. */
+function isValidDueDate(value: string): boolean {
+	const m = DUE_DATE_PATTERN.exec(value);
+	if (!m) return false;
+	const year = Number(m[1]);
+	const month = Number(m[2]);
+	const day = Number(m[3]);
+	// Round-trip through Date: out-of-range parts (month 13, Feb 30) roll over
+	// into a different date and therefore fail the comparison.
+	const probe = new Date(Date.UTC(year, month - 1, day));
+	if (
+		probe.getUTCFullYear() !== year ||
+		probe.getUTCMonth() !== month - 1 ||
+		probe.getUTCDate() !== day
+	) {
+		return false;
+	}
+	if (m[4] !== undefined) {
+		if (Number(m[4]) > 23 || Number(m[5]) > 59) return false;
+		if (m[6] !== undefined && Number(m[6]) > 59) return false;
+	}
+	return true;
+}
+
+/**
+ * Turn whatever the LLM returned into a well-typed ParseResult.
+ *
+ * Each field is validated independently; an invalid field degrades to its
+ * empty value (null / []) instead of rejecting the whole result.
+ *
+ * @param raw        Parsed JSON from the model, or null when none was found.
+ * @param transcript Original transcript, used when `raw` has no usable title.
+ * @returns A ParseResult whose title is never empty.
+ */
+function coerce(raw: unknown, transcript: string): ParseResult {
+	if (!raw || typeof raw !== 'object') return fallback(transcript);
+	const r = raw as Record<string, unknown>;
+	// Reuse fallback() for the title so a blank transcript still yields the
+	// generic placeholder, never an empty string.
+	const title =
+		typeof r.title === 'string' && r.title.trim() ? r.title.trim() : fallback(transcript).title;
+	const dueDate = typeof r.dueDate === 'string' && isValidDueDate(r.dueDate) ? r.dueDate : null;
+	const priority =
+		r.priority === 'low' || r.priority === 'medium' || r.priority === 'high' ? r.priority : null;
+	const labels = Array.isArray(r.labels)
+		? r.labels
+				.filter((l): l is string => typeof l === 'string')
+				.map((l) => l.trim())
+				.filter((l) => l !== '')
+				.slice(0, 3)
+		: [];
+	return { title, dueDate, priority, labels };
+}
+
+/**
+ * Pull a JSON object out of free-form model output.
+ *
+ * Models occasionally wrap their answer in a ```json fence despite being told
+ * not to. When a fenced block exists only its interior is searched; otherwise
+ * the whole text is. The candidate is the span from the first '{' to the last
+ * '}' of that text.
+ *
+ * @param text Raw completion text.
+ * @returns The parsed value, or null when no brace span exists or it is not valid JSON.
+ */
+function extractJson(text: string): unknown {
+	const fenceMatch = /```(?:json)?\s*([\s\S]*?)```/i.exec(text);
+	const candidate = fenceMatch === null ? text : fenceMatch[1];
+
+	const open = candidate.indexOf('{');
+	const close = candidate.lastIndexOf('}');
+	// close < open also covers the "no closing brace" case (close === -1).
+	if (open < 0 || close < open) {
+		return null;
+	}
+
+	const jsonText = candidate.slice(open, close + 1);
+	try {
+		return JSON.parse(jsonText);
+	} catch {
+		return null;
+	}
+}
+
+/**
+ * POST handler: parse a spoken-task transcript into structured task fields.
+ *
+ * Always answers 200 with a ParseResult. Every failure mode (unreadable or
+ * wrongly-typed request body, mana-llm unreachable or slow, non-OK status,
+ * unparseable completion) degrades to fallback() instead of an error.
+ *
+ * @param request Incoming request; expected body is { transcript, language? }.
+ * @returns JSON response carrying a ParseResult.
+ */
+export const POST: RequestHandler = async ({ request }) => {
+	let body: unknown;
+	try {
+		body = await request.json();
+	} catch {
+		return json(fallback(''));
+	}
+
+	// The body is untrusted JSON: it may be null, an array, or carry
+	// non-string fields. Narrow before touching string methods so malformed
+	// input degrades to the fallback instead of throwing (HTTP 500).
+	const fields: Record<string, unknown> =
+		body !== null && typeof body === 'object' ? (body as Record<string, unknown>) : {};
+	const transcript =
+		typeof fields.transcript === 'string'
+			? fields.transcript.slice(0, MAX_TRANSCRIPT_CHARS).trim()
+			: '';
+	const language =
+		typeof fields.language === 'string' && fields.language !== '' ? fields.language : 'de';
+	if (!transcript) return json(fallback(''));
+
+	const llmUrl = env.MANA_LLM_URL || env.PUBLIC_MANA_LLM_URL || 'http://localhost:3025';
+
+	const controller = new AbortController();
+	const timer = setTimeout(() => controller.abort(), LLM_TIMEOUT_MS);
+	try {
+		const response = await fetch(`${llmUrl.replace(/\/$/, '')}/v1/chat/completions`, {
+			method: 'POST',
+			headers: { 'Content-Type': 'application/json' },
+			signal: controller.signal,
+			body: JSON.stringify({
+				model: DEFAULT_MODEL,
+				stream: false,
+				temperature: 0,
+				messages: [
+					{ role: 'system', content: 'Output JSON only. No prose.' },
+					{ role: 'user', content: buildPrompt(transcript, language) },
+				],
+			}),
+		});
+		if (!response.ok) return json(fallback(transcript));
+
+		// Read the body while the abort timer is still armed, so a response
+		// that stalls mid-stream is also bounded by LLM_TIMEOUT_MS.
+		const payload: unknown = await response.json();
+		const content = (
+			payload as { choices?: Array<{ message?: { content?: unknown } }> } | null
+		)?.choices?.[0]?.message?.content;
+		// Some backends return non-string content (e.g. an array of parts);
+		// treat anything that is not plain text as "no answer".
+		const parsed = extractJson(typeof content === 'string' ? content : '');
+		return json(coerce(parsed, transcript));
+	} catch {
+		// Network error, timeout abort, or malformed response JSON.
+		return json(fallback(transcript));
+	} finally {
+		clearTimeout(timer);
+	}
+};