diff --git a/apps/mana/apps/web/src/lib/modules/todo/stores/tasks.svelte.ts b/apps/mana/apps/web/src/lib/modules/todo/stores/tasks.svelte.ts index c7dd7ff55..a63060ffa 100644 --- a/apps/mana/apps/web/src/lib/modules/todo/stores/tasks.svelte.ts +++ b/apps/mana/apps/web/src/lib/modules/todo/stores/tasks.svelte.ts @@ -86,25 +86,29 @@ export const tasksStore = { /** * Create a task from a voice recording. Inserts a placeholder task * immediately so the user sees instant feedback in the list, then - * fills in the real title once mana-stt returns the transcript. - * - * No date/priority parsing yet — that needs an LLM pass and is its - * own follow-up. The user can edit the task inline like any other. + * runs transcription + LLM parsing in the background and updates + * the task with the structured result (title, due date, priority, + * labels). If the LLM step fails or mana-llm is unavailable, the + * server returns the raw transcript as the title and the user gets + * a usable task either way — see /api/v1/voice/parse-task. */ async createFromVoice(blob: Blob, _durationMs: number, language = 'de') { const placeholder = await this.createTask({ title: 'Sprachaufgabe wird transkribiert…' }); - void this.transcribeIntoTask(placeholder.id, blob, language); + void this.transcribeAndParseIntoTask(placeholder.id, blob, language); return placeholder; }, /** - * Upload an audio blob to /api/v1/voice/transcribe and write the - * transcript into an existing task as the new title. On failure, - * surfaces the error inline so the user isn't left with the - * "wird transkribiert…" placeholder forever. + * Two-step pipeline: STT → LLM parse → updateTask. Both steps go + * through server-side proxies (/api/v1/voice/transcribe and + * /api/v1/voice/parse-task) so the browser never sees STT or LLM + * credentials. Failures at either step surface inline as the task + * title so the user isn't left with the "wird transkribiert…" + * placeholder forever. 
*/ - async transcribeIntoTask(taskId: string, blob: Blob, language?: string): Promise<void> { + async transcribeAndParseIntoTask(taskId: string, blob: Blob, language?: string): Promise<void> { try { + // Step 1: speech to text const form = new FormData(); const ext = blob.type.includes('webm') ? '.webm' @@ -114,17 +118,46 @@ export const tasksStore = { form.append('file', blob, `task${ext}`); if (language) form.append('language', language); - const response = await fetch('/api/v1/voice/transcribe', { + const sttResponse = await fetch('/api/v1/voice/transcribe', { method: 'POST', body: form, }); - if (!response.ok) { - const text = await response.text(); - throw new Error(text || `HTTP ${response.status}`); + if (!sttResponse.ok) { + const text = await sttResponse.text(); + throw new Error(text || `HTTP ${sttResponse.status}`); } - const result = (await response.json()) as { text: string }; - const transcript = (result.text ?? '').trim() || 'Sprachaufgabe'; - await this.updateTask(taskId, { title: transcript }); + const sttResult = (await sttResponse.json()) as { text: string }; + const transcript = (sttResult.text ?? '').trim(); + if (!transcript) { + await this.updateTask(taskId, { title: 'Sprachaufgabe' }); + return; + } + + // Step 2: structured extraction. parse-task gracefully falls + // back to { title: transcript, dueDate: null, ... } if mana-llm + // is unreachable, so we don't wrap this in another try/catch. + const parseResponse = await fetch('/api/v1/voice/parse-task', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ transcript, language }), + }); + const parsed = parseResponse.ok + ?
((await parseResponse.json()) as { + title: string; + dueDate: string | null; + priority: 'low' | 'medium' | 'high' | null; + labels: string[]; + }) + : { title: transcript, dueDate: null, priority: null as null, labels: [] as string[] }; + + const update: Record<string, unknown> = { title: parsed.title }; + if (parsed.dueDate) update.dueDate = parsed.dueDate; + if (parsed.priority) update.priority = parsed.priority; + // labels are free-text topic hints from the LLM and don't yet + // map to the workspace's tag IDs — leave label wiring to a + // follow-up that does fuzzy matching against existing tags. + + await this.updateTask(taskId, update); } catch (e) { const msg = e instanceof Error ? e.message : String(e); await this.updateTask(taskId, { title: `Sprachaufgabe (Fehler: ${msg})` }); diff --git a/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts b/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts new file mode 100644 index 000000000..93a9ec659 --- /dev/null +++ b/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts @@ -0,0 +1,143 @@ +/** + * POST /api/v1/voice/parse-task + * + * Turn a spoken-task transcript into structured task data via mana-llm. + * Used by the Todo voice quick-add flow: the user speaks a task like + * "Steuererklärung morgen 14 Uhr" and we extract title + due date. + * + * Graceful degradation is the rule here, not the exception. If mana-llm + * is unreachable, mis-configured, or returns garbage JSON, fall back to + * { title: transcript } with no error — the user still gets a usable + * task and can edit it inline. The goal is "voice quick-add never fails + * harder than a typed quick-add", not "voice quick-add only works when + * the LLM is happy". + * + * Request: { transcript: string, language?: string } + * Response: { title, dueDate, priority, labels } — same shape regardless + * of whether the LLM ran or we fell through to the fallback.
+ */ + +import { json } from '@sveltejs/kit'; +import { env } from '$env/dynamic/private'; +import type { RequestHandler } from './$types'; + +interface ParseResult { + title: string; + dueDate: string | null; // ISO date (YYYY-MM-DD) or full ISO timestamp + priority: 'low' | 'medium' | 'high' | null; + labels: string[]; +} + +const MAX_TRANSCRIPT_CHARS = 1000; +const LLM_TIMEOUT_MS = 8000; +const DEFAULT_MODEL = 'ollama/gemma3:4b'; + +function fallback(transcript: string): ParseResult { + return { title: transcript.trim() || 'Sprachaufgabe', dueDate: null, priority: null, labels: [] }; +} + +function buildPrompt(transcript: string, language: string): string { + const today = new Date().toISOString().slice(0, 10); + const langName = language === 'de' ? 'German' : language === 'en' ? 'English' : language; + return [ + `You are a task parser. The user spoke a task in ${langName}.`, + `Today is ${today}.`, + '', + 'Extract the following fields and return ONLY a JSON object with these exact keys:', + ' - title: short imperative title without filler words (string, required)', + ' - dueDate: ISO date YYYY-MM-DD or null if no date is mentioned', + ' - priority: "low" | "medium" | "high" | null', + ' - labels: array of short topic labels (max 3, lowercase, may be empty)', + '', + 'Rules:', + '- Resolve relative dates ("morgen", "tomorrow", "nächsten Montag") against today.', + '- If only a time is mentioned, assume today.', + '- Never invent details. If unsure, use null / empty array.', + '- Output JSON only, no markdown, no commentary, no code fences.', + '', + `Transcript: ${JSON.stringify(transcript)}`, + ].join('\n'); +} + +function coerce(raw: unknown, transcript: string): ParseResult { + if (!raw || typeof raw !== 'object') return fallback(transcript); + const r = raw as Record<string, unknown>; + const title = typeof r.title === 'string' && r.title.trim() ? r.title.trim() : transcript.trim(); + const dueDate = + typeof r.dueDate === 'string' && /^\d{4}-\d{2}-\d{2}/.test(r.dueDate) ?
r.dueDate : null; + const priority = + r.priority === 'low' || r.priority === 'medium' || r.priority === 'high' ? r.priority : null; + const labels = Array.isArray(r.labels) + ? r.labels.filter((l): l is string => typeof l === 'string').slice(0, 3) + : []; + return { title, dueDate, priority, labels }; +} + +function extractJson(text: string): unknown { + // Models sometimes wrap JSON in ```json ... ``` even when told not to; + // strip a fenced block if present, then take the first {...} run. + const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/i); + const body = fenced ? fenced[1] : text; + const start = body.indexOf('{'); + const end = body.lastIndexOf('}'); + if (start === -1 || end === -1 || end < start) return null; + try { + return JSON.parse(body.slice(start, end + 1)); + } catch { + return null; + } +} + +export const POST: RequestHandler = async ({ request }) => { + let body: { transcript?: string; language?: string }; + try { + body = await request.json(); + } catch { + return json(fallback('')); + } + + const transcript = (body.transcript ?? '').slice(0, MAX_TRANSCRIPT_CHARS).trim(); + const language = body.language ?? 'de'; + if (!transcript) return json(fallback('')); + + const llmUrl = env.MANA_LLM_URL || env.PUBLIC_MANA_LLM_URL || 'http://localhost:3025'; + + let response: Response; + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), LLM_TIMEOUT_MS); + try { + response = await fetch(`${llmUrl.replace(/\/$/, '')}/v1/chat/completions`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + signal: controller.signal, + body: JSON.stringify({ + model: DEFAULT_MODEL, + stream: false, + temperature: 0, + messages: [ + { role: 'system', content: 'Output JSON only. No prose.' 
}, + { role: 'user', content: buildPrompt(transcript, language) }, + ], + }), + }); + } catch { + clearTimeout(timer); + return json(fallback(transcript)); + } + clearTimeout(timer); + + if (!response.ok) return json(fallback(transcript)); + + let payload: unknown; + try { + payload = await response.json(); + } catch { + return json(fallback(transcript)); + } + + const content = + (payload as { choices?: Array<{ message?: { content?: string } }> })?.choices?.[0]?.message + ?.content ?? ''; + const parsed = extractJson(content); + return json(coerce(parsed, transcript)); +};