mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-15 22:19:40 +02:00
feat(todo): LLM-parse spoken tasks into title + dueDate + priority
The previous voice quick-add dumped the whole transcript into the task
title — fine for "Steuererklärung" but useless for "Steuererklärung
morgen 14 Uhr hoch", which should land as title="Steuererklärung",
dueDate=tomorrow, priority="high".
New endpoint /api/v1/voice/parse-task posts the transcript to mana-llm
(gemma3:4b, temperature 0) with a tight system prompt that asks for
strict JSON: { title, dueDate, priority, labels }. The endpoint coerces
the response back into the typed shape and falls through to
{ title: transcript, … } whenever anything goes wrong — mana-llm down,
JSON garbled, network timeout. Voice quick-add must never fail harder
than typed quick-add, so the fallback path is the rule, not the
exception.
Labels come back from the LLM as free-text topic hints and don't yet
map to the workspace's tag IDs — fuzzy matching against existing tags
is a follow-up.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b48c9ff80f
commit
c32a5a57de
2 changed files with 193 additions and 17 deletions
|
|
@ -86,25 +86,29 @@ export const tasksStore = {
|
|||
/**
|
||||
* Create a task from a voice recording. Inserts a placeholder task
|
||||
* immediately so the user sees instant feedback in the list, then
|
||||
* fills in the real title once mana-stt returns the transcript.
|
||||
*
|
||||
* No date/priority parsing yet — that needs an LLM pass and is its
|
||||
* own follow-up. The user can edit the task inline like any other.
|
||||
* runs transcription + LLM parsing in the background and updates
|
||||
* the task with the structured result (title, due date, priority,
|
||||
* labels). If the LLM step fails or mana-llm is unavailable, the
|
||||
* server returns the raw transcript as the title and the user gets
|
||||
* a usable task either way — see /api/v1/voice/parse-task.
|
||||
*/
|
||||
async createFromVoice(blob: Blob, _durationMs: number, language = 'de') {
|
||||
const placeholder = await this.createTask({ title: 'Sprachaufgabe wird transkribiert…' });
|
||||
void this.transcribeIntoTask(placeholder.id, blob, language);
|
||||
void this.transcribeAndParseIntoTask(placeholder.id, blob, language);
|
||||
return placeholder;
|
||||
},
|
||||
|
||||
/**
|
||||
* Upload an audio blob to /api/v1/voice/transcribe and write the
|
||||
* transcript into an existing task as the new title. On failure,
|
||||
* surfaces the error inline so the user isn't left with the
|
||||
* "wird transkribiert…" placeholder forever.
|
||||
* Two-step pipeline: STT → LLM parse → updateTask. Both steps go
|
||||
* through server-side proxies (/api/v1/voice/transcribe and
|
||||
* /api/v1/voice/parse-task) so the browser never sees STT or LLM
|
||||
* credentials. Failures at either step surface inline as the task
|
||||
* title so the user isn't left with the "wird transkribiert…"
|
||||
* placeholder forever.
|
||||
*/
|
||||
async transcribeIntoTask(taskId: string, blob: Blob, language?: string): Promise<void> {
|
||||
async transcribeAndParseIntoTask(taskId: string, blob: Blob, language?: string): Promise<void> {
|
||||
try {
|
||||
// Step 1: speech to text
|
||||
const form = new FormData();
|
||||
const ext = blob.type.includes('webm')
|
||||
? '.webm'
|
||||
|
|
@ -114,17 +118,46 @@ export const tasksStore = {
|
|||
form.append('file', blob, `task${ext}`);
|
||||
if (language) form.append('language', language);
|
||||
|
||||
const response = await fetch('/api/v1/voice/transcribe', {
|
||||
const sttResponse = await fetch('/api/v1/voice/transcribe', {
|
||||
method: 'POST',
|
||||
body: form,
|
||||
});
|
||||
if (!response.ok) {
|
||||
const text = await response.text();
|
||||
throw new Error(text || `HTTP ${response.status}`);
|
||||
if (!sttResponse.ok) {
|
||||
const text = await sttResponse.text();
|
||||
throw new Error(text || `HTTP ${sttResponse.status}`);
|
||||
}
|
||||
const result = (await response.json()) as { text: string };
|
||||
const transcript = (result.text ?? '').trim() || 'Sprachaufgabe';
|
||||
await this.updateTask(taskId, { title: transcript });
|
||||
const sttResult = (await sttResponse.json()) as { text: string };
|
||||
const transcript = (sttResult.text ?? '').trim();
|
||||
if (!transcript) {
|
||||
await this.updateTask(taskId, { title: 'Sprachaufgabe' });
|
||||
return;
|
||||
}
|
||||
|
||||
// Step 2: structured extraction. parse-task gracefully falls
|
||||
// back to { title: transcript, dueDate: null, ... } if mana-llm
|
||||
// is unreachable, so we don't wrap this in another try/catch.
|
||||
const parseResponse = await fetch('/api/v1/voice/parse-task', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ transcript, language }),
|
||||
});
|
||||
const parsed = parseResponse.ok
|
||||
? ((await parseResponse.json()) as {
|
||||
title: string;
|
||||
dueDate: string | null;
|
||||
priority: 'low' | 'medium' | 'high' | null;
|
||||
labels: string[];
|
||||
})
|
||||
: { title: transcript, dueDate: null, priority: null as null, labels: [] as string[] };
|
||||
|
||||
const update: Record<string, unknown> = { title: parsed.title };
|
||||
if (parsed.dueDate) update.dueDate = parsed.dueDate;
|
||||
if (parsed.priority) update.priority = parsed.priority;
|
||||
// labels are free-text topic hints from the LLM and don't yet
|
||||
// map to the workspace's tag IDs — leave label wiring to a
|
||||
// follow-up that does fuzzy matching against existing tags.
|
||||
|
||||
await this.updateTask(taskId, update);
|
||||
} catch (e) {
|
||||
const msg = e instanceof Error ? e.message : String(e);
|
||||
await this.updateTask(taskId, { title: `Sprachaufgabe (Fehler: ${msg})` });
|
||||
|
|
|
|||
143
apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts
Normal file
143
apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts
Normal file
|
|
@ -0,0 +1,143 @@
|
|||
/**
|
||||
* POST /api/v1/voice/parse-task
|
||||
*
|
||||
* Turn a spoken-task transcript into structured task data via mana-llm.
|
||||
* Used by the Todo voice quick-add flow: the user speaks a task like
|
||||
* "Steuererklärung morgen 14 Uhr" and we extract title, due date, priority, and labels.
|
||||
*
|
||||
* Graceful degradation is the rule here, not the exception. If mana-llm
|
||||
* is unreachable, mis-configured, or returns garbage JSON, fall back to
|
||||
* { title: transcript } with no error — the user still gets a usable
|
||||
* task and can edit it inline. The goal is "voice quick-add never fails
|
||||
* harder than a typed quick-add", not "voice quick-add only works when
|
||||
* the LLM is happy".
|
||||
*
|
||||
* Request: { transcript: string, language?: string }
|
||||
* Response: { title, dueDate, priority, labels } — same shape regardless
|
||||
* of whether the LLM ran or we fell through to the fallback.
|
||||
*/
|
||||
|
||||
import { json } from '@sveltejs/kit';
|
||||
import { env } from '$env/dynamic/private';
|
||||
import type { RequestHandler } from './$types';
|
||||
|
||||
/** Priority levels the endpoint is willing to return. */
type TaskPriority = 'low' | 'medium' | 'high';

/**
 * Structured task data returned by this endpoint. The shape is the same
 * whether the LLM produced it or the fallback path did.
 */
interface ParseResult {
	/** Task title. */
	title: string;
	/** ISO date (YYYY-MM-DD) or full ISO timestamp; `null` when no date was extracted. */
	dueDate: string | null;
	/** Extracted priority; `null` when none was extracted. */
	priority: TaskPriority | null;
	/** Free-text topic labels; may be empty. */
	labels: string[];
}
|
||||
|
||||
/** Transcripts are cut to this many characters before any further processing. */
const MAX_TRANSCRIPT_CHARS = 1_000;

/** Milliseconds to wait on the LLM request before aborting it. */
const LLM_TIMEOUT_MS = 8_000;

/** Model identifier sent in the chat-completions request body. */
const DEFAULT_MODEL = 'ollama/gemma3:4b';
|
||||
|
||||
/**
 * Build the degraded result used whenever structured parsing is not
 * possible: the trimmed transcript becomes the title and every other
 * field is empty.
 *
 * @param transcript Raw transcript text; may be empty or whitespace-only.
 * @returns A result whose title is the trimmed transcript, or the generic
 *          "Sprachaufgabe" placeholder when nothing is left after trimming.
 */
function fallback(transcript: string): ParseResult {
	const trimmed = transcript.trim();
	const result: ParseResult = {
		title: trimmed === '' ? 'Sprachaufgabe' : trimmed,
		dueDate: null,
		priority: null,
		labels: [],
	};
	return result;
}
|
||||
|
||||
/**
 * Shape of a plausible language tag ("de", "en", "pt-BR", ...). Anything
 * else is kept out of the prompt because `language` is client-supplied.
 */
const LANGUAGE_TAG_PATTERN = /^[a-z]{2,3}(?:-[a-z0-9]{2,8})*$/i;

/**
 * Build the user prompt that asks the model to turn a transcript into the
 * structured task JSON.
 *
 * @param transcript Spoken task text. It is embedded via `JSON.stringify`,
 *                   so quotes and newlines in it cannot break out of the
 *                   transcript line.
 * @param language   Language tag of the transcript. "de" and "en" are
 *                   spelled out; other well-formed tags are passed through
 *                   as-is. Values that do not look like a language tag are
 *                   replaced by a neutral phrase, so arbitrary request text
 *                   never reaches the instruction part of the prompt.
 * @param now        Clock used for the "Today is ..." line. Defaults to the
 *                   current time; injectable so the prompt is deterministic
 *                   in tests.
 * @returns The prompt as a single newline-joined string.
 */
function buildPrompt(transcript: string, language: string, now: Date = new Date()): string {
	// toISOString() is always UTC, so "today" here is the UTC calendar day.
	const today = now.toISOString().slice(0, 10);

	let langName: string;
	if (language === 'de') {
		langName = 'German';
	} else if (language === 'en') {
		langName = 'English';
	} else if (typeof language === 'string' && LANGUAGE_TAG_PATTERN.test(language)) {
		langName = language;
	} else {
		langName = 'an unspecified language';
	}

	return [
		`You are a task parser. The user spoke a task in ${langName}.`,
		`Today is ${today}.`,
		'',
		'Extract the following fields and return ONLY a JSON object with these exact keys:',
		'  - title: short imperative title without filler words (string, required)',
		'  - dueDate: ISO date YYYY-MM-DD or null if no date is mentioned',
		'  - priority: "low" | "medium" | "high" | null',
		'  - labels: array of short topic labels (max 3, lowercase, may be empty)',
		'',
		'Rules:',
		'- Resolve relative dates ("morgen", "tomorrow", "nächsten Montag") against today.',
		'- If only a time is mentioned, assume today.',
		'- Never invent details. If unsure, use null / empty array.',
		'- Output JSON only, no markdown, no commentary, no code fences.',
		'',
		`Transcript: ${JSON.stringify(transcript)}`,
	].join('\n');
}
|
||||
|
||||
/**
 * Coerce whatever the model returned into a well-typed `ParseResult`.
 * Each field is validated on its own; an unusable field is replaced by
 * its empty value instead of rejecting the whole response.
 *
 * @param raw        Parsed model output; may be anything, including `null`.
 * @param transcript Original transcript, used when no usable title exists.
 * @returns A fully populated result. Never throws.
 */
function coerce(raw: unknown, transcript: string): ParseResult {
	if (!raw || typeof raw !== 'object' || Array.isArray(raw)) return fallback(transcript);
	const r = raw as Record<string, unknown>;

	// Use the same title fallback as fallback(), so a blank model title can
	// never produce an empty task title.
	const llmTitle = typeof r.title === 'string' ? r.title.trim() : '';
	const title = llmTitle || fallback(transcript).title;

	const dueDate = typeof r.dueDate === 'string' ? normalizeDueDate(r.dueDate) : null;

	const priority =
		r.priority === 'low' || r.priority === 'medium' || r.priority === 'high' ? r.priority : null;

	// The prompt asks for short lowercase labels; enforce that here and drop
	// blanks and duplicates before applying the cap of three.
	const labels = Array.isArray(r.labels)
		? [
				...new Set(
					r.labels
						.filter((l): l is string => typeof l === 'string')
						.map((l) => l.trim().toLowerCase())
						.filter((l) => l !== '')
				),
			].slice(0, 3)
		: [];

	return { title, dueDate, priority, labels };
}

/**
 * Validate a model-supplied due date.
 *
 * @param value Candidate string from the model.
 * @returns `value` (trimmed) when it is a real calendar date in YYYY-MM-DD
 *          form or a parseable ISO timestamp starting with one; only the
 *          date part when a valid date is followed by unparseable text;
 *          `null` when there is no leading date or the date does not exist
 *          (e.g. 2026-02-31).
 */
function normalizeDueDate(value: string): string | null {
	const candidate = value.trim();
	const match = /^(\d{4})-(\d{2})-(\d{2})/.exec(candidate);
	if (!match) return null;

	const datePart = match[0];
	const year = Number(match[1]);
	const month = Number(match[2]);
	const day = Number(match[3]);

	// Round-trip through Date: out-of-range months or days roll over and no
	// longer match the numbers we started with.
	const probe = new Date(Date.UTC(year, month - 1, day));
	const isRealDay =
		probe.getUTCFullYear() === year &&
		probe.getUTCMonth() === month - 1 &&
		probe.getUTCDate() === day;
	if (!isRealDay) return null;

	if (candidate === datePart) return candidate;

	const isTimestamp =
		candidate.charAt(datePart.length) === 'T' && !Number.isNaN(Date.parse(candidate));
	return isTimestamp ? candidate : datePart;
}
|
||||
|
||||
/**
 * Pull the first parseable JSON object out of free-form model output.
 *
 * Models sometimes wrap JSON in ```json ... ``` even when told not to, and
 * sometimes add commentary around it. The fenced block (if any) is searched
 * first, then the full text. Within each, every `{` is tried as the start
 * of a brace-balanced span, so a stray `}` in trailing prose no longer
 * spoils an otherwise valid object.
 *
 * @param text Raw message content from the model.
 * @returns The first span that parses as JSON, or `null` if none does.
 */
function extractJson(text: string): unknown {
	const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/i);
	const fencedBody = fenced?.[1];
	const candidates = fencedBody !== undefined ? [fencedBody, text] : [text];

	for (const body of candidates) {
		for (let start = body.indexOf('{'); start !== -1; start = body.indexOf('{', start + 1)) {
			const end = findObjectEnd(body, start);
			if (end === -1) continue;
			try {
				return JSON.parse(body.slice(start, end + 1));
			} catch {
				// Balanced but not valid JSON (e.g. "{note}"); try the next "{".
			}
		}
	}
	return null;
}

/**
 * Find the index of the `}` that closes the `{` at `start`, ignoring braces
 * inside double-quoted strings and honouring backslash escapes.
 *
 * @param source Text to scan.
 * @param start  Index of the opening brace.
 * @returns Index of the matching closing brace, or -1 if it never closes.
 */
function findObjectEnd(source: string, start: number): number {
	let depth = 0;
	let inString = false;
	let escaped = false;

	for (let i = start; i < source.length; i++) {
		const ch = source.charAt(i);
		if (inString) {
			if (escaped) escaped = false;
			else if (ch === '\\') escaped = true;
			else if (ch === '"') inString = false;
			continue;
		}
		if (ch === '"') {
			inString = true;
		} else if (ch === '{') {
			depth++;
		} else if (ch === '}') {
			depth--;
			if (depth === 0) return i;
		}
	}
	return -1;
}
|
||||
|
||||
/**
 * POST handler: turn a transcript into structured task data via the LLM.
 *
 * Every failure path returns a fallback result rather than an error:
 * unreadable or mis-shaped request body, empty transcript, LLM unreachable,
 * timeout, non-2xx status, unparseable response. The handler is written so
 * that none of these can surface as an unhandled exception.
 *
 * Request:  { transcript: string, language?: string }
 * Response: { title, dueDate, priority, labels }
 */
export const POST: RequestHandler = async ({ request }) => {
	let body: unknown;
	try {
		body = await request.json();
	} catch {
		return json(fallback(''));
	}

	// The body is untrusted: it may be `null`, an array, or carry fields of
	// the wrong type. Narrow before touching anything so a bad request
	// degrades to the fallback instead of throwing.
	const fields = body && typeof body === 'object' ? (body as Record<string, unknown>) : {};
	const transcript =
		typeof fields.transcript === 'string'
			? fields.transcript.slice(0, MAX_TRANSCRIPT_CHARS).trim()
			: '';
	const language = typeof fields.language === 'string' ? fields.language : 'de';
	if (!transcript) return json(fallback(''));

	const llmUrl = env.MANA_LLM_URL || env.PUBLIC_MANA_LLM_URL || 'http://localhost:3025';

	const controller = new AbortController();
	const timer = setTimeout(() => controller.abort(), LLM_TIMEOUT_MS);

	let payload: unknown;
	try {
		const response = await fetch(`${llmUrl.replace(/\/$/, '')}/v1/chat/completions`, {
			method: 'POST',
			headers: { 'Content-Type': 'application/json' },
			signal: controller.signal,
			body: JSON.stringify({
				model: DEFAULT_MODEL,
				stream: false,
				temperature: 0,
				messages: [
					{ role: 'system', content: 'Output JSON only. No prose.' },
					{ role: 'user', content: buildPrompt(transcript, language) },
				],
			}),
		});
		if (!response.ok) return json(fallback(transcript));

		// Read the body while the abort timer is still armed, so a response
		// that sends headers and then stalls is also cut off by the timeout.
		payload = await response.json();
	} catch {
		return json(fallback(transcript));
	} finally {
		clearTimeout(timer);
	}

	const content = (payload as { choices?: Array<{ message?: { content?: unknown } }> } | null)
		?.choices?.[0]?.message?.content;
	// Anything other than a plain string cannot be scanned for JSON.
	if (typeof content !== 'string') return json(fallback(transcript));

	return json(coerce(extractJson(content), transcript));
};
|
||||
Loading…
Add table
Add a link
Reference in a new issue