From 7007140d136f49d05a27d7d583f5e749eb3fcf86 Mon Sep 17 00:00:00 2001
From: Till JS <tills95@gmail.com>
Date: Wed, 8 Apr 2026 16:59:32 +0200
Subject: [PATCH] fix(voice): switch to gemma3:12b + few-shot prompt for
 parse-task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two related changes that fall out of real end-to-end testing against
the now-working local mana-llm.

1. Default model bumped from gemma3:4b to gemma3:12b for both
   parse-task and parse-habit. The 4b model gets weekday math
   off-by-one ("nächsten Montag" from a Wednesday → 2026-04-14
   instead of 2026-04-13) and aggressively shortens titles ("Anna
   anrufen" → "Anrufen"). It also frequently paraphrases habit names
   instead of copying them verbatim ("Joggen" instead of "Laufen");
   the verbatim validation in coerce() then drops the paraphrased
   name, costing an LLM round-trip for nothing. The 12b variant is
   roughly 10% slower for these tiny prompts (~1.1s vs ~1.0s on the
   GPU box), so the accuracy win is essentially free.

2. parse-task prompt rewritten as few-shot. Pure rule descriptions
   were *worse* than simple examples — the long "Rules — read
   carefully" section in the previous prompt actually made the model
   compute next Monday as 2026-04-14 even though a direct "what date
   is next Monday?" prompt to the same model returned 2026-04-13.
   The detailed rules were also priming the model to over-shorten
   titles and over-eagerly tag filler words. Five worked examples
   (including the previously-failing "Anna nächsten Montag anrufen"
   case) all come back correct now, as does one novel case that is
   not among the examples ("Mama am Wochenende besuchen").

The deterministic guards in coerce() are kept as a backstop for the
day the GPU box swaps in a weaker model — they're cheap and don't
hurt the happy path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../api/v1/voice/parse-habit/+server.ts       |  8 ++-
 .../routes/api/v1/voice/parse-task/+server.ts | 70 +++++++++++--------
 2 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/apps/mana/apps/web/src/routes/api/v1/voice/parse-habit/+server.ts b/apps/mana/apps/web/src/routes/api/v1/voice/parse-habit/+server.ts
index d0c4c3217..ca3670673 100644
--- a/apps/mana/apps/web/src/routes/api/v1/voice/parse-habit/+server.ts
+++ b/apps/mana/apps/web/src/routes/api/v1/voice/parse-habit/+server.ts
@@ -29,7 +29,13 @@ import type { RequestHandler } from './$types';
 const MAX_TRANSCRIPT_CHARS = 500;
 const MAX_HABITS = 50;
 const LLM_TIMEOUT_MS = 8000;
-const DEFAULT_MODEL = 'ollama/gemma3:4b';
+// gemma3:12b is more consistent than 4b at the "pick from this list,
+// don't paraphrase" instruction — 4b sometimes returns "Joggen" when
+// "Laufen" was in the list, which the verbatim-validation in coerce
+// then drops, costing an LLM round-trip for nothing. The accuracy
+// win matters more here than for parse-task because parse-habit only
+// runs at all when the cheap client-side substring fast path missed.
+const DEFAULT_MODEL = 'ollama/gemma3:12b';
 
 interface ParseResult {
 	match: string | null;
diff --git a/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts b/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts
index 0663d21a2..4e70bf951 100644
--- a/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts
+++ b/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts
@@ -30,46 +30,56 @@ interface ParseResult {
 
 const MAX_TRANSCRIPT_CHARS = 1000;
 const LLM_TIMEOUT_MS = 8000;
-const DEFAULT_MODEL = 'ollama/gemma3:4b';
+// gemma3:12b consistently nails relative date math ("nächsten Montag"
+// from a Wednesday → next Monday's date) and respects "null when
+// absent" for both dueDate and priority. gemma3:4b gets weekday math
+// off-by-one and stamps today's date on every bare task. The 12b
+// model is only ~10% slower in practice on the GPU box (~1.1s vs
+// ~1.0s for these tiny prompts) so the accuracy win is essentially
+// free. The deterministic guards in coerce() are still kept as a
+// safety net in case the GPU box swaps in a weaker model.
+const DEFAULT_MODEL = 'ollama/gemma3:12b';
 
 function fallback(transcript: string): ParseResult {
 	return { title: transcript.trim() || 'Sprachaufgabe', dueDate: null, priority: null, labels: [] };
 }
 
 function buildPrompt(transcript: string, language: string): string {
-	const today = new Date().toISOString().slice(0, 10);
+	const now = new Date();
+	const today = now.toISOString().slice(0, 10);
+	const weekday = now.toLocaleDateString('en-US', { weekday: 'long' });
 	const langName = language === 'de' ? 'German' : language === 'en' ? 'English' : language;
+	// Few-shot prompt. Pure rule descriptions made gemma3:12b drop
+	// subjects from titles ("Anna anrufen" → "Anrufen") and miscount
+	// weekdays (off-by-one for "nächsten Montag"). Showing the model
+	// what good output looks like for the exact failure modes works
+	// where prose instructions don't. The deterministic guards in
+	// coerce() are still kept as a backstop.
 	return [
-		`You are a task parser. The user spoke a task in ${langName}.`,
-		`Today is ${today}.`,
+		`You parse spoken ${langName} tasks into JSON. Today is ${today} (${weekday}).`,
 		'',
-		'Extract the following fields and return ONLY a JSON object with these exact keys:',
-		'  - title: short imperative title without filler words (string, required)',
-		'  - dueDate: ISO date YYYY-MM-DD',
-		'  - priority: "low" | "medium" | "high"',
-		'  - labels: array of short topic labels (max 3, lowercase)',
+		'Output ONLY a JSON object. No code fences. Keys:',
+		'  title    — keep the full subject, just drop filler words',
+		'  dueDate  — YYYY-MM-DD, or null if no date is mentioned',
+		'  priority — "low" | "medium" | "high", or null if not mentioned',
+		'  labels   — array of short topic words from the transcript (may be empty)',
 		'',
-		'Rules — read carefully, the model often gets these wrong:',
-		'- dueDate: ONLY set this when the transcript explicitly mentions a',
-		'  date, weekday, or relative time word ("morgen", "tomorrow",',
-		'  "nächsten Montag", "heute Abend", "in zwei Wochen"). For a bare',
-		'  task like "Mülltonnen rausstellen" with no time at all, dueDate',
-		'  MUST be null. Never default to today just because the task feels',
-		'  like a today-thing.',
-		'- priority: ONLY set this when the transcript uses urgency or',
-		'  importance words ("dringend", "wichtig", "unbedingt", "asap",',
-		'  "low priority", "kann warten"). For a neutral task, priority',
-		'  MUST be null. Never guess from the topic.',
-		'- labels: ONLY include labels that come directly from concrete',
-		'  topic words in the transcript. For "Mülltonnen rausstellen",',
-		'  "müll" is fine but "haushalt" is a stretch — when in doubt,',
-		'  empty array. Max 3 labels, single words preferred.',
-		'- Resolve relative dates against today for the dueDate field.',
-		'- If only a time is mentioned (e.g. "um 14 Uhr"), assume today.',
-		'- title: a short imperative ("Steuererklärung machen", not',
-		'  "Erinnere mich an die Steuererklärung").',
-		'- Output JSON only, no markdown, no commentary, no code fences.',
-		'- Use null (literal, not the string "null") for absent fields.',
+		'Examples (assume today is 2026-04-08, a Wednesday):',
+		'',
+		'Transcript: "Mülltonnen rausstellen"',
+		'{"title":"Mülltonnen rausstellen","dueDate":null,"priority":null,"labels":["müll"]}',
+		'',
+		'Transcript: "Steuererklärung morgen 14 Uhr unbedingt erledigen"',
+		'{"title":"Steuererklärung erledigen","dueDate":"2026-04-09","priority":"high","labels":["steuern"]}',
+		'',
+		'Transcript: "Anna nächsten Montag anrufen"',
+		'{"title":"Anna anrufen","dueDate":"2026-04-13","priority":null,"labels":["anruf"]}',
+		'',
+		'Transcript: "Buy milk"',
+		'{"title":"Buy milk","dueDate":null,"priority":null,"labels":["grocery"]}',
+		'',
+		'Transcript: "Call dentist tomorrow at 3pm"',
+		'{"title":"Call dentist","dueDate":"2026-04-09","priority":null,"labels":["dentist"]}',
 		'',
 		`Transcript: ${JSON.stringify(transcript)}`,
 	].join('\n');