test(voice/parse-task): unit tests for coerce + transcriptMentions guards

The deterministic guards in parse-task's coerce() are the load-bearing defense against gemma3 hallucinating dueDate / priority on bare tasks. The integration tests against the live LLM cover the happy path end-to-end, but they go offline as soon as mana-llm is unreachable — the unit tests cover the guard logic in isolation with synthetic LLM responses, so a regression in the rules is caught even when the LLM itself is dark. 22 cases: - transcriptMentions: substring matching, case-insensitivity, empty pattern list, the German + English date words from the few-shot examples, and the negative cases ("Mülltonnen rausstellen", "Buy milk") that must NOT trigger. - coerce: fallback shape on garbage input, transcript-as-title when the model omits one, time-component stripping ("2026-04-09T14:00:00" → "2026-04-09"), malformed dueDate rejection, the dueDate / priority hallucination guards (drop when the transcript has no trigger word), real-date / real-priority preservation, label filtering (cap at 3, drop non-strings, empty array on non-array input), invalid priority value rejection. Helpers exported solely for the tests via a __test object — the production endpoint goes through buildPrompt + coerce as before. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-14 23:21:08 +02:00 · 2026-04-08 17:04:52 +02:00 · 2026-04-08 17:04:52 +02:00 · 0119e48edb
commit 0119e48edb
parent 7007140d13
2 changed files with 197 additions and 2 deletions
--- a/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts
+++ b/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/+server.ts
@ -164,12 +164,17 @@ const PRIORITY_TRIGGER_PATTERNS = [
 	'whenever',
 ];

-function transcriptMentions(transcript: string, patterns: string[]): boolean {
+/** Exported for unit tests. */
+export function transcriptMentions(transcript: string, patterns: string[]): boolean {
 	const lower = transcript.toLowerCase();
 	return patterns.some((p) => lower.includes(p));
 }

-function coerce(raw: unknown, transcript: string): ParseResult {
+/** Exported for unit tests. */
+export const __test = { DATE_TRIGGER_PATTERNS, PRIORITY_TRIGGER_PATTERNS };
+
+/** Exported for unit tests. */
+export function coerce(raw: unknown, transcript: string): ParseResult {
 	if (!raw || typeof raw !== 'object') return fallback(transcript);
 	const r = raw as Record<string, unknown>;
 	const title = typeof r.title === 'string' && r.title.trim() ? r.title.trim() : transcript.trim();
--- a/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/coerce.test.ts
+++ b/apps/mana/apps/web/src/routes/api/v1/voice/parse-task/coerce.test.ts
@ -0,0 +1,190 @@
+/**
+ * Unit tests for the deterministic post-processing in parse-task's
+ * coerce() helper. The hallucination guards exist because gemma3:4b
+ * (and even 12b under some prompts) consistently emits dueDate /
+ * priority values for tasks that don't actually mention a date or
+ * urgency word — bare quick-add lines like "Mülltonnen rausstellen"
+ * would otherwise come back with today's date and "low" priority.
+ *
+ * The integration tests against the live LLM cover the happy path
+ * end-to-end. These tests cover the guard logic in isolation, with
+ * synthetic LLM responses, so a regression in the coerce rules is
+ * caught even when the LLM is offline.
+ */
+
+import { describe, it, expect } from 'vitest';
+import { coerce, transcriptMentions, __test } from './+server';
+
+const { DATE_TRIGGER_PATTERNS, PRIORITY_TRIGGER_PATTERNS } = __test;
+
+describe('transcriptMentions', () => {
+	it('returns true on an exact substring hit', () => {
+		expect(transcriptMentions('Anna nächsten Montag anrufen', ['montag'])).toBe(true);
+	});
+
+	it('matches case-insensitively', () => {
+		expect(transcriptMentions('MORGEN um 14 Uhr', ['morgen'])).toBe(true);
+	});
+
+	it('returns false when no pattern hits', () => {
+		expect(transcriptMentions('Mülltonnen rausstellen', ['heut', 'morgen', 'tomorrow'])).toBe(
+			false
+		);
+	});
+
+	it('returns false on empty pattern list', () => {
+		expect(transcriptMentions('something', [])).toBe(false);
+	});
+});
+
+describe('DATE_TRIGGER_PATTERNS sanity', () => {
+	it('catches the German date words used in the few-shot examples', () => {
+		expect(transcriptMentions('Steuererklärung morgen 14 Uhr', DATE_TRIGGER_PATTERNS)).toBe(true);
+		expect(transcriptMentions('Anna nächsten Montag anrufen', DATE_TRIGGER_PATTERNS)).toBe(true);
+		expect(transcriptMentions('Mama am Wochenende besuchen', DATE_TRIGGER_PATTERNS)).toBe(true);
+	});
+
+	it('catches the English date words used in the few-shot examples', () => {
+		expect(transcriptMentions('Call dentist tomorrow at 3pm', DATE_TRIGGER_PATTERNS)).toBe(true);
+	});
+
+	it('does NOT trigger on the bare-task example', () => {
+		expect(transcriptMentions('Mülltonnen rausstellen', DATE_TRIGGER_PATTERNS)).toBe(false);
+		expect(transcriptMentions('Buy milk', DATE_TRIGGER_PATTERNS)).toBe(false);
+	});
+});
+
+describe('PRIORITY_TRIGGER_PATTERNS sanity', () => {
+	it('catches the German urgency words used in the examples', () => {
+		expect(
+			transcriptMentions('Steuererklärung morgen unbedingt erledigen', PRIORITY_TRIGGER_PATTERNS)
+		).toBe(true);
+	});
+
+	it('does NOT trigger on neutral transcripts', () => {
+		expect(transcriptMentions('Mülltonnen rausstellen', PRIORITY_TRIGGER_PATTERNS)).toBe(false);
+		expect(transcriptMentions('Anna nächsten Montag anrufen', PRIORITY_TRIGGER_PATTERNS)).toBe(
+			false
+		);
+	});
+});
+
+describe('coerce', () => {
+	const fallbackResult = {
+		title: 'Sprachaufgabe',
+		dueDate: null,
+		priority: null,
+		labels: [],
+	};
+
+	it('falls back when raw is not an object', () => {
+		expect(coerce(null, '')).toEqual(fallbackResult);
+		expect(coerce('not json', '')).toEqual(fallbackResult);
+		expect(coerce(42, '')).toEqual(fallbackResult);
+	});
+
+	it('uses the transcript as title when the model omits one', () => {
+		expect(coerce({}, 'Mülltonnen rausstellen').title).toBe('Mülltonnen rausstellen');
+	});
+
+	it('passes through a clean structured response untouched', () => {
+		const result = coerce(
+			{
+				title: 'Steuererklärung erledigen',
+				dueDate: '2026-04-09',
+				priority: 'high',
+				labels: ['steuern'],
+			},
+			'Steuererklärung morgen 14 Uhr unbedingt erledigen'
+		);
+		expect(result).toEqual({
+			title: 'Steuererklärung erledigen',
+			dueDate: '2026-04-09',
+			priority: 'high',
+			labels: ['steuern'],
+		});
+	});
+
+	it('strips a time component from dueDate', () => {
+		const result = coerce(
+			{ title: 'X', dueDate: '2026-04-09T14:00:00', priority: null, labels: [] },
+			'X morgen 14 Uhr'
+		);
+		expect(result.dueDate).toBe('2026-04-09');
+	});
+
+	it('rejects a malformed dueDate string', () => {
+		const result = coerce(
+			{ title: 'X', dueDate: 'tomorrow', priority: null, labels: [] },
+			'X morgen'
+		);
+		expect(result.dueDate).toBeNull();
+	});
+
+	it('drops a hallucinated dueDate when transcript has no date words', () => {
+		// gemma3:4b's classic failure: stamps today on a bare task
+		const result = coerce(
+			{ title: 'Mülltonnen rausstellen', dueDate: '2026-04-08', priority: null, labels: ['müll'] },
+			'Mülltonnen rausstellen'
+		);
+		expect(result.dueDate).toBeNull();
+	});
+
+	it('keeps a real dueDate when transcript actually mentions a date', () => {
+		const result = coerce(
+			{
+				title: 'Steuererklärung erledigen',
+				dueDate: '2026-04-09',
+				priority: null,
+				labels: [],
+			},
+			'Steuererklärung morgen erledigen'
+		);
+		expect(result.dueDate).toBe('2026-04-09');
+	});
+
+	it('drops a hallucinated priority when transcript has no urgency words', () => {
+		const result = coerce(
+			{ title: 'Steuererklärung', dueDate: null, priority: 'high', labels: [] },
+			'Steuererklärung machen'
+		);
+		expect(result.priority).toBeNull();
+	});
+
+	it('keeps a real priority when transcript actually mentions urgency', () => {
+		const result = coerce(
+			{ title: 'X', dueDate: null, priority: 'high', labels: [] },
+			'X unbedingt erledigen'
+		);
+		expect(result.priority).toBe('high');
+	});
+
+	it('rejects an invalid priority value', () => {
+		const result = coerce(
+			{ title: 'X', dueDate: null, priority: 'critical', labels: [] },
+			'X unbedingt'
+		);
+		expect(result.priority).toBeNull();
+	});
+
+	it('caps labels at 3 entries', () => {
+		const result = coerce(
+			{ title: 'X', dueDate: null, priority: null, labels: ['a', 'b', 'c', 'd', 'e'] },
+			'X'
+		);
+		expect(result.labels).toEqual(['a', 'b', 'c']);
+	});
+
+	it('drops non-string label entries', () => {
+		const result = coerce(
+			{ title: 'X', dueDate: null, priority: null, labels: ['a', 42, null, 'b'] },
+			'X'
+		);
+		expect(result.labels).toEqual(['a', 'b']);
+	});
+
+	it('returns empty labels array when raw.labels is not an array', () => {
+		const result = coerce({ title: 'X', dueDate: null, priority: null, labels: 'nope' }, 'X');
+		expect(result.labels).toEqual([]);
+	});
+});