test(voice/parse-task): unit tests for coerce + transcriptMentions guards

The deterministic guards in parse-task's coerce() are the load-bearing
defense against gemma3 hallucinating dueDate / priority on bare tasks.
The integration tests against the live LLM cover the happy path
end-to-end, but they go offline as soon as mana-llm is unreachable —
the unit tests cover the guard logic in isolation with synthetic LLM
responses, so a regression in the rules is caught even when the LLM
itself is dark.

22 cases:

- transcriptMentions: substring matching, case-insensitivity, empty
  pattern list, the German + English date words from the few-shot
  examples, and the negative cases ("Mülltonnen rausstellen",
  "Buy milk") that must NOT trigger.

- coerce: fallback shape on garbage input, transcript-as-title when
  the model omits one, time-component stripping ("2026-04-09T14:00:00"
  → "2026-04-09"), malformed dueDate rejection, the dueDate /
  priority hallucination guards (drop when the transcript has no
  trigger word), real-date / real-priority preservation, label
  filtering (cap at 3, drop non-strings, empty array on non-array
  input), invalid priority value rejection.

Helpers exported solely for the tests via a __test object — the
production endpoint goes through buildPrompt + coerce as before.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-08 17:04:52 +02:00
parent 7007140d13
commit 0119e48edb
2 changed files with 197 additions and 2 deletions

View file

@ -164,12 +164,17 @@ const PRIORITY_TRIGGER_PATTERNS = [
'whenever',
];
function transcriptMentions(transcript: string, patterns: string[]): boolean {
/** Exported for unit tests. */
export function transcriptMentions(transcript: string, patterns: string[]): boolean {
const lower = transcript.toLowerCase();
return patterns.some((p) => lower.includes(p));
}
function coerce(raw: unknown, transcript: string): ParseResult {
/** Exported for unit tests. */
export const __test = { DATE_TRIGGER_PATTERNS, PRIORITY_TRIGGER_PATTERNS };
/** Exported for unit tests. */
export function coerce(raw: unknown, transcript: string): ParseResult {
if (!raw || typeof raw !== 'object') return fallback(transcript);
const r = raw as Record<string, unknown>;
const title = typeof r.title === 'string' && r.title.trim() ? r.title.trim() : transcript.trim();

View file

@ -0,0 +1,190 @@
/**
* Unit tests for the deterministic post-processing in parse-task's
* coerce() helper. The hallucination guards exist because gemma3:4b
* (and even 12b under some prompts) consistently emits dueDate /
* priority values for tasks that don't actually mention a date or
* urgency word bare quick-add lines like "Mülltonnen rausstellen"
* would otherwise come back with today's date and "low" priority.
*
* The integration tests against the live LLM cover the happy path
* end-to-end. These tests cover the guard logic in isolation, with
* synthetic LLM responses, so a regression in the coerce rules is
* caught even when the LLM is offline.
*/
import { describe, it, expect } from 'vitest';
import { coerce, transcriptMentions, __test } from './+server';
const { DATE_TRIGGER_PATTERNS, PRIORITY_TRIGGER_PATTERNS } = __test;
describe('transcriptMentions', () => {
it('returns true on an exact substring hit', () => {
expect(transcriptMentions('Anna nächsten Montag anrufen', ['montag'])).toBe(true);
});
it('matches case-insensitively', () => {
expect(transcriptMentions('MORGEN um 14 Uhr', ['morgen'])).toBe(true);
});
it('returns false when no pattern hits', () => {
expect(transcriptMentions('Mülltonnen rausstellen', ['heut', 'morgen', 'tomorrow'])).toBe(
false
);
});
it('returns false on empty pattern list', () => {
expect(transcriptMentions('something', [])).toBe(false);
});
});
describe('DATE_TRIGGER_PATTERNS sanity', () => {
it('catches the German date words used in the few-shot examples', () => {
expect(transcriptMentions('Steuererklärung morgen 14 Uhr', DATE_TRIGGER_PATTERNS)).toBe(true);
expect(transcriptMentions('Anna nächsten Montag anrufen', DATE_TRIGGER_PATTERNS)).toBe(true);
expect(transcriptMentions('Mama am Wochenende besuchen', DATE_TRIGGER_PATTERNS)).toBe(true);
});
it('catches the English date words used in the few-shot examples', () => {
expect(transcriptMentions('Call dentist tomorrow at 3pm', DATE_TRIGGER_PATTERNS)).toBe(true);
});
it('does NOT trigger on the bare-task example', () => {
expect(transcriptMentions('Mülltonnen rausstellen', DATE_TRIGGER_PATTERNS)).toBe(false);
expect(transcriptMentions('Buy milk', DATE_TRIGGER_PATTERNS)).toBe(false);
});
});
describe('PRIORITY_TRIGGER_PATTERNS sanity', () => {
it('catches the German urgency words used in the examples', () => {
expect(
transcriptMentions('Steuererklärung morgen unbedingt erledigen', PRIORITY_TRIGGER_PATTERNS)
).toBe(true);
});
it('does NOT trigger on neutral transcripts', () => {
expect(transcriptMentions('Mülltonnen rausstellen', PRIORITY_TRIGGER_PATTERNS)).toBe(false);
expect(transcriptMentions('Anna nächsten Montag anrufen', PRIORITY_TRIGGER_PATTERNS)).toBe(
false
);
});
});
describe('coerce', () => {
const fallbackResult = {
title: 'Sprachaufgabe',
dueDate: null,
priority: null,
labels: [],
};
it('falls back when raw is not an object', () => {
expect(coerce(null, '')).toEqual(fallbackResult);
expect(coerce('not json', '')).toEqual(fallbackResult);
expect(coerce(42, '')).toEqual(fallbackResult);
});
it('uses the transcript as title when the model omits one', () => {
expect(coerce({}, 'Mülltonnen rausstellen').title).toBe('Mülltonnen rausstellen');
});
it('passes through a clean structured response untouched', () => {
const result = coerce(
{
title: 'Steuererklärung erledigen',
dueDate: '2026-04-09',
priority: 'high',
labels: ['steuern'],
},
'Steuererklärung morgen 14 Uhr unbedingt erledigen'
);
expect(result).toEqual({
title: 'Steuererklärung erledigen',
dueDate: '2026-04-09',
priority: 'high',
labels: ['steuern'],
});
});
it('strips a time component from dueDate', () => {
const result = coerce(
{ title: 'X', dueDate: '2026-04-09T14:00:00', priority: null, labels: [] },
'X morgen 14 Uhr'
);
expect(result.dueDate).toBe('2026-04-09');
});
it('rejects a malformed dueDate string', () => {
const result = coerce(
{ title: 'X', dueDate: 'tomorrow', priority: null, labels: [] },
'X morgen'
);
expect(result.dueDate).toBeNull();
});
it('drops a hallucinated dueDate when transcript has no date words', () => {
// gemma3:4b's classic failure: stamps today on a bare task
const result = coerce(
{ title: 'Mülltonnen rausstellen', dueDate: '2026-04-08', priority: null, labels: ['müll'] },
'Mülltonnen rausstellen'
);
expect(result.dueDate).toBeNull();
});
it('keeps a real dueDate when transcript actually mentions a date', () => {
const result = coerce(
{
title: 'Steuererklärung erledigen',
dueDate: '2026-04-09',
priority: null,
labels: [],
},
'Steuererklärung morgen erledigen'
);
expect(result.dueDate).toBe('2026-04-09');
});
it('drops a hallucinated priority when transcript has no urgency words', () => {
const result = coerce(
{ title: 'Steuererklärung', dueDate: null, priority: 'high', labels: [] },
'Steuererklärung machen'
);
expect(result.priority).toBeNull();
});
it('keeps a real priority when transcript actually mentions urgency', () => {
const result = coerce(
{ title: 'X', dueDate: null, priority: 'high', labels: [] },
'X unbedingt erledigen'
);
expect(result.priority).toBe('high');
});
it('rejects an invalid priority value', () => {
const result = coerce(
{ title: 'X', dueDate: null, priority: 'critical', labels: [] },
'X unbedingt'
);
expect(result.priority).toBeNull();
});
it('caps labels at 3 entries', () => {
const result = coerce(
{ title: 'X', dueDate: null, priority: null, labels: ['a', 'b', 'c', 'd', 'e'] },
'X'
);
expect(result.labels).toEqual(['a', 'b', 'c']);
});
it('drops non-string label entries', () => {
const result = coerce(
{ title: 'X', dueDate: null, priority: null, labels: ['a', 42, null, 'b'] },
'X'
);
expect(result.labels).toEqual(['a', 'b']);
});
it('returns empty labels array when raw.labels is not an array', () => {
const result = coerce({ title: 'X', dueDate: null, priority: null, labels: 'nope' }, 'X');
expect(result.labels).toEqual([]);
});
});