feat(shared-ai): route compactor to Haiku-tier model by default (M2.5)

compactHistory() now defaults to DEFAULT_COMPACT_MODEL =
'google/gemini-2.5-flash-lite' when the caller doesn't override. Lite
is ~3–5x cheaper than gemini-2.5-flash with near-identical
summarisation quality. Summarisation doesn't need the same tier as
reasoning + tool-calling, and the compactor fires precisely when token
spend peaks, so the cheaper route saves where it matters most.
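The trigger condition makes that concrete (a sketch; `shouldCompact`
and its signature are illustrative, modelled on the threshold helper
visible in the diff below):

    // Compaction only fires once the history nears the context ceiling,
    // i.e. at peak token spend -- where the cheap model pays off most.
    const THRESHOLD = 0.92; // DEFAULT_COMPACT_THRESHOLD in the diff below
    function shouldCompact(usedTokens?: number, ceiling?: number): boolean {
      // Returns false on missing inputs so the caller can skip silently.
      if (!usedTokens || !ceiling) return false;
      return usedTokens / ceiling >= THRESHOLD;
    }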

CompactHistoryOptions.model is now optional. All three consumers
(mana-ai tick, webapp Companion, webapp Mission runner) drop their
explicit gemini-2.5-flash override and let the default apply.
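At the call sites that reduces to dropping one field (a sketch; the
`llm` wiring and the override model string are illustrative):

    // Before: each consumer pinned the primary-tier model explicitly.
    await compactHistory(history, { llm, model: 'google/gemini-2.5-flash' });

    // After: omit `model` and the fast-tier default applies.
    await compactHistory(history, { llm });

    // Per-call override remains for cost policy or offline fallback:
    await compactHistory(history, { llm, model: 'ollama/llama3' });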

This is the pragmatic M2.5: no mana-llm changes. The "tier" abstraction
(X-Model-Tier header, env-routed aliases) from the Claude-Code report
only makes sense once multiple utility tasks need cheaper routing:
topic detection, classification, command-injection checks. Today only
the compactor wants it, and a model constant is the simplest contract
that works.
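For contrast, a rough sketch of the deferred tier shape (hypothetical
names; nothing here exists in mana-llm today):

    // Env-routed aliases resolved behind an X-Model-Tier header:
    type ModelTier = 'primary' | 'fast';
    const TIER_ROUTES: Record<ModelTier, string> = {
      primary: 'google/gemini-2.5-flash',    // reasoning + tool-calling
      fast: 'google/gemini-2.5-flash-lite',  // summarisation, classification
    };
    // Worth building once several utility tasks need routing; today one
    // exported constant covers the single consumer.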

2 new tests (default applied + override honoured). 79 shared-ai tests
green; all three consumers type-check clean. One pre-existing, unrelated
type error in apps/mana/apps/web/src/lib/modules/wardrobe/queries.ts
remains (not touched by this commit).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Till JS 2026-04-23 18:26:50 +02:00
parent 2769241de3
commit f7536bc0b9
7 changed files with 83 additions and 16 deletions


@@ -88,6 +88,7 @@ export {
   compactHistory,
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   MockLlmClient,
   parseCompactSummary,


@@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest';
 import {
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   compactHistory,
   parseCompactSummary,
@@ -191,6 +192,46 @@ describe('compactHistory', () => {
     expect(res.usage).toEqual({ promptTokens: 100, completionTokens: 30 });
   });
 
+  it('defaults to DEFAULT_COMPACT_MODEL when model is omitted (fast-tier routing)', async () => {
+    const history = buildHistory(3, 4);
+    const seenModels: string[] = [];
+    const capturingLlm = {
+      async complete(req: { model: string }) {
+        seenModels.push(req.model);
+        return {
+          content: '## Goal\n\n## Decisions\n\n## Tools Called\n\n## Current Progress\n',
+          toolCalls: [],
+          finishReason: 'stop' as const,
+        };
+      },
+    };
+
+    await compactHistory(history, { llm: capturingLlm }); // no explicit model
+
+    expect(seenModels).toHaveLength(1);
+    expect(seenModels[0]).toBe(DEFAULT_COMPACT_MODEL);
+    expect(DEFAULT_COMPACT_MODEL).toBe('google/gemini-2.5-flash-lite');
+  });
+
+  it('honours an explicit model override', async () => {
+    const history = buildHistory(3, 4);
+    const seenModels: string[] = [];
+    const capturingLlm = {
+      async complete(req: { model: string }) {
+        seenModels.push(req.model);
+        return {
+          content: '## Goal\n\n## Decisions\n\n## Tools Called\n\n## Current Progress\n',
+          toolCalls: [],
+          finishReason: 'stop' as const,
+        };
+      },
+    };
+
+    await compactHistory(history, { llm: capturingLlm, model: 'custom/override-model' });
+
+    expect(seenModels[0]).toBe('custom/override-model');
+  });
+
   it('respects a custom keepRecent value', async () => {
     const history = buildHistory(5, 6);
     const llm = new MockLlmClient().enqueueStop('## Goal\n\n## Decisions\n');


@@ -37,6 +37,21 @@ export const DEFAULT_COMPACT_THRESHOLD = 0.92;
  * should stay intact for coherence. */
 export const DEFAULT_COMPACT_KEEP_RECENT = 4;
 
+/**
+ * Cheap "fast-tier" model the compactor runs on by default. Matches
+ * Claude Code's pattern of routing utility tasks (summarisation,
+ * topic detection, session-summary) to Haiku instead of burning the
+ * primary-tier budget on them.
+ *
+ * google/gemini-2.5-flash-lite is ~3–5x cheaper than gemini-2.5-flash
+ * with near-identical summarisation quality. Consumers that need
+ * something different (cost policy, offline fallback to Ollama) can
+ * override per-call via `CompactHistoryOptions.model`.
+ *
+ * Format follows mana-llm's `provider/model` convention.
+ */
+export const DEFAULT_COMPACT_MODEL = 'google/gemini-2.5-flash-lite';
+
 /**
  * Decide whether to compact based on token usage against a ceiling.
  * Returns false on missing inputs so the caller can skip silently when
@@ -122,7 +137,11 @@ export function renderCompactSummary(s: CompactSummary): string {
 export interface CompactHistoryOptions {
   readonly llm: LlmClient;
-  readonly model: string;
+  /** Model to summarise with. Defaults to `DEFAULT_COMPACT_MODEL`
+   * (gemini-2.5-flash-lite), cheaper than the primary planner
+   * model, which is the whole point: summarisation doesn't need
+   * the same tier as reasoning + tool-calling. */
+  readonly model?: string;
   /** How many most-recent turns to preserve verbatim. Default 4. */
   readonly keepRecent?: number;
   /** Upper bound on compactor-LLM temperature: we want summarisation,
@@ -222,7 +241,7 @@ export async function compactHistory(
   const response = await opts.llm.complete({
     messages: compactRequestMessages,
     tools: [],
-    model: opts.model,
+    model: opts.model ?? DEFAULT_COMPACT_MODEL,
     temperature: opts.temperature ?? 0.2,
   });


@@ -13,6 +13,7 @@ export { runPlannerLoop, LOOP_STATE_RECENT_CALLS_WINDOW, PARALLEL_TOOL_BATCH_SIZ
 export {
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   compactHistory,
   parseCompactSummary,