mirror of https://github.com/Memo-2023/mana-monorepo.git (synced 2026-05-14 20:01:09 +02:00)
feat(shared-ai): route compactor to Haiku-tier model by default (M2.5)
compactHistory() now defaults to DEFAULT_COMPACT_MODEL = 'google/gemini-2.5-flash-lite' when the caller doesn't override. Lite is ~3–5x cheaper than gemini-2.5-flash with near-identical summarisation quality: summarisation doesn't need the same tier as reasoning plus tool-calling, and the compactor fires exactly when token spend is highest, so the cheaper route saves where it matters most.

CompactHistoryOptions.model is now optional. All three consumers (mana-ai tick, webapp Companion, webapp Mission runner) drop their explicit gemini-2.5-flash override and let the default apply.

This is the pragmatic M2.5: no mana-llm changes. The "tier" abstraction (X-Model-Tier header, env-routed aliases) from the Claude-Code report makes sense only once multiple utility tasks need cheaper routing (topic detection, classification, command-injection checks). Today only the compactor wants it, and a model constant is the simplest contract that works.

Tests: 2 new (default applied, override honoured); 79 shared-ai tests green, and all three consumers type-check clean. One pre-existing, unrelated type error remains in apps/mana/apps/web/src/lib/modules/wardrobe/queries.ts (not touched by this commit).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
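Caller-side contract, sketched below. The import path '@mana/shared-ai' and the foldHistory wrapper are illustrative assumptions, not repo code; compactHistory, DEFAULT_COMPACT_MODEL, and LlmClient are the real names this commit touches.

    // Illustrative sketch only: import path, wrapper name, and the
    // assumption that LlmClient is exported are not confirmed by this repo.
    import { compactHistory, DEFAULT_COMPACT_MODEL, type LlmClient } from '@mana/shared-ai';

    type Msgs = Parameters<typeof compactHistory>[0];

    export async function foldHistory(llm: LlmClient, msgs: Msgs) {
      // No model passed: compaction routes to DEFAULT_COMPACT_MODEL
      // ('google/gemini-2.5-flash-lite') instead of the planner's model.
      const res = await compactHistory(msgs, { llm });

      // Per-call override remains available for consumers with their own cost
      // policy: compactHistory(msgs, { llm, model: 'google/gemini-2.5-flash' })
      return { messages: res.messages, compactedTurns: res.compactedTurns };
    }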
parent 2769241de3
commit f7536bc0b9
7 changed files with 83 additions and 16 deletions
@@ -281,16 +281,15 @@ async function runMissionInner(
     // prior steps in the same round.
     isParallelSafe: (name) => AI_TOOL_CATALOG_BY_NAME.get(name)?.defaultPolicy === 'auto',
     // Fold older turns into a compact-summary at 92% of
-    // maxContextTokens. Same LlmClient + model as the
-    // planner; one extra LLM call, but only when usage
-    // actually approaches the ceiling.
+    // maxContextTokens. compactHistory defaults to
+    // DEFAULT_COMPACT_MODEL (gemini-2.5-flash-lite) —
+    // cheaper than the planner's primary model, which
+    // matters because the compactor fires exactly when
+    // token spend is highest.
     compactor: {
       maxContextTokens: COMPACT_MAX_CTX,
       compact: async (msgs) => {
-        const res = await compactHistory(msgs, {
-          llm: deps.llm,
-          model: deps.model ?? 'google/gemini-2.5-flash',
-        });
+        const res = await compactHistory(msgs, { llm: deps.llm });
         return { messages: res.messages, compactedTurns: res.compactedTurns };
       },
     },
@@ -123,12 +123,14 @@ export async function runCompanionChat(
     // user-visible intent order in the proposal inbox.
     isParallelSafe: (name) => AI_TOOL_CATALOG_BY_NAME.get(name)?.defaultPolicy === 'auto',
     // Fold the middle of messages into a compact-summary at
-    // 92% of the model's context window. Mirrors the mana-ai
-    // wiring; one call to the same LLM client, same model.
+    // 92% of the model's context window. compactHistory
+    // defaults to DEFAULT_COMPACT_MODEL (gemini-2.5-flash-lite)
+    // — cheaper than the planner's own model. Summarisation
+    // doesn't need the same tier as reasoning.
     compactor: {
       maxContextTokens: COMPACT_MAX_CTX,
       compact: async (msgs) => {
-        const res = await compactHistory(msgs, { llm, model: 'google/gemini-2.5-flash' });
+        const res = await compactHistory(msgs, { llm });
         return { messages: res.messages, compactedTurns: res.compactedTurns };
       },
     },
@@ -88,6 +88,7 @@ export {
   compactHistory,
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   MockLlmClient,
   parseCompactSummary,
@@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest';
 import {
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   compactHistory,
   parseCompactSummary,
@@ -191,6 +192,46 @@ describe('compactHistory', () => {
     expect(res.usage).toEqual({ promptTokens: 100, completionTokens: 30 });
   });

+  it('defaults to DEFAULT_COMPACT_MODEL when model is omitted (fast-tier routing)', async () => {
+    const history = buildHistory(3, 4);
+    const seenModels: string[] = [];
+    const capturingLlm = {
+      async complete(req: { model: string }) {
+        seenModels.push(req.model);
+        return {
+          content: '## Goal\n\n## Decisions\n\n## Tools Called\n\n## Current Progress\n',
+          toolCalls: [],
+          finishReason: 'stop' as const,
+        };
+      },
+    };
+
+    await compactHistory(history, { llm: capturingLlm }); // no explicit model
+
+    expect(seenModels).toHaveLength(1);
+    expect(seenModels[0]).toBe(DEFAULT_COMPACT_MODEL);
+    expect(DEFAULT_COMPACT_MODEL).toBe('google/gemini-2.5-flash-lite');
+  });
+
+  it('honours an explicit model override', async () => {
+    const history = buildHistory(3, 4);
+    const seenModels: string[] = [];
+    const capturingLlm = {
+      async complete(req: { model: string }) {
+        seenModels.push(req.model);
+        return {
+          content: '## Goal\n\n## Decisions\n\n## Tools Called\n\n## Current Progress\n',
+          toolCalls: [],
+          finishReason: 'stop' as const,
+        };
+      },
+    };
+
+    await compactHistory(history, { llm: capturingLlm, model: 'custom/override-model' });
+
+    expect(seenModels[0]).toBe('custom/override-model');
+  });
+
   it('respects a custom keepRecent value', async () => {
     const history = buildHistory(5, 6);
     const llm = new MockLlmClient().enqueueStop('## Goal\n\n## Decisions\n');
@@ -37,6 +37,21 @@ export const DEFAULT_COMPACT_THRESHOLD = 0.92;
  * should stay intact for coherence. */
 export const DEFAULT_COMPACT_KEEP_RECENT = 4;

+/**
+ * Cheap "fast-tier" model the compactor runs on by default. Matches
+ * Claude Code's pattern of routing utility tasks (summarisation,
+ * topic detection, session-summary) to Haiku instead of burning the
+ * primary-tier budget on them.
+ *
+ * google/gemini-2.5-flash-lite is ~3–5x cheaper than gemini-2.5-flash
+ * with near-identical summarisation quality. Consumers that need
+ * something different (cost policy, offline fallback to Ollama) can
+ * override per-call via `CompactHistoryOptions.model`.
+ *
+ * Format follows mana-llm's `provider/model` convention.
+ */
+export const DEFAULT_COMPACT_MODEL = 'google/gemini-2.5-flash-lite';
+
 /**
  * Decide whether to compact based on token usage against a ceiling.
  * Returns false on missing inputs so the caller can skip silently when
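For illustration, the per-call escape hatch the docblock above describes. The Ollama model id and the localLlm variable are hypothetical, not confirmed by this repo:

    // Hypothetical consumer pinning a local model instead of the default;
    // 'ollama/llama3.1' merely follows mana-llm's provider/model convention.
    const res = await compactHistory(msgs, { llm: localLlm, model: 'ollama/llama3.1' });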
@@ -122,7 +137,11 @@ export function renderCompactSummary(s: CompactSummary): string {

 export interface CompactHistoryOptions {
   readonly llm: LlmClient;
-  readonly model: string;
+  /** Model to summarise with. Defaults to `DEFAULT_COMPACT_MODEL`
+   * (gemini-2.5-flash-lite) — cheaper than the primary planner
+   * model, which is the whole point: summarisation doesn't need
+   * the same tier as reasoning + tool-calling. */
+  readonly model?: string;
   /** How many most-recent turns to preserve verbatim. Default 4. */
   readonly keepRecent?: number;
   /** Upper bound on compactor-LLM temperature — we want summarisation,
@@ -222,7 +241,7 @@ export async function compactHistory(
   const response = await opts.llm.complete({
     messages: compactRequestMessages,
     tools: [],
-    model: opts.model,
+    model: opts.model ?? DEFAULT_COMPACT_MODEL,
     temperature: opts.temperature ?? 0.2,
   });
@@ -13,6 +13,7 @@ export { runPlannerLoop, LOOP_STATE_RECENT_CALLS_WINDOW, PARALLEL_TOOL_BATCH_SIZ
 export {
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   compactHistory,
   parseCompactSummary,
@@ -396,15 +396,19 @@ async function planOneMission(
   const plannerModel = 'google/gemini-2.5-flash';

   // Claude-Code wU2 pattern: fold the middle of messages into a structured
-  // summary once cumulative tokens cross 92% of maxContextTokens. Uses
-  // the same LLM + model as the planner itself; later we can route this
-  // to a cheaper model (Haiku tier) when mana-llm supports it.
+  // summary once cumulative tokens cross 92% of maxContextTokens.
+  //
+  // compactHistory defaults to DEFAULT_COMPACT_MODEL
+  // (gemini-2.5-flash-lite) — cheaper than the planner's own model.
+  // Summarisation doesn't need the same reasoning tier as tool-calling,
+  // and the compactor runs exactly when token spend is highest, so the
+  // cheaper route saves tokens where they matter.
   const compactor =
     config.compactMaxContextTokens > 0
       ? {
           maxContextTokens: config.compactMaxContextTokens,
           compact: async (msgs: Parameters<typeof compactHistory>[0]) => {
-            const result = await compactHistory(msgs, { llm, model: plannerModel });
+            const result = await compactHistory(msgs, { llm });
             if (result.compactedTurns > 0) {
               compactionsTriggeredTotal.inc();
               compactedTurnsHistogram.observe(result.compactedTurns);