feat(shared-ai): route compactor to Haiku-tier model by default (M2.5)

compactHistory() now defaults to DEFAULT_COMPACT_MODEL =
'google/gemini-2.5-flash-lite' when the caller doesn't override. Lite
is ~3–5x cheaper than gemini-2.5-flash with near-identical
summarisation quality. Summarisation doesn't need the same tier as
reasoning + tool-calling, and the compactor fires precisely when token
spend peaks, so the cheaper route saves where it matters most.
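The trigger condition makes that concrete (a sketch; `shouldCompact`
and its signature are illustrative, modelled on the threshold helper
visible in the diff below):

    // Compaction only fires once the history nears the context ceiling,
    // i.e. at peak token spend -- where the cheap model pays off most.
    const THRESHOLD = 0.92; // DEFAULT_COMPACT_THRESHOLD in the diff below
    function shouldCompact(usedTokens?: number, ceiling?: number): boolean {
      // Returns false on missing inputs so the caller can skip silently.
      if (!usedTokens || !ceiling) return false;
      return usedTokens / ceiling >= THRESHOLD;
    }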

CompactHistoryOptions.model is now optional. All three consumers
(mana-ai tick, webapp Companion, webapp Mission runner) drop their
explicit gemini-2.5-flash override and let the default apply.
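At the call sites that reduces to dropping one field (a sketch; the
`llm` wiring and the override model string are illustrative):

    // Before: each consumer pinned the primary-tier model explicitly.
    await compactHistory(history, { llm, model: 'google/gemini-2.5-flash' });

    // After: omit `model` and the fast-tier default applies.
    await compactHistory(history, { llm });

    // Per-call override remains for cost policy or offline fallback:
    await compactHistory(history, { llm, model: 'ollama/llama3' });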

This is the pragmatic M2.5: no mana-llm changes. The "tier" abstraction
(X-Model-Tier header, env-routed aliases) from the Claude-Code report
only makes sense once multiple utility tasks need cheaper routing:
topic detection, classification, command-injection checks. Today only
the compactor wants it, and a model constant is the simplest contract
that works.
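For contrast, a rough sketch of the deferred tier shape (hypothetical
names; nothing here exists in mana-llm today):

    // Env-routed aliases resolved behind an X-Model-Tier header:
    type ModelTier = 'primary' | 'fast';
    const TIER_ROUTES: Record<ModelTier, string> = {
      primary: 'google/gemini-2.5-flash',    // reasoning + tool-calling
      fast: 'google/gemini-2.5-flash-lite',  // summarisation, classification
    };
    // Worth building once several utility tasks need routing; today one
    // exported constant covers the single consumer.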

2 new tests (default applied + override honoured). 79 shared-ai tests
green; all three consumers type-check clean. One pre-existing, unrelated
type error in apps/mana/apps/web/src/lib/modules/wardrobe/queries.ts
remains (not touched by this commit).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Till JS 2026-04-23 18:26:50 +02:00
parent 2769241de3
commit f7536bc0b9
7 changed files with 83 additions and 16 deletions


@@ -88,6 +88,7 @@ export {
   compactHistory,
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   MockLlmClient,
   parseCompactSummary,


@@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest';
 import {
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   compactHistory,
   parseCompactSummary,
@@ -191,6 +192,46 @@ describe('compactHistory', () => {
     expect(res.usage).toEqual({ promptTokens: 100, completionTokens: 30 });
   });
 
+  it('defaults to DEFAULT_COMPACT_MODEL when model is omitted (fast-tier routing)', async () => {
+    const history = buildHistory(3, 4);
+    const seenModels: string[] = [];
+    const capturingLlm = {
+      async complete(req: { model: string }) {
+        seenModels.push(req.model);
+        return {
+          content: '## Goal\n\n## Decisions\n\n## Tools Called\n\n## Current Progress\n',
+          toolCalls: [],
+          finishReason: 'stop' as const,
+        };
+      },
+    };
+
+    await compactHistory(history, { llm: capturingLlm }); // no explicit model
+
+    expect(seenModels).toHaveLength(1);
+    expect(seenModels[0]).toBe(DEFAULT_COMPACT_MODEL);
+    expect(DEFAULT_COMPACT_MODEL).toBe('google/gemini-2.5-flash-lite');
+  });
+
+  it('honours an explicit model override', async () => {
+    const history = buildHistory(3, 4);
+    const seenModels: string[] = [];
+    const capturingLlm = {
+      async complete(req: { model: string }) {
+        seenModels.push(req.model);
+        return {
+          content: '## Goal\n\n## Decisions\n\n## Tools Called\n\n## Current Progress\n',
+          toolCalls: [],
+          finishReason: 'stop' as const,
+        };
+      },
+    };
+
+    await compactHistory(history, { llm: capturingLlm, model: 'custom/override-model' });
+
+    expect(seenModels[0]).toBe('custom/override-model');
+  });
+
   it('respects a custom keepRecent value', async () => {
     const history = buildHistory(5, 6);
     const llm = new MockLlmClient().enqueueStop('## Goal\n\n## Decisions\n');


@@ -37,6 +37,21 @@ export const DEFAULT_COMPACT_THRESHOLD = 0.92;
  * should stay intact for coherence. */
 export const DEFAULT_COMPACT_KEEP_RECENT = 4;
 
+/**
+ * Cheap "fast-tier" model the compactor runs on by default. Matches
+ * Claude Code's pattern of routing utility tasks (summarisation,
+ * topic detection, session-summary) to Haiku instead of burning the
+ * primary-tier budget on them.
+ *
+ * google/gemini-2.5-flash-lite is ~3–5x cheaper than gemini-2.5-flash
+ * with near-identical summarisation quality. Consumers that need
+ * something different (cost policy, offline fallback to Ollama) can
+ * override per-call via `CompactHistoryOptions.model`.
+ *
+ * Format follows mana-llm's `provider/model` convention.
+ */
+export const DEFAULT_COMPACT_MODEL = 'google/gemini-2.5-flash-lite';
+
 /**
  * Decide whether to compact based on token usage against a ceiling.
  * Returns false on missing inputs so the caller can skip silently when
@@ -122,7 +137,11 @@ export function renderCompactSummary(s: CompactSummary): string {
 export interface CompactHistoryOptions {
   readonly llm: LlmClient;
-  readonly model: string;
+  /** Model to summarise with. Defaults to `DEFAULT_COMPACT_MODEL`
+   * (gemini-2.5-flash-lite), cheaper than the primary planner
+   * model, which is the whole point: summarisation doesn't need
+   * the same tier as reasoning + tool-calling. */
+  readonly model?: string;
   /** How many most-recent turns to preserve verbatim. Default 4. */
   readonly keepRecent?: number;
   /** Upper bound on compactor-LLM temperature: we want summarisation,
@@ -222,7 +241,7 @@ export async function compactHistory(
   const response = await opts.llm.complete({
     messages: compactRequestMessages,
     tools: [],
-    model: opts.model,
+    model: opts.model ?? DEFAULT_COMPACT_MODEL,
     temperature: opts.temperature ?? 0.2,
   });


@@ -13,6 +13,7 @@ export { runPlannerLoop, LOOP_STATE_RECENT_CALLS_WINDOW, PARALLEL_TOOL_BATCH_SIZ
 export {
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   compactHistory,
   parseCompactSummary,