diff --git a/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts b/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts
index 4bfee83b3..72c0c7d4e 100644
--- a/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts
+++ b/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts
@@ -281,16 +281,15 @@ async function runMissionInner(
     // prior steps in the same round.
     isParallelSafe: (name) => AI_TOOL_CATALOG_BY_NAME.get(name)?.defaultPolicy === 'auto',
     // Fold older turns into a compact-summary at 92% of
-    // maxContextTokens. Same LlmClient + model as the
-    // planner; one extra LLM call, but only when usage
-    // actually approaches the ceiling.
+    // maxContextTokens. compactHistory defaults to
+    // DEFAULT_COMPACT_MODEL (gemini-2.5-flash-lite) —
+    // cheaper than the planner's primary model, which
+    // matters because the compactor fires exactly when
+    // token spend is highest.
     compactor: {
       maxContextTokens: COMPACT_MAX_CTX,
       compact: async (msgs) => {
-        const res = await compactHistory(msgs, {
-          llm: deps.llm,
-          model: deps.model ?? 'google/gemini-2.5-flash',
-        });
+        const res = await compactHistory(msgs, { llm: deps.llm });
         return { messages: res.messages, compactedTurns: res.compactedTurns };
       },
     },
diff --git a/apps/mana/apps/web/src/lib/modules/companion/engine.ts b/apps/mana/apps/web/src/lib/modules/companion/engine.ts
index acdcff3dd..a3693914d 100644
--- a/apps/mana/apps/web/src/lib/modules/companion/engine.ts
+++ b/apps/mana/apps/web/src/lib/modules/companion/engine.ts
@@ -123,12 +123,14 @@ export async function runCompanionChat(
     // user-visible intent order in the proposal inbox.
     isParallelSafe: (name) => AI_TOOL_CATALOG_BY_NAME.get(name)?.defaultPolicy === 'auto',
     // Fold the middle of messages into a compact-summary at
-    // 92% of the model's context window. Mirrors the mana-ai
-    // wiring; one call to the same LLM client, same model.
+    // 92% of the model's context window. compactHistory
+    // defaults to DEFAULT_COMPACT_MODEL (gemini-2.5-flash-lite)
+    // — cheaper than the planner's own model. Summarisation
+    // doesn't need the same tier as reasoning.
     compactor: {
       maxContextTokens: COMPACT_MAX_CTX,
       compact: async (msgs) => {
-        const res = await compactHistory(msgs, { llm, model: 'google/gemini-2.5-flash' });
+        const res = await compactHistory(msgs, { llm });
         return { messages: res.messages, compactedTurns: res.compactedTurns };
       },
     },
diff --git a/packages/shared-ai/src/index.ts b/packages/shared-ai/src/index.ts
index 31f322348..e62d4aaff 100644
--- a/packages/shared-ai/src/index.ts
+++ b/packages/shared-ai/src/index.ts
@@ -88,6 +88,7 @@ export {
   compactHistory,
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   MockLlmClient,
   parseCompactSummary,
diff --git a/packages/shared-ai/src/planner/compact.test.ts b/packages/shared-ai/src/planner/compact.test.ts
index cbe858ae4..eab4d1389 100644
--- a/packages/shared-ai/src/planner/compact.test.ts
+++ b/packages/shared-ai/src/planner/compact.test.ts
@@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest';
 import {
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   compactHistory,
   parseCompactSummary,
@@ -191,6 +192,46 @@ describe('compactHistory', () => {
     expect(res.usage).toEqual({ promptTokens: 100, completionTokens: 30 });
   });
 
+  it('defaults to DEFAULT_COMPACT_MODEL when model is omitted (fast-tier routing)', async () => {
+    const history = buildHistory(3, 4);
+    const seenModels: string[] = [];
+    const capturingLlm = {
+      async complete(req: { model: string }) {
+        seenModels.push(req.model);
+        return {
+          content: '## Goal\n\n## Decisions\n\n## Tools Called\n\n## Current Progress\n',
+          toolCalls: [],
+          finishReason: 'stop' as const,
+        };
+      },
+    };
+
+    await compactHistory(history, { llm: capturingLlm }); // no explicit model
+
+    expect(seenModels).toHaveLength(1);
+    expect(seenModels[0]).toBe(DEFAULT_COMPACT_MODEL);
+    expect(DEFAULT_COMPACT_MODEL).toBe('google/gemini-2.5-flash-lite');
+  });
+
+  it('honours an explicit model override', async () => {
+    const history = buildHistory(3, 4);
+    const seenModels: string[] = [];
+    const capturingLlm = {
+      async complete(req: { model: string }) {
+        seenModels.push(req.model);
+        return {
+          content: '## Goal\n\n## Decisions\n\n## Tools Called\n\n## Current Progress\n',
+          toolCalls: [],
+          finishReason: 'stop' as const,
+        };
+      },
+    };
+
+    await compactHistory(history, { llm: capturingLlm, model: 'custom/override-model' });
+
+    expect(seenModels[0]).toBe('custom/override-model');
+  });
+
   it('respects a custom keepRecent value', async () => {
     const history = buildHistory(5, 6);
     const llm = new MockLlmClient().enqueueStop('## Goal\n\n## Decisions\n');
diff --git a/packages/shared-ai/src/planner/compact.ts b/packages/shared-ai/src/planner/compact.ts
index 57f4acdf4..7e014eaa1 100644
--- a/packages/shared-ai/src/planner/compact.ts
+++ b/packages/shared-ai/src/planner/compact.ts
@@ -37,6 +37,21 @@ export const DEFAULT_COMPACT_THRESHOLD = 0.92;
  * should stay intact for coherence.
  */
 export const DEFAULT_COMPACT_KEEP_RECENT = 4;
+/**
+ * Cheap "fast-tier" model the compactor runs on by default. Matches
+ * Claude Code's pattern of routing utility tasks (summarisation,
+ * topic detection, session-summary) to Haiku instead of burning the
+ * primary-tier budget on them.
+ *
+ * google/gemini-2.5-flash-lite is ~3–5x cheaper than gemini-2.5-flash
+ * with near-identical summarisation quality. Consumers that need
+ * something different (cost policy, offline fallback to Ollama) can
+ * override per-call via `CompactHistoryOptions.model`.
+ *
+ * Format follows mana-llm's `provider/model` convention.
+ */
+export const DEFAULT_COMPACT_MODEL = 'google/gemini-2.5-flash-lite';
+
 /**
  * Decide whether to compact based on token usage against a ceiling.
  * Returns false on missing inputs so the caller can skip silently when
@@ -122,7 +137,11 @@ export function renderCompactSummary(s: CompactSummary): string {
 
 export interface CompactHistoryOptions {
   readonly llm: LlmClient;
-  readonly model: string;
+  /** Model to summarise with. Defaults to `DEFAULT_COMPACT_MODEL`
+   * (gemini-2.5-flash-lite) — cheaper than the primary planner
+   * model, which is the whole point: summarisation doesn't need
+   * the same tier as reasoning + tool-calling. */
+  readonly model?: string;
   /** How many most-recent turns to preserve verbatim. Default 4. */
   readonly keepRecent?: number;
   /** Upper bound on compactor-LLM temperature — we want summarisation,
@@ -222,7 +241,7 @@ export async function compactHistory(
   const response = await opts.llm.complete({
     messages: compactRequestMessages,
     tools: [],
-    model: opts.model,
+    model: opts.model ?? DEFAULT_COMPACT_MODEL,
     temperature: opts.temperature ?? 0.2,
   });
 
diff --git a/packages/shared-ai/src/planner/index.ts b/packages/shared-ai/src/planner/index.ts
index eab08651c..8b36654c3 100644
--- a/packages/shared-ai/src/planner/index.ts
+++ b/packages/shared-ai/src/planner/index.ts
@@ -13,6 +13,7 @@ export { runPlannerLoop, LOOP_STATE_RECENT_CALLS_WINDOW, PARALLEL_TOOL_BATCH_SIZ
 export {
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   compactHistory,
   parseCompactSummary,
diff --git a/services/mana-ai/src/cron/tick.ts b/services/mana-ai/src/cron/tick.ts
index 823c55533..94087c0c6 100644
--- a/services/mana-ai/src/cron/tick.ts
+++ b/services/mana-ai/src/cron/tick.ts
@@ -396,15 +396,19 @@ async function planOneMission(
   const plannerModel = 'google/gemini-2.5-flash';
 
   // Claude-Code wU2 pattern: fold the middle of messages into a structured
-  // summary once cumulative tokens cross 92% of maxContextTokens. Uses
-  // the same LLM + model as the planner itself; later we can route this
-  // to a cheaper model (Haiku tier) when mana-llm supports it.
+  // summary once cumulative tokens cross 92% of maxContextTokens.
+  //
+  // compactHistory defaults to DEFAULT_COMPACT_MODEL
+  // (gemini-2.5-flash-lite) — cheaper than the planner's own model.
+  // Summarisation doesn't need the same reasoning tier as tool-calling,
+  // and the compactor runs exactly when token spend is highest, so the
+  // cheaper route saves tokens where they matter.
   const compactor =
     config.compactMaxContextTokens > 0
      ? {
          maxContextTokens: config.compactMaxContextTokens,
          compact: async (msgs: Parameters<typeof compactHistory>[0]) => {
-            const result = await compactHistory(msgs, { llm, model: plannerModel });
+            const result = await compactHistory(msgs, { llm });
            if (result.compactedTurns > 0) {
              compactionsTriggeredTotal.inc();
              compactedTurnsHistogram.observe(result.compactedTurns);
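
A minimal sketch of the consumer-side contract after this patch, for reviewers.
The import specifier '@mana/shared-ai' and the buildCompactor helper are
assumptions for illustration, not part of the diff; only compactHistory,
DEFAULT_COMPACT_MODEL, LlmClient, and the now-optional
CompactHistoryOptions.model appear in the changed files.

// Sketch, not the shipped wiring. Assumed: the package publishes as
// '@mana/shared-ai' and re-exports LlmClient; buildCompactor is hypothetical.
import { compactHistory, DEFAULT_COMPACT_MODEL, type LlmClient } from '@mana/shared-ai';

function buildCompactor(llm: LlmClient, maxContextTokens: number) {
  return {
    maxContextTokens,
    compact: async (msgs: Parameters<typeof compactHistory>[0]) => {
      // No `model` key: compactHistory routes to DEFAULT_COMPACT_MODEL
      // ('google/gemini-2.5-flash-lite') internally.
      const res = await compactHistory(msgs, { llm });
      // Cost-policy override (this model id is illustrative only):
      // const res = await compactHistory(msgs, { llm, model: 'ollama/llama3.1' });
      return { messages: res.messages, compactedTurns: res.compactedTurns };
    },
  };
}

Dropping the `model` argument at every call site, rather than threading
plannerModel through, is what lets the fast-tier default change in one place
(DEFAULT_COMPACT_MODEL) later.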