mirror of https://github.com/Memo-2023/mana-monorepo.git (synced 2026-05-14 20:01:09 +02:00)
feat(shared-ai): route compactor to Haiku-tier model by default (M2.5)
compactHistory() now defaults to DEFAULT_COMPACT_MODEL = 'google/gemini-2.5-flash-lite' when the caller doesn't override. Lite is ~3–5x cheaper than gemini-2.5-flash with near-identical summarisation quality: summarisation doesn't need the same tier as reasoning plus tool-calling, and the compactor fires exactly when token spend is highest, so the cheaper route saves where it matters most.

CompactHistoryOptions.model is now optional. All three consumers (mana-ai tick, webapp Companion, webapp Mission runner) drop their explicit gemini-2.5-flash override and let the default apply.

This is the pragmatic M2.5: no mana-llm changes. The "tier" abstraction (X-Model-Tier header, env-routed aliases) from the Claude-Code report makes sense only once multiple utility tasks need cheaper routing (topic detection, classification, command-injection checks). Today only the compactor wants it, and a model constant is the simplest contract that works.

Tests: 2 new (default applied, override honoured); 79 shared-ai tests green, and all three consumers type-check clean. One pre-existing, unrelated type error remains in apps/mana/apps/web/src/lib/modules/wardrobe/queries.ts (not touched by this commit).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
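Caller-side contract, sketched below. The import path '@mana/shared-ai' and the foldHistory wrapper are illustrative assumptions, not repo code; compactHistory, DEFAULT_COMPACT_MODEL, and LlmClient are the real names this commit touches.

    // Illustrative sketch only: import path, wrapper name, and the
    // assumption that LlmClient is exported are not confirmed by this repo.
    import { compactHistory, DEFAULT_COMPACT_MODEL, type LlmClient } from '@mana/shared-ai';

    type Msgs = Parameters<typeof compactHistory>[0];

    export async function foldHistory(llm: LlmClient, msgs: Msgs) {
      // No model passed: compaction routes to DEFAULT_COMPACT_MODEL
      // ('google/gemini-2.5-flash-lite') instead of the planner's model.
      const res = await compactHistory(msgs, { llm });

      // Per-call override remains available for consumers with their own cost
      // policy: compactHistory(msgs, { llm, model: 'google/gemini-2.5-flash' })
      return { messages: res.messages, compactedTurns: res.compactedTurns };
    }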
parent 2769241de3
commit f7536bc0b9
7 changed files with 83 additions and 16 deletions
@@ -281,16 +281,15 @@ async function runMissionInner(
     // prior steps in the same round.
     isParallelSafe: (name) => AI_TOOL_CATALOG_BY_NAME.get(name)?.defaultPolicy === 'auto',
     // Fold older turns into a compact-summary at 92% of
-    // maxContextTokens. Same LlmClient + model as the
-    // planner; one extra LLM call, but only when usage
-    // actually approaches the ceiling.
+    // maxContextTokens. compactHistory defaults to
+    // DEFAULT_COMPACT_MODEL (gemini-2.5-flash-lite) —
+    // cheaper than the planner's primary model, which
+    // matters because the compactor fires exactly when
+    // token spend is highest.
     compactor: {
       maxContextTokens: COMPACT_MAX_CTX,
       compact: async (msgs) => {
-        const res = await compactHistory(msgs, {
-          llm: deps.llm,
-          model: deps.model ?? 'google/gemini-2.5-flash',
-        });
+        const res = await compactHistory(msgs, { llm: deps.llm });
         return { messages: res.messages, compactedTurns: res.compactedTurns };
       },
     },
@@ -123,12 +123,14 @@ export async function runCompanionChat(
     // user-visible intent order in the proposal inbox.
     isParallelSafe: (name) => AI_TOOL_CATALOG_BY_NAME.get(name)?.defaultPolicy === 'auto',
     // Fold the middle of messages into a compact-summary at
-    // 92% of the model's context window. Mirrors the mana-ai
-    // wiring; one call to the same LLM client, same model.
+    // 92% of the model's context window. compactHistory
+    // defaults to DEFAULT_COMPACT_MODEL (gemini-2.5-flash-lite)
+    // — cheaper than the planner's own model. Summarisation
+    // doesn't need the same tier as reasoning.
     compactor: {
       maxContextTokens: COMPACT_MAX_CTX,
       compact: async (msgs) => {
-        const res = await compactHistory(msgs, { llm, model: 'google/gemini-2.5-flash' });
+        const res = await compactHistory(msgs, { llm });
         return { messages: res.messages, compactedTurns: res.compactedTurns };
       },
     },
@@ -88,6 +88,7 @@ export {
   compactHistory,
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   MockLlmClient,
   parseCompactSummary,
@@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest';
 import {
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   compactHistory,
   parseCompactSummary,
@@ -191,6 +192,46 @@ describe('compactHistory', () => {
     expect(res.usage).toEqual({ promptTokens: 100, completionTokens: 30 });
   });

+  it('defaults to DEFAULT_COMPACT_MODEL when model is omitted (fast-tier routing)', async () => {
+    const history = buildHistory(3, 4);
+    const seenModels: string[] = [];
+    const capturingLlm = {
+      async complete(req: { model: string }) {
+        seenModels.push(req.model);
+        return {
+          content: '## Goal\n\n## Decisions\n\n## Tools Called\n\n## Current Progress\n',
+          toolCalls: [],
+          finishReason: 'stop' as const,
+        };
+      },
+    };
+
+    await compactHistory(history, { llm: capturingLlm }); // no explicit model
+
+    expect(seenModels).toHaveLength(1);
+    expect(seenModels[0]).toBe(DEFAULT_COMPACT_MODEL);
+    expect(DEFAULT_COMPACT_MODEL).toBe('google/gemini-2.5-flash-lite');
+  });
+
+  it('honours an explicit model override', async () => {
+    const history = buildHistory(3, 4);
+    const seenModels: string[] = [];
+    const capturingLlm = {
+      async complete(req: { model: string }) {
+        seenModels.push(req.model);
+        return {
+          content: '## Goal\n\n## Decisions\n\n## Tools Called\n\n## Current Progress\n',
+          toolCalls: [],
+          finishReason: 'stop' as const,
+        };
+      },
+    };
+
+    await compactHistory(history, { llm: capturingLlm, model: 'custom/override-model' });
+
+    expect(seenModels[0]).toBe('custom/override-model');
+  });
+
   it('respects a custom keepRecent value', async () => {
     const history = buildHistory(5, 6);
     const llm = new MockLlmClient().enqueueStop('## Goal\n\n## Decisions\n');
@@ -37,6 +37,21 @@ export const DEFAULT_COMPACT_THRESHOLD = 0.92;
  * should stay intact for coherence. */
 export const DEFAULT_COMPACT_KEEP_RECENT = 4;

+/**
+ * Cheap "fast-tier" model the compactor runs on by default. Matches
+ * Claude Code's pattern of routing utility tasks (summarisation,
+ * topic detection, session-summary) to Haiku instead of burning the
+ * primary-tier budget on them.
+ *
+ * google/gemini-2.5-flash-lite is ~3–5x cheaper than gemini-2.5-flash
+ * with near-identical summarisation quality. Consumers that need
+ * something different (cost policy, offline fallback to Ollama) can
+ * override per-call via `CompactHistoryOptions.model`.
+ *
+ * Format follows mana-llm's `provider/model` convention.
+ */
+export const DEFAULT_COMPACT_MODEL = 'google/gemini-2.5-flash-lite';
+
 /**
  * Decide whether to compact based on token usage against a ceiling.
  * Returns false on missing inputs so the caller can skip silently when
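For illustration, the per-call escape hatch the docblock above describes. The Ollama model id and the localLlm variable are hypothetical, not confirmed by this repo:

    // Hypothetical consumer pinning a local model instead of the default;
    // 'ollama/llama3.1' merely follows mana-llm's provider/model convention.
    const res = await compactHistory(msgs, { llm: localLlm, model: 'ollama/llama3.1' });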
@@ -122,7 +137,11 @@ export function renderCompactSummary(s: CompactSummary): string {

 export interface CompactHistoryOptions {
   readonly llm: LlmClient;
-  readonly model: string;
+  /** Model to summarise with. Defaults to `DEFAULT_COMPACT_MODEL`
+   * (gemini-2.5-flash-lite) — cheaper than the primary planner
+   * model, which is the whole point: summarisation doesn't need
+   * the same tier as reasoning + tool-calling. */
+  readonly model?: string;
   /** How many most-recent turns to preserve verbatim. Default 4. */
   readonly keepRecent?: number;
   /** Upper bound on compactor-LLM temperature — we want summarisation,
@@ -222,7 +241,7 @@ export async function compactHistory(
   const response = await opts.llm.complete({
     messages: compactRequestMessages,
     tools: [],
-    model: opts.model,
+    model: opts.model ?? DEFAULT_COMPACT_MODEL,
     temperature: opts.temperature ?? 0.2,
   });
@@ -13,6 +13,7 @@ export { runPlannerLoop, LOOP_STATE_RECENT_CALLS_WINDOW, PARALLEL_TOOL_BATCH_SIZ
 export {
   COMPACT_SYSTEM_PROMPT,
   DEFAULT_COMPACT_KEEP_RECENT,
+  DEFAULT_COMPACT_MODEL,
   DEFAULT_COMPACT_THRESHOLD,
   compactHistory,
   parseCompactSummary,
@@ -396,15 +396,19 @@ async function planOneMission(
   const plannerModel = 'google/gemini-2.5-flash';

   // Claude-Code wU2 pattern: fold the middle of messages into a structured
-  // summary once cumulative tokens cross 92% of maxContextTokens. Uses
-  // the same LLM + model as the planner itself; later we can route this
-  // to a cheaper model (Haiku tier) when mana-llm supports it.
+  // summary once cumulative tokens cross 92% of maxContextTokens.
+  //
+  // compactHistory defaults to DEFAULT_COMPACT_MODEL
+  // (gemini-2.5-flash-lite) — cheaper than the planner's own model.
+  // Summarisation doesn't need the same reasoning tier as tool-calling,
+  // and the compactor runs exactly when token spend is highest, so the
+  // cheaper route saves tokens where they matter.
   const compactor =
     config.compactMaxContextTokens > 0
       ? {
           maxContextTokens: config.compactMaxContextTokens,
           compact: async (msgs: Parameters<typeof compactHistory>[0]) => {
-            const result = await compactHistory(msgs, { llm, model: plannerModel });
+            const result = await compactHistory(msgs, { llm });
             if (result.compactedTurns > 0) {
               compactionsTriggeredTotal.inc();
               compactedTurnsHistogram.observe(result.compactedTurns);