diff --git a/packages/shared-ai/src/planner/index.ts b/packages/shared-ai/src/planner/index.ts index 3c36de0e7..ae837a314 100644 --- a/packages/shared-ai/src/planner/index.ts +++ b/packages/shared-ai/src/planner/index.ts @@ -9,7 +9,7 @@ export type { AiPlanInput, AiPlanOutput, AvailableTool, PlannedStep, ResolvedInp // coexist within the atomic PR. export { buildSystemPrompt } from './system-prompt'; export type { SystemPromptInput, SystemPromptOutput } from './system-prompt'; -export { runPlannerLoop } from './loop'; +export { runPlannerLoop, LOOP_STATE_RECENT_CALLS_WINDOW, PARALLEL_TOOL_BATCH_SIZE } from './loop'; export { MockLlmClient } from './mock-llm'; export type { MockLlmTurn } from './mock-llm'; export type { diff --git a/packages/shared-ai/src/planner/loop.test.ts b/packages/shared-ai/src/planner/loop.test.ts index 1d535b6ae..82263a0e9 100644 --- a/packages/shared-ai/src/planner/loop.test.ts +++ b/packages/shared-ai/src/planner/loop.test.ts @@ -396,6 +396,62 @@ describe('runPlannerLoop — reminderChannel', () => { expect(reminders[0].content).toBe('round-2'); }); + it('exposes recentCalls as a sliding window, oldest-first', async () => { + // 7 rounds, each with one tool call, so by round 7 we have 6 prior + // results — the window must cap at LOOP_STATE_RECENT_CALLS_WINDOW = 5. 
+ const llm = new MockLlmClient(); + for (let i = 0; i < 7; i++) { + llm.enqueueToolCalls([{ name: 'list_things', args: { i } }]); + } + llm.enqueueStop(); + + const windowsSeen: Array> = []; + await runPlannerLoop({ + llm, + input: { + systemPrompt: 's', + userPrompt: 'u', + tools, + model: 'm', + maxRounds: 10, + reminderChannel: (state) => { + windowsSeen.push( + state.recentCalls.map((ec) => ({ + i: ec.call.arguments.i, + ok: ec.result.success, + })) + ); + return []; + }, + }, + onToolCall: async (call) => ({ + success: true, + message: `ok-${call.arguments.i}`, + }), + }); + + // Round 1 → window empty + expect(windowsSeen[0]).toEqual([]); + // Round 2 → one prior call + expect(windowsSeen[1]).toEqual([{ i: 0, ok: true }]); + // Round 6 → five prior calls, oldest-first + expect(windowsSeen[5]).toEqual([ + { i: 0, ok: true }, + { i: 1, ok: true }, + { i: 2, ok: true }, + { i: 3, ok: true }, + { i: 4, ok: true }, + ]); + // Round 7 → window slides; i=0 drops off, i=5 is newest + expect(windowsSeen[6]).toEqual([ + { i: 1, ok: true }, + { i: 2, ok: true }, + { i: 3, ok: true }, + { i: 4, ok: true }, + { i: 5, ok: true }, + ]); + }); + it('surfaces loop state — toolCallCount and lastCall — to the channel', async () => { const llm = new MockLlmClient() .enqueueToolCalls([{ name: 'list_things', args: {} }]) diff --git a/packages/shared-ai/src/planner/loop.ts b/packages/shared-ai/src/planner/loop.ts index afd6648f2..747cb247f 100644 --- a/packages/shared-ai/src/planner/loop.ts +++ b/packages/shared-ai/src/planner/loop.ts @@ -69,6 +69,12 @@ export interface LlmClient { // ─── Loop input / result ──────────────────────────────────────────── +/** Sliding-window size for `LoopState.recentCalls`. Capped so the + * reminder channel stays cheap and hint-producers can only reason + * over the last handful of calls, which is what retry-loop-style + * heuristics need. 
*/ +export const LOOP_STATE_RECENT_CALLS_WINDOW = 5; + /** * Transient loop state surfaced to the reminderChannel. The reminder * callback is pure — it reads this snapshot and returns hints; it does @@ -86,6 +92,14 @@ export interface LoopState { /** The most recent ExecutedCall, or undefined in round 1. Handy for * "the last tool failed — warn the LLM" producers. */ readonly lastCall?: ExecutedCall; + /** + * Sliding window of the last N (= `LOOP_STATE_RECENT_CALLS_WINDOW`) + * ExecutedCalls in source order, oldest first. Used by producers + * that need more than the single-last signal — retry-loop detection + * (N consecutive failures), burst detection (many calls to the same + * tool), and similar. Empty in round 1; grows up to the cap. + */ + readonly recentCalls: readonly ExecutedCall[]; } /** @@ -202,6 +216,7 @@ export async function runPlannerLoop(opts: { // — the reminders are ephemeral steering, not conversation. let requestMessages: readonly ChatMessage[] = messages; if (input.reminderChannel) { + const recentCalls = executedCalls.slice(-LOOP_STATE_RECENT_CALLS_WINDOW); const state: LoopState = { round: rounds, toolCallCount: executedCalls.length, @@ -211,6 +226,7 @@ export async function runPlannerLoop(opts: { totalTokens: promptTokens + completionTokens, }, lastCall: executedCalls[executedCalls.length - 1], + recentCalls, }; const reminders = input.reminderChannel(state); if (reminders.length > 0) { diff --git a/services/mana-ai/CLAUDE.md b/services/mana-ai/CLAUDE.md index 1b27ac918..7d083565b 100644 --- a/services/mana-ai/CLAUDE.md +++ b/services/mana-ai/CLAUDE.md @@ -101,8 +101,8 @@ Details zum Deep-Research-Flow: [`docs/reports/gemini-deep-research.md`](../../d Claude-Code-inspirierte Primitive in `runPlannerLoop` (live in `@mana/shared-ai`, siehe [`docs/plans/agent-loop-improvements-m1.md`](../../docs/plans/agent-loop-improvements-m1.md)) und deren Konsumierung hier: -- [x] `reminderChannel` wired via `buildReminderChannel()` in 
`src/planner/reminders.ts`. Erster Live-Producer: `tokenBudgetReminder` — warnt ab 75% Tagesbudget, eskaliert ab 100% mit "JETZT abschliessen"-Prompt. Round-usage wird on-the-fly drauf addiert, so dass der Warn-Level mitwandert. -- [x] `retryLoopReminder` — Shape fertig, aber dormant: LoopState exponiert heute nur `lastCall`, nicht ein Failure-Window. Aktiviert automatisch sobald shared-ai LoopState um `recentResults` erweitert. +- [x] `reminderChannel` wired via `buildReminderChannel()` in `src/planner/reminders.ts`. Live-Producer 1: `tokenBudgetReminder` — warnt ab 75% Tagesbudget, eskaliert ab 100% mit "JETZT abschliessen"-Prompt. Round-usage wird on-the-fly drauf addiert, so dass der Warn-Level mitwandert. +- [x] `retryLoopReminder` live — feuert ab Round 3 wenn die letzten 2 Tool-Calls beide fehlschlugen. Liest das `recentCalls`-Sliding-Window (5 Einträge, oldest-first) aus `LoopState`. - [x] `POLICY_MODE` env (off/log-only/enforce, default log-only) für die mana-ai-seitige Freitext-Inspection (`detectInjectionMarker`). Rate-Limit und destructive-opt-in sind hier NICHT aktiv — tools werden nur als PlanSteps aufgezeichnet, die echte Enforcement passiert im Webapp-Client. - [ ] Parallel-Reads im Server-Tick haben keinen Effekt, weil `SERVER_TOOLS` per Konstruktion propose-only ist. Könnte relevant werden sobald mana-ai die vollständige tool-registry absorbiert (M4 des Personas-Plans). 
diff --git a/services/mana-ai/src/planner/reminders.test.ts b/services/mana-ai/src/planner/reminders.test.ts index 1bcabcff3..363cf7a66 100644 --- a/services/mana-ai/src/planner/reminders.test.ts +++ b/services/mana-ai/src/planner/reminders.test.ts @@ -49,10 +49,26 @@ function makeState(overrides: Partial = {}): LoopState { round: 1, toolCallCount: 0, usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 }, + recentCalls: [], ...overrides, }; } +function mkExecutedCall( + success: boolean, + toolName = 'create_thing' +): { + round: number; + call: { id: string; name: string; arguments: Record }; + result: { success: boolean; message: string }; +} { + return { + round: 1, + call: { id: crypto.randomUUID(), name: toolName, arguments: {} }, + result: { success, message: success ? 'ok' : 'boom' }, + }; +} + // ─── tokenBudgetReminder ────────────────────────────────────────── describe('tokenBudgetReminder', () => { @@ -121,21 +137,50 @@ describe('tokenBudgetReminder', () => { describe('retryLoopReminder', () => { it('is silent before round 3', () => { - expect(retryLoopReminder({ round: 2, lastFailures: [true, true] })).toBeNull(); + expect( + retryLoopReminder({ + round: 2, + recentCalls: [mkExecutedCall(false), mkExecutedCall(false)], + }) + ).toBeNull(); }); it('warns when the last 2 calls failed at round >= 3', () => { - const msg = retryLoopReminder({ round: 3, lastFailures: [true, true] }); + const msg = retryLoopReminder({ + round: 3, + recentCalls: [mkExecutedCall(false), mkExecutedCall(false)], + }); expect(msg).not.toBeNull(); expect(msg).toContain('fehlgeschlagen'); }); it('stays silent when only one of the last 2 failed', () => { - expect(retryLoopReminder({ round: 4, lastFailures: [false, true] })).toBeNull(); + expect( + retryLoopReminder({ + round: 4, + recentCalls: [mkExecutedCall(true), mkExecutedCall(false)], + }) + ).toBeNull(); }); - it('stays silent with fewer than 2 failures recorded', () => { - expect(retryLoopReminder({ round: 5, 
lastFailures: [true] })).toBeNull(); + it('stays silent with fewer than 2 calls recorded', () => { + expect(retryLoopReminder({ round: 5, recentCalls: [mkExecutedCall(false)] })).toBeNull(); + }); + + it('looks only at the TAIL 2 — a flaky run with intermittent success is not a retry loop', () => { + // 5 calls: F, F, F, OK, F → tail-2 is [OK, F] → silent + expect( + retryLoopReminder({ + round: 5, + recentCalls: [ + mkExecutedCall(false), + mkExecutedCall(false), + mkExecutedCall(false), + mkExecutedCall(true), + mkExecutedCall(false), + ], + }) + ).toBeNull(); }); }); @@ -164,6 +209,40 @@ describe('buildReminderChannel', () => { expect(out[0]).toContain('90%'); }); + it('fires retryLoopReminder end-to-end through the channel', () => { + const channel = buildReminderChannel({ + agent: makeAgent({ maxTokensPerDay: 1_000_000 }), // budget silent + mission: makeMission(), + pretickUsage24h: 0, + }); + const out = channel( + makeState({ + round: 4, + recentCalls: [mkExecutedCall(false), mkExecutedCall(false)], + }) + ); + expect(out).toHaveLength(1); + expect(out[0]).toContain('fehlgeschlagen'); + }); + + it('can fire budget + retry together (composition)', () => { + const channel = buildReminderChannel({ + agent: makeAgent({ maxTokensPerDay: 10_000 }), + mission: makeMission(), + pretickUsage24h: 9_000, + }); + const out = channel( + makeState({ + round: 3, + usage: { promptTokens: 500, completionTokens: 500, totalTokens: 1_000 }, + recentCalls: [mkExecutedCall(false), mkExecutedCall(false)], + }) + ); + expect(out).toHaveLength(2); + expect(out[0]).toContain('ausgeschoepft'); // budget first + expect(out[1]).toContain('fehlgeschlagen'); // retry second + }); + it('uses the updated totalTokens each round (re-evaluated)', () => { const channel = buildReminderChannel({ agent: makeAgent({ maxTokensPerDay: 10_000 }), diff --git a/services/mana-ai/src/planner/reminders.ts b/services/mana-ai/src/planner/reminders.ts index 2036d214a..e9e39ccd6 100644 --- 
a/services/mana-ai/src/planner/reminders.ts +++ b/services/mana-ai/src/planner/reminders.ts @@ -69,17 +69,22 @@ export function tokenBudgetReminder(ctx: ReminderContext, roundUsage: number): s /** * Nudge the planner to end when it is clearly iterating without new * information: 3+ rounds in and the last 2 tool-calls returned - * `success: false`. This is a heuristic guard against infinite re-try - * loops where the LLM keeps calling the same failing tool with slightly - * different arguments. + * `success: false`. Heuristic guard against infinite retry loops where + * the LLM keeps calling the same failing tool with slightly different + * arguments. + * + * Reads the `recentCalls` sliding window from LoopState — at most the + * last 5 executed calls, oldest first. Only the tail 2 are inspected: + * a run that mixes failures and successes is not a true retry loop, + * just a set of flaky tools. */ export function retryLoopReminder(state: { readonly round: number; - readonly lastFailures: readonly boolean[]; + readonly recentCalls: readonly { readonly result: { readonly success: boolean } }[]; }): string | null { if (state.round < 3) return null; - const recent = state.lastFailures.slice(-2); - if (recent.length === 2 && recent.every((f) => f)) { + const tail = state.recentCalls.slice(-2); + if (tail.length === 2 && tail.every((ec) => !ec.result.success)) { return ( `Die letzten 2 Tool-Calls sind fehlgeschlagen. Brich die ` + `Wiederholung ab — formuliere stattdessen einen Summary-Text, ` + @@ -100,19 +105,10 @@ export function retryLoopReminder(state: { */ export function buildReminderChannel(ctx: ReminderContext): ReminderChannel { return (state) => { - const failures: boolean[] = []; - // We don't get the full executedCalls in LoopState (intentional — - // the channel is meant to be cheap), but `lastCall` is exposed. 
- // For retry-loop detection we'd ideally track the last N; for now - // the single lastCall is enough to skip 2-round miss signals, so - // this producer is effectively dormant until we extend LoopState. - // Left in place so the shape is right for M2 follow-ups. - if (state.lastCall) failures.push(!state.lastCall.result.success); - const out: string[] = []; const budget = tokenBudgetReminder(ctx, state.usage.totalTokens); if (budget) out.push(budget); - const retry = retryLoopReminder({ round: state.round, lastFailures: failures }); + const retry = retryLoopReminder({ round: state.round, recentCalls: state.recentCalls }); if (retry) out.push(retry); return out; };