managarten/packages/shared-ai/src/planner/loop.test.ts
Till JS 72f7978ed4 feat(agent-loop): expose compactionsDone + compactedReminder producer
Closes the loop on M2: when the compactor fires, the LLM needs to know
it's now seeing a <compact-summary> instead of raw turns so it
doesn't waste a turn asking about lost details or re-executing tools
whose responses are gone.

shared-ai:
  - LoopState grows `compactionsDone: number` (cap-1 by current loop
    policy, but shape kept as count for future multi-compact cycles).
  - runPlannerLoop populates it on each reminder-channel call. New
    loop test asserts [0, 1] sequence: round 1 before compaction,
    round 2 after.

mana-ai:
  - New producer `compactedReminder` — fires severity=info when
    compactionsDone >= 1, wrapped in a German one-liner ("frag nicht
    nach verlorenen Details").
  - Injected FIRST in buildReminderChannel so the LLM frames the rest
    of the round with "I'm looking at a summary" context. Metric
    surface stays `{producer='compacted', severity='info'}`.

4 new reminder tests (3 pure producer + 1 composition-ordering) +
1 loop-wiring test. 77 shared-ai, 20 reminders.test.ts — green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 15:36:21 +02:00

733 lines
20 KiB
TypeScript

import { describe, expect, it, vi } from 'vitest';
import { runPlannerLoop, type ToolCallRequest, type ToolResult } from './loop';
import { MockLlmClient } from './mock-llm';
import type { ToolSchema } from '../tools/schemas';
// Shared two-tool fixture used by every suite in this file:
//  - list_things: zero-parameter, defaultPolicy 'auto' — the suites below treat
//    it as the parallel-safe "read" (see the isParallelSafe callbacks).
//  - create_thing: one required string parameter, defaultPolicy 'propose' —
//    used as the non-parallel-safe "write" in the mixed-batch test.
const tools: ToolSchema[] = [
  {
    name: 'list_things',
    module: 'test',
    description: 'list things',
    defaultPolicy: 'auto',
    parameters: [],
  },
  {
    name: 'create_thing',
    module: 'test',
    description: 'create a thing',
    defaultPolicy: 'propose',
    parameters: [{ name: 'title', type: 'string', description: 'title', required: true }],
  },
];
// Core loop contract: stop handling, tool round-trips, failure propagation,
// sequential execution of same-round calls, and the maxRounds ceiling.
describe('runPlannerLoop', () => {
  it('stops immediately when the LLM emits no tool_calls', async () => {
    const client = new MockLlmClient().enqueueStop('done');
    const toolHandler = vi.fn();

    const outcome = await runPlannerLoop({
      llm: client,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'test/model',
      },
      onToolCall: toolHandler,
    });

    // A stop on round 1 means exactly one LLM round and zero tool executions.
    expect(outcome.rounds).toBe(1);
    expect(outcome.executedCalls).toHaveLength(0);
    expect(outcome.summary).toBe('done');
    expect(outcome.stopReason).toBe('assistant-stop');
    expect(toolHandler).not.toHaveBeenCalled();
  });

  it('executes a single tool call and feeds the result back', async () => {
    const client = new MockLlmClient()
      .enqueueToolCalls([{ name: 'list_things', args: {} }])
      .enqueueStop('all done');
    const toolHandler = vi.fn(
      async (_call: ToolCallRequest): Promise<ToolResult> => ({
        success: true,
        data: ['a', 'b'],
        message: '2 things',
      })
    );

    const outcome = await runPlannerLoop({
      llm: client,
      input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
      onToolCall: toolHandler,
    });

    expect(outcome.rounds).toBe(2);
    expect(outcome.executedCalls).toHaveLength(1);
    expect(outcome.executedCalls[0].call.name).toBe('list_things');
    expect(outcome.summary).toBe('all done');
    expect(outcome.stopReason).toBe('assistant-stop');

    // Second LLM call must have seen the tool result in its messages.
    const secondRequest = client.calls[1].messages;
    expect(secondRequest).toHaveLength(4); // system + user + assistant + tool
    const feedback = secondRequest[3];
    expect(feedback.role).toBe('tool');
    expect(feedback.content).toContain('2 things');
  });

  it('executes parallel tool calls sequentially', async () => {
    const client = new MockLlmClient()
      .enqueueToolCalls([
        { name: 'create_thing', args: { title: 'a' } },
        { name: 'create_thing', args: { title: 'b' } },
        { name: 'create_thing', args: { title: 'c' } },
      ])
      .enqueueStop();

    const titlesInExecutionOrder: string[] = [];
    const toolHandler = async (call: ToolCallRequest): Promise<ToolResult> => {
      titlesInExecutionOrder.push(call.arguments.title as string);
      return { success: true, message: 'ok' };
    };

    const outcome = await runPlannerLoop({
      llm: client,
      input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
      onToolCall: toolHandler,
    });

    // Source order of the batch is preserved in execution order.
    expect(titlesInExecutionOrder).toEqual(['a', 'b', 'c']);
    expect(outcome.executedCalls).toHaveLength(3);
  });

  it('propagates tool failures as tool-messages (LLM can react)', async () => {
    const client = new MockLlmClient()
      .enqueueToolCalls([{ name: 'list_things', args: {} }])
      .enqueueStop('ack');
    const failingHandler = async (): Promise<ToolResult> => ({
      success: false,
      message: 'db locked',
    });

    const outcome = await runPlannerLoop({
      llm: client,
      input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
      onToolCall: failingHandler,
    });

    // The failure is serialized into the tool message rather than thrown.
    const feedback = client.calls[1].messages[3];
    expect(feedback.content).toContain('db locked');
    expect(feedback.content).toContain('"success":false');
    expect(outcome.executedCalls[0].result.success).toBe(false);
  });

  it('honours the maxRounds ceiling', async () => {
    const client = new MockLlmClient();
    // Seed more tool-call turns than the cap allows so the loop must cut off.
    for (let turn = 0; turn < 10; turn++) {
      client.enqueueToolCalls([{ name: 'list_things', args: {} }]);
    }
    const toolHandler = async (): Promise<ToolResult> => ({
      success: true,
      message: 'ok',
    });

    const outcome = await runPlannerLoop({
      llm: client,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        maxRounds: 3,
      },
      onToolCall: toolHandler,
    });

    expect(outcome.rounds).toBe(3);
    expect(outcome.stopReason).toBe('max-rounds');
    expect(outcome.executedCalls).toHaveLength(3);
  });
});
// Parallel-read behavior: batches of parallel-safe calls run concurrently,
// message order stays deterministic, mixed batches fall back to sequential,
// oversize batches are capped, and absence of isParallelSafe means sequential.
describe('runPlannerLoop — parallel reads', () => {
  it('runs a batch of parallel-safe tools via Promise.all', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([
        { name: 'list_things', args: { i: 1 } },
        { name: 'list_things', args: { i: 2 } },
        { name: 'list_things', args: { i: 3 } },
      ])
      .enqueueStop();
    // Track in-flight handler count to observe actual concurrency.
    let concurrent = 0;
    let peakConcurrent = 0;
    let completed = 0;
    const onToolCall = async (_call: ToolCallRequest): Promise<ToolResult> => {
      concurrent++;
      peakConcurrent = Math.max(peakConcurrent, concurrent);
      await new Promise((r) => setTimeout(r, 10));
      concurrent--;
      completed++;
      return { success: true, message: `done-${completed}` };
    };
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        isParallelSafe: (name) => name === 'list_things',
      },
      onToolCall,
    });
    // All three ran concurrently — peak should be 3, not 1.
    expect(peakConcurrent).toBe(3);
  });

  it('preserves source order in messages despite parallel completion', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([
        { name: 'list_things', args: { i: 'a' } },
        { name: 'list_things', args: { i: 'b' } },
        { name: 'list_things', args: { i: 'c' } },
      ])
      .enqueueStop();
    // Reverse completion order: first call finishes last.
    const delays: Record<string, number> = { a: 30, b: 10, c: 1 };
    const onToolCall = async (call: ToolCallRequest): Promise<ToolResult> => {
      const i = call.arguments.i as string;
      await new Promise((r) => setTimeout(r, delays[i]));
      return { success: true, message: `item-${i}` };
    };
    const result = await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        isParallelSafe: () => true,
      },
      onToolCall,
    });
    // executedCalls follows source order, not completion order.
    expect(result.executedCalls.map((ec) => ec.call.arguments.i)).toEqual(['a', 'b', 'c']);
    // Tool messages on the NEXT LLM call are in source order too.
    const toolMsgs = llm.calls[1].messages.filter((m) => m.role === 'tool');
    expect(toolMsgs.map((m) => m.content)).toEqual([
      expect.stringContaining('item-a'),
      expect.stringContaining('item-b'),
      expect.stringContaining('item-c'),
    ]);
  });

  it('falls back to sequential when any call is not parallel-safe', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([
        { name: 'list_things', args: {} },
        { name: 'create_thing', args: { title: 'x' } }, // unsafe
        { name: 'list_things', args: {} },
      ])
      .enqueueStop();
    let concurrent = 0;
    let peakConcurrent = 0;
    const onToolCall = async (): Promise<ToolResult> => {
      concurrent++;
      peakConcurrent = Math.max(peakConcurrent, concurrent);
      await new Promise((r) => setTimeout(r, 5));
      concurrent--;
      return { success: true, message: 'ok' };
    };
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        isParallelSafe: (name) => name === 'list_things',
      },
      onToolCall,
    });
    // Mixed batch ran sequentially — peak concurrency stayed at 1.
    expect(peakConcurrent).toBe(1);
  });

  it('batches more than PARALLEL_TOOL_BATCH_SIZE calls', async () => {
    const N = 15; // > 10-call ceiling
    const llm = new MockLlmClient()
      .enqueueToolCalls(Array.from({ length: N }, (_, i) => ({ name: 'list_things', args: { i } })))
      .enqueueStop();
    let concurrent = 0;
    let peakConcurrent = 0;
    const onToolCall = async (): Promise<ToolResult> => {
      concurrent++;
      peakConcurrent = Math.max(peakConcurrent, concurrent);
      await new Promise((r) => setTimeout(r, 15));
      concurrent--;
      return { success: true, message: 'ok' };
    };
    const result = await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        isParallelSafe: () => true,
      },
      onToolCall,
    });
    // The batch actually ran in parallel (a purely sequential run would peak
    // at 1 and would have passed the old `<= 10` assertion vacuously)…
    expect(peakConcurrent).toBeGreaterThan(1);
    // …but was capped at the batch size — the 11th onwards had to wait.
    expect(peakConcurrent).toBeLessThanOrEqual(10);
    // All still executed, all in source order.
    expect(result.executedCalls).toHaveLength(N);
    expect(result.executedCalls.map((ec) => ec.call.arguments.i)).toEqual(
      Array.from({ length: N }, (_, i) => i)
    );
  });

  it('stays sequential when isParallelSafe is not provided (pre-M1 default)', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([
        { name: 'list_things', args: {} },
        { name: 'list_things', args: {} },
      ])
      .enqueueStop();
    let concurrent = 0;
    let peakConcurrent = 0;
    const onToolCall = async (): Promise<ToolResult> => {
      concurrent++;
      peakConcurrent = Math.max(peakConcurrent, concurrent);
      await new Promise((r) => setTimeout(r, 5));
      concurrent--;
      return { success: true, message: 'ok' };
    };
    await runPlannerLoop({
      llm,
      input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
      onToolCall,
    });
    expect(peakConcurrent).toBe(1);
  });
});
// Compactor behavior: threshold gating, message replacement, once-per-run
// cap, disabled-budget bail-out, compactionsDone exposure, and the
// zero-compacted-turns no-op path.
describe('runPlannerLoop — compactor', () => {
  // MockLlmClient exposes no public way to enqueue a response with explicit
  // token usage, so these tests must reach into its private queue. Centralise
  // the one unavoidable `as unknown as` cast here instead of copy-pasting it
  // into every test. `promptTokens` doubles as `totalTokens` (completion = 0).
  const enqueueToolCallWithUsage = (
    llm: MockLlmClient,
    id: string,
    promptTokens: number
  ): void => {
    (llm as unknown as { queue: unknown[] }).queue.push({
      content: null,
      toolCalls: [{ id, name: 'list_things', arguments: {} }],
      finishReason: 'tool_calls',
      usage: { promptTokens, completionTokens: 0, totalTokens: promptTokens },
    });
  };

  it('does not compact below the threshold', async () => {
    const llm = new MockLlmClient();
    enqueueToolCallWithUsage(llm, 'c1', 500); // 50% of the 1000-token budget
    llm.enqueueStop('done');
    const compactSpy = vi.fn();
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        compactor: {
          maxContextTokens: 1000,
          compact: async (m) => {
            compactSpy();
            return { messages: m, compactedTurns: 0 };
          },
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    expect(compactSpy).not.toHaveBeenCalled();
  });

  it('fires when usage crosses the threshold and replaces messages', async () => {
    const llm = new MockLlmClient();
    // Round 1: tool call that reports 92% of the 1000-token budget
    enqueueToolCallWithUsage(llm, 'c1', 920);
    // Round 2: after compaction fires, the LLM stops
    llm.enqueueStop('done');
    let compactorInput: readonly { role: string; content?: string | null }[] = [];
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's-prompt',
        userPrompt: 'u-prompt',
        tools,
        model: 'm',
        compactor: {
          maxContextTokens: 1000,
          compact: async (m) => {
            compactorInput = m;
            return {
              messages: [
                { role: 'system', content: 's-prompt' },
                { role: 'user', content: 'u-prompt' },
                { role: 'assistant', content: '<compact-summary>FOLDED</compact-summary>' },
              ],
              compactedTurns: 2,
            };
          },
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    // The compactor received the full post-round-1 history
    expect(compactorInput.length).toBeGreaterThan(2);
    // The round-2 LLM request saw the compacted history, not the raw one
    const round2Seen = llm.calls[1].messages;
    expect(round2Seen).toHaveLength(3);
    expect(round2Seen[2].content).toContain('FOLDED');
  });

  it('fires at most once per run', async () => {
    const llm = new MockLlmClient();
    // Every round stays over the threshold; only the first may compact.
    for (let i = 0; i < 4; i++) {
      enqueueToolCallWithUsage(llm, `c${i}`, 950);
    }
    llm.enqueueStop('done');
    let compactCallCount = 0;
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        maxRounds: 10,
        compactor: {
          maxContextTokens: 1000,
          compact: async () => {
            compactCallCount++;
            return {
              messages: [
                { role: 'system', content: 's' },
                { role: 'user', content: 'u' },
                { role: 'assistant', content: '<compact>' },
              ],
              compactedTurns: 2,
            };
          },
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    expect(compactCallCount).toBe(1);
  });

  it('bails out silently when maxContextTokens is 0', async () => {
    const llm = new MockLlmClient();
    // A stop response (not a tool call) with absurd usage — the disabled
    // budget must win over the usage figure. Needs its own raw push since the
    // helper above only covers tool-call turns.
    (llm as unknown as { queue: unknown[] }).queue.push({
      content: 'done',
      toolCalls: [],
      finishReason: 'stop',
      usage: { promptTokens: 9_999, completionTokens: 0, totalTokens: 9_999 },
    });
    const compactSpy = vi.fn();
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        compactor: {
          maxContextTokens: 0, // disabled
          compact: async (m) => {
            compactSpy();
            return { messages: m, compactedTurns: 0 };
          },
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    expect(compactSpy).not.toHaveBeenCalled();
  });

  it('surfaces compactionsDone in LoopState for reminder producers', async () => {
    const llm = new MockLlmClient();
    // Round 1: over threshold
    enqueueToolCallWithUsage(llm, 'c1', 950);
    // Round 2: stop so we end cleanly
    llm.enqueueStop('done');
    const compactionsDoneSeen: number[] = [];
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        compactor: {
          maxContextTokens: 1000,
          compact: async () => ({
            messages: [
              { role: 'system', content: 's' },
              { role: 'user', content: 'u' },
              { role: 'assistant', content: '<compact>' },
            ],
            compactedTurns: 2,
          }),
        },
        reminderChannel: (state) => {
          compactionsDoneSeen.push(state.compactionsDone);
          return [];
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    // Round 1 channel call: before compaction fires, so 0
    // Round 2 channel call: after compaction, so 1
    expect(compactionsDoneSeen).toEqual([0, 1]);
  });

  it('skips when the compactor returns 0 compacted turns', async () => {
    const llm = new MockLlmClient();
    enqueueToolCallWithUsage(llm, 'c1', 950);
    llm.enqueueStop('done');
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        compactor: {
          maxContextTokens: 1000,
          compact: async (m) => ({ messages: m, compactedTurns: 0 }),
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    // Round 2 should have seen the ORIGINAL history (untouched by the
    // no-op compactor) — just system + user + assistant + tool
    const round2Seen = llm.calls[1].messages;
    expect(round2Seen).toHaveLength(4);
  });
});
// Reminder-channel contract: reminders are transient per-round system
// messages (never persisted), and the channel receives a LoopState snapshot
// (round, toolCallCount, lastCall, recentCalls window) each round.
describe('runPlannerLoop — reminderChannel', () => {
  it('injects reminders as transient system messages on the LLM call', async () => {
    const llm = new MockLlmClient().enqueueStop('done');
    const result = await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        reminderChannel: () => ['budget 80%', 'mission overdue'],
      },
      onToolCall: vi.fn(),
    });
    // The request messages the mock saw must include the reminders
    // AFTER the user turn, each wrapped in <reminder> tags.
    const seenByLlm = llm.calls[0].messages;
    expect(seenByLlm).toHaveLength(4); // system + user + 2 reminders
    expect(seenByLlm[0].role).toBe('system');
    expect(seenByLlm[0].content).toBe('s');
    expect(seenByLlm[1].role).toBe('user');
    expect(seenByLlm[2].role).toBe('system');
    expect(seenByLlm[2].content).toBe('<reminder>budget 80%</reminder>');
    expect(seenByLlm[3].role).toBe('system');
    expect(seenByLlm[3].content).toBe('<reminder>mission overdue</reminder>');
    // And the persisted history must NOT contain them.
    expect(result.messages.find((m) => m.content?.includes('<reminder>'))).toBeUndefined();
  });

  it('is called per round with fresh state — round 2 does not see round 1 reminders', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([{ name: 'list_things', args: {} }])
      .enqueueStop('done');
    const channelCalls: Array<{ round: number; reminders: string[] }> = [];
    // Each round emits one round-tagged reminder so the two rounds can be
    // told apart in the captured requests below.
    const channel = vi.fn((state) => {
      const reminders = [`round-${state.round}`];
      channelCalls.push({ round: state.round, reminders });
      return reminders;
    });
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        reminderChannel: channel,
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    expect(channel).toHaveBeenCalledTimes(2);
    expect(channelCalls).toEqual([
      { round: 1, reminders: ['round-1'] },
      { round: 2, reminders: ['round-2'] },
    ]);
    // Round 2's request must have ONLY round-2's reminder, not round-1's.
    const round2Seen = llm.calls[1].messages;
    const reminders = round2Seen.filter((m) => m.content?.includes('<reminder>'));
    expect(reminders).toHaveLength(1);
    expect(reminders[0].content).toBe('<reminder>round-2</reminder>');
  });

  it('exposes recentCalls as a sliding window, oldest-first', async () => {
    // 7 rounds, each with one tool call, so by round 7 we have 6 prior
    // results — the window must cap at LOOP_STATE_RECENT_CALLS_WINDOW = 5.
    const llm = new MockLlmClient();
    for (let i = 0; i < 7; i++) {
      llm.enqueueToolCalls([{ name: 'list_things', args: { i } }]);
    }
    llm.enqueueStop();
    // One snapshot of the window per round, projected down to (arg i, success).
    const windowsSeen: Array<Array<{ i: unknown; ok: boolean }>> = [];
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        maxRounds: 10,
        reminderChannel: (state) => {
          windowsSeen.push(
            state.recentCalls.map((ec) => ({
              i: ec.call.arguments.i,
              ok: ec.result.success,
            }))
          );
          return [];
        },
      },
      onToolCall: async (call) => ({
        success: true,
        message: `ok-${call.arguments.i}`,
      }),
    });
    // Round 1 → window empty
    expect(windowsSeen[0]).toEqual([]);
    // Round 2 → one prior call
    expect(windowsSeen[1]).toEqual([{ i: 0, ok: true }]);
    // Round 6 → five prior calls, oldest-first
    expect(windowsSeen[5]).toEqual([
      { i: 0, ok: true },
      { i: 1, ok: true },
      { i: 2, ok: true },
      { i: 3, ok: true },
      { i: 4, ok: true },
    ]);
    // Round 7 → window slides; i=0 drops off, i=5 is newest
    expect(windowsSeen[6]).toEqual([
      { i: 1, ok: true },
      { i: 2, ok: true },
      { i: 3, ok: true },
      { i: 4, ok: true },
      { i: 5, ok: true },
    ]);
  });

  it('surfaces loop state — toolCallCount and lastCall — to the channel', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([{ name: 'list_things', args: {} }])
      .enqueueToolCalls([{ name: 'create_thing', args: { title: 'x' } }])
      .enqueueStop('done');
    const snapshots: Array<{ round: number; toolCallCount: number; lastName?: string }> = [];
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        reminderChannel: (state) => {
          snapshots.push({
            round: state.round,
            toolCallCount: state.toolCallCount,
            lastName: state.lastCall?.call.name,
          });
          return [];
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    // Counters reflect calls completed BEFORE each round's channel invocation.
    expect(snapshots).toEqual([
      { round: 1, toolCallCount: 0, lastName: undefined },
      { round: 2, toolCallCount: 1, lastName: 'list_things' },
      { round: 3, toolCallCount: 2, lastName: 'create_thing' },
    ]);
  });

  it('empty reminders array leaves the request unchanged', async () => {
    const llm = new MockLlmClient().enqueueStop('done');
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        reminderChannel: () => [],
      },
      onToolCall: vi.fn(),
    });
    const seenByLlm = llm.calls[0].messages;
    expect(seenByLlm).toHaveLength(2); // just system + user
  });
});