managarten/packages/shared-ai/src/planner/loop.test.ts
Till JS 72f7978ed4 feat(agent-loop): expose compactionsDone + compactedReminder producer
Closes the loop on M2: when the compactor fires, the LLM needs to know
it's now seeing a <compact-summary> instead of raw turns so it
doesn't waste a turn asking about lost details or re-executing tools
whose responses are gone.

shared-ai:
  - LoopState grows `compactionsDone: number` (cap-1 by current loop
    policy, but shape kept as count for future multi-compact cycles).
  - runPlannerLoop populates it on each reminder-channel call. New
    loop test asserts [0, 1] sequence: round 1 before compaction,
    round 2 after.

mana-ai:
  - New producer `compactedReminder` — fires severity=info when
    compactionsDone >= 1, wrapped in a German one-liner ("frag nicht
    nach verlorenen Details").
  - Injected FIRST in buildReminderChannel so the LLM frames the rest
    of the round with "I'm looking at a summary" context. Metric
    surface stays `{producer='compacted', severity='info'}`.

4 new reminder tests (3 pure producer + 1 composition-ordering) +
1 loop-wiring test. 77 shared-ai, 20 reminders.test.ts — green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 15:36:21 +02:00

733 lines
20 KiB
TypeScript

import { describe, expect, it, vi } from 'vitest';
import { runPlannerLoop, type ToolCallRequest, type ToolResult } from './loop';
import { MockLlmClient } from './mock-llm';
import type { ToolSchema } from '../tools/schemas';
// Shared two-tool fixture used by every suite in this file:
//  - list_things: zero-parameter, defaultPolicy 'auto' — the suites below treat
//    it as the parallel-safe "read" (see the isParallelSafe callbacks).
//  - create_thing: one required string parameter, defaultPolicy 'propose' —
//    used as the non-parallel-safe "write" in the mixed-batch test.
const tools: ToolSchema[] = [
  {
    name: 'list_things',
    module: 'test',
    description: 'list things',
    defaultPolicy: 'auto',
    parameters: [],
  },
  {
    name: 'create_thing',
    module: 'test',
    description: 'create a thing',
    defaultPolicy: 'propose',
    parameters: [{ name: 'title', type: 'string', description: 'title', required: true }],
  },
];
// Core loop contract: stop handling, tool round-trips, failure propagation,
// sequential execution of same-round calls, and the maxRounds ceiling.
describe('runPlannerLoop', () => {
  it('stops immediately when the LLM emits no tool_calls', async () => {
    const client = new MockLlmClient().enqueueStop('done');
    const toolHandler = vi.fn();

    const outcome = await runPlannerLoop({
      llm: client,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'test/model',
      },
      onToolCall: toolHandler,
    });

    // A stop on round 1 means exactly one LLM round and zero tool executions.
    expect(outcome.rounds).toBe(1);
    expect(outcome.executedCalls).toHaveLength(0);
    expect(outcome.summary).toBe('done');
    expect(outcome.stopReason).toBe('assistant-stop');
    expect(toolHandler).not.toHaveBeenCalled();
  });

  it('executes a single tool call and feeds the result back', async () => {
    const client = new MockLlmClient()
      .enqueueToolCalls([{ name: 'list_things', args: {} }])
      .enqueueStop('all done');
    const toolHandler = vi.fn(
      async (_call: ToolCallRequest): Promise<ToolResult> => ({
        success: true,
        data: ['a', 'b'],
        message: '2 things',
      })
    );

    const outcome = await runPlannerLoop({
      llm: client,
      input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
      onToolCall: toolHandler,
    });

    expect(outcome.rounds).toBe(2);
    expect(outcome.executedCalls).toHaveLength(1);
    expect(outcome.executedCalls[0].call.name).toBe('list_things');
    expect(outcome.summary).toBe('all done');
    expect(outcome.stopReason).toBe('assistant-stop');

    // Second LLM call must have seen the tool result in its messages.
    const secondRequest = client.calls[1].messages;
    expect(secondRequest).toHaveLength(4); // system + user + assistant + tool
    const feedback = secondRequest[3];
    expect(feedback.role).toBe('tool');
    expect(feedback.content).toContain('2 things');
  });

  it('executes parallel tool calls sequentially', async () => {
    const client = new MockLlmClient()
      .enqueueToolCalls([
        { name: 'create_thing', args: { title: 'a' } },
        { name: 'create_thing', args: { title: 'b' } },
        { name: 'create_thing', args: { title: 'c' } },
      ])
      .enqueueStop();

    const titlesInExecutionOrder: string[] = [];
    const toolHandler = async (call: ToolCallRequest): Promise<ToolResult> => {
      titlesInExecutionOrder.push(call.arguments.title as string);
      return { success: true, message: 'ok' };
    };

    const outcome = await runPlannerLoop({
      llm: client,
      input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
      onToolCall: toolHandler,
    });

    // Source order of the batch is preserved in execution order.
    expect(titlesInExecutionOrder).toEqual(['a', 'b', 'c']);
    expect(outcome.executedCalls).toHaveLength(3);
  });

  it('propagates tool failures as tool-messages (LLM can react)', async () => {
    const client = new MockLlmClient()
      .enqueueToolCalls([{ name: 'list_things', args: {} }])
      .enqueueStop('ack');
    const failingHandler = async (): Promise<ToolResult> => ({
      success: false,
      message: 'db locked',
    });

    const outcome = await runPlannerLoop({
      llm: client,
      input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
      onToolCall: failingHandler,
    });

    // The failure is serialized into the tool message rather than thrown.
    const feedback = client.calls[1].messages[3];
    expect(feedback.content).toContain('db locked');
    expect(feedback.content).toContain('"success":false');
    expect(outcome.executedCalls[0].result.success).toBe(false);
  });

  it('honours the maxRounds ceiling', async () => {
    const client = new MockLlmClient();
    // Seed more tool-call turns than the cap allows so the loop must cut off.
    for (let turn = 0; turn < 10; turn++) {
      client.enqueueToolCalls([{ name: 'list_things', args: {} }]);
    }
    const toolHandler = async (): Promise<ToolResult> => ({
      success: true,
      message: 'ok',
    });

    const outcome = await runPlannerLoop({
      llm: client,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        maxRounds: 3,
      },
      onToolCall: toolHandler,
    });

    expect(outcome.rounds).toBe(3);
    expect(outcome.stopReason).toBe('max-rounds');
    expect(outcome.executedCalls).toHaveLength(3);
  });
});
// Parallel-read behavior: batches of parallel-safe calls run concurrently,
// message order stays deterministic, mixed batches fall back to sequential,
// oversize batches are capped, and absence of isParallelSafe means sequential.
describe('runPlannerLoop — parallel reads', () => {
  it('runs a batch of parallel-safe tools via Promise.all', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([
        { name: 'list_things', args: { i: 1 } },
        { name: 'list_things', args: { i: 2 } },
        { name: 'list_things', args: { i: 3 } },
      ])
      .enqueueStop();
    // Track in-flight handler count to observe actual concurrency.
    let concurrent = 0;
    let peakConcurrent = 0;
    let completed = 0;
    const onToolCall = async (_call: ToolCallRequest): Promise<ToolResult> => {
      concurrent++;
      peakConcurrent = Math.max(peakConcurrent, concurrent);
      await new Promise((r) => setTimeout(r, 10));
      concurrent--;
      completed++;
      return { success: true, message: `done-${completed}` };
    };
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        isParallelSafe: (name) => name === 'list_things',
      },
      onToolCall,
    });
    // All three ran concurrently — peak should be 3, not 1.
    expect(peakConcurrent).toBe(3);
  });

  it('preserves source order in messages despite parallel completion', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([
        { name: 'list_things', args: { i: 'a' } },
        { name: 'list_things', args: { i: 'b' } },
        { name: 'list_things', args: { i: 'c' } },
      ])
      .enqueueStop();
    // Reverse completion order: first call finishes last.
    const delays: Record<string, number> = { a: 30, b: 10, c: 1 };
    const onToolCall = async (call: ToolCallRequest): Promise<ToolResult> => {
      const i = call.arguments.i as string;
      await new Promise((r) => setTimeout(r, delays[i]));
      return { success: true, message: `item-${i}` };
    };
    const result = await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        isParallelSafe: () => true,
      },
      onToolCall,
    });
    // executedCalls follows source order, not completion order.
    expect(result.executedCalls.map((ec) => ec.call.arguments.i)).toEqual(['a', 'b', 'c']);
    // Tool messages on the NEXT LLM call are in source order too.
    const toolMsgs = llm.calls[1].messages.filter((m) => m.role === 'tool');
    expect(toolMsgs.map((m) => m.content)).toEqual([
      expect.stringContaining('item-a'),
      expect.stringContaining('item-b'),
      expect.stringContaining('item-c'),
    ]);
  });

  it('falls back to sequential when any call is not parallel-safe', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([
        { name: 'list_things', args: {} },
        { name: 'create_thing', args: { title: 'x' } }, // unsafe
        { name: 'list_things', args: {} },
      ])
      .enqueueStop();
    let concurrent = 0;
    let peakConcurrent = 0;
    const onToolCall = async (): Promise<ToolResult> => {
      concurrent++;
      peakConcurrent = Math.max(peakConcurrent, concurrent);
      await new Promise((r) => setTimeout(r, 5));
      concurrent--;
      return { success: true, message: 'ok' };
    };
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        isParallelSafe: (name) => name === 'list_things',
      },
      onToolCall,
    });
    // Mixed batch ran sequentially — peak concurrency stayed at 1.
    expect(peakConcurrent).toBe(1);
  });

  it('batches more than PARALLEL_TOOL_BATCH_SIZE calls', async () => {
    const N = 15; // > 10-call ceiling
    const llm = new MockLlmClient()
      .enqueueToolCalls(Array.from({ length: N }, (_, i) => ({ name: 'list_things', args: { i } })))
      .enqueueStop();
    let concurrent = 0;
    let peakConcurrent = 0;
    const onToolCall = async (): Promise<ToolResult> => {
      concurrent++;
      peakConcurrent = Math.max(peakConcurrent, concurrent);
      await new Promise((r) => setTimeout(r, 15));
      concurrent--;
      return { success: true, message: 'ok' };
    };
    const result = await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        isParallelSafe: () => true,
      },
      onToolCall,
    });
    // The batch actually ran in parallel (a purely sequential run would peak
    // at 1 and would have passed the old `<= 10` assertion vacuously)…
    expect(peakConcurrent).toBeGreaterThan(1);
    // …but was capped at the batch size — the 11th onwards had to wait.
    expect(peakConcurrent).toBeLessThanOrEqual(10);
    // All still executed, all in source order.
    expect(result.executedCalls).toHaveLength(N);
    expect(result.executedCalls.map((ec) => ec.call.arguments.i)).toEqual(
      Array.from({ length: N }, (_, i) => i)
    );
  });

  it('stays sequential when isParallelSafe is not provided (pre-M1 default)', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([
        { name: 'list_things', args: {} },
        { name: 'list_things', args: {} },
      ])
      .enqueueStop();
    let concurrent = 0;
    let peakConcurrent = 0;
    const onToolCall = async (): Promise<ToolResult> => {
      concurrent++;
      peakConcurrent = Math.max(peakConcurrent, concurrent);
      await new Promise((r) => setTimeout(r, 5));
      concurrent--;
      return { success: true, message: 'ok' };
    };
    await runPlannerLoop({
      llm,
      input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
      onToolCall,
    });
    expect(peakConcurrent).toBe(1);
  });
});
// Compactor behavior: threshold gating, message replacement, once-per-run
// cap, disabled-budget bail-out, compactionsDone exposure, and the
// zero-compacted-turns no-op path.
describe('runPlannerLoop — compactor', () => {
  // MockLlmClient exposes no public way to enqueue a response with explicit
  // token usage, so these tests must reach into its private queue. Centralise
  // the one unavoidable `as unknown as` cast here instead of copy-pasting it
  // into every test. `promptTokens` doubles as `totalTokens` (completion = 0).
  const enqueueToolCallWithUsage = (
    llm: MockLlmClient,
    id: string,
    promptTokens: number
  ): void => {
    (llm as unknown as { queue: unknown[] }).queue.push({
      content: null,
      toolCalls: [{ id, name: 'list_things', arguments: {} }],
      finishReason: 'tool_calls',
      usage: { promptTokens, completionTokens: 0, totalTokens: promptTokens },
    });
  };

  it('does not compact below the threshold', async () => {
    const llm = new MockLlmClient();
    enqueueToolCallWithUsage(llm, 'c1', 500); // 50% of the 1000-token budget
    llm.enqueueStop('done');
    const compactSpy = vi.fn();
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        compactor: {
          maxContextTokens: 1000,
          compact: async (m) => {
            compactSpy();
            return { messages: m, compactedTurns: 0 };
          },
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    expect(compactSpy).not.toHaveBeenCalled();
  });

  it('fires when usage crosses the threshold and replaces messages', async () => {
    const llm = new MockLlmClient();
    // Round 1: tool call that reports 92% of the 1000-token budget
    enqueueToolCallWithUsage(llm, 'c1', 920);
    // Round 2: after compaction fires, the LLM stops
    llm.enqueueStop('done');
    let compactorInput: readonly { role: string; content?: string | null }[] = [];
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's-prompt',
        userPrompt: 'u-prompt',
        tools,
        model: 'm',
        compactor: {
          maxContextTokens: 1000,
          compact: async (m) => {
            compactorInput = m;
            return {
              messages: [
                { role: 'system', content: 's-prompt' },
                { role: 'user', content: 'u-prompt' },
                { role: 'assistant', content: '<compact-summary>FOLDED</compact-summary>' },
              ],
              compactedTurns: 2,
            };
          },
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    // The compactor received the full post-round-1 history
    expect(compactorInput.length).toBeGreaterThan(2);
    // The round-2 LLM request saw the compacted history, not the raw one
    const round2Seen = llm.calls[1].messages;
    expect(round2Seen).toHaveLength(3);
    expect(round2Seen[2].content).toContain('FOLDED');
  });

  it('fires at most once per run', async () => {
    const llm = new MockLlmClient();
    // Every round stays over the threshold; only the first may compact.
    for (let i = 0; i < 4; i++) {
      enqueueToolCallWithUsage(llm, `c${i}`, 950);
    }
    llm.enqueueStop('done');
    let compactCallCount = 0;
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        maxRounds: 10,
        compactor: {
          maxContextTokens: 1000,
          compact: async () => {
            compactCallCount++;
            return {
              messages: [
                { role: 'system', content: 's' },
                { role: 'user', content: 'u' },
                { role: 'assistant', content: '<compact>' },
              ],
              compactedTurns: 2,
            };
          },
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    expect(compactCallCount).toBe(1);
  });

  it('bails out silently when maxContextTokens is 0', async () => {
    const llm = new MockLlmClient();
    // A stop response (not a tool call) with absurd usage — the disabled
    // budget must win over the usage figure. Needs its own raw push since the
    // helper above only covers tool-call turns.
    (llm as unknown as { queue: unknown[] }).queue.push({
      content: 'done',
      toolCalls: [],
      finishReason: 'stop',
      usage: { promptTokens: 9_999, completionTokens: 0, totalTokens: 9_999 },
    });
    const compactSpy = vi.fn();
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        compactor: {
          maxContextTokens: 0, // disabled
          compact: async (m) => {
            compactSpy();
            return { messages: m, compactedTurns: 0 };
          },
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    expect(compactSpy).not.toHaveBeenCalled();
  });

  it('surfaces compactionsDone in LoopState for reminder producers', async () => {
    const llm = new MockLlmClient();
    // Round 1: over threshold
    enqueueToolCallWithUsage(llm, 'c1', 950);
    // Round 2: stop so we end cleanly
    llm.enqueueStop('done');
    const compactionsDoneSeen: number[] = [];
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        compactor: {
          maxContextTokens: 1000,
          compact: async () => ({
            messages: [
              { role: 'system', content: 's' },
              { role: 'user', content: 'u' },
              { role: 'assistant', content: '<compact>' },
            ],
            compactedTurns: 2,
          }),
        },
        reminderChannel: (state) => {
          compactionsDoneSeen.push(state.compactionsDone);
          return [];
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    // Round 1 channel call: before compaction fires, so 0
    // Round 2 channel call: after compaction, so 1
    expect(compactionsDoneSeen).toEqual([0, 1]);
  });

  it('skips when the compactor returns 0 compacted turns', async () => {
    const llm = new MockLlmClient();
    enqueueToolCallWithUsage(llm, 'c1', 950);
    llm.enqueueStop('done');
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        compactor: {
          maxContextTokens: 1000,
          compact: async (m) => ({ messages: m, compactedTurns: 0 }),
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    // Round 2 should have seen the ORIGINAL history (untouched by the
    // no-op compactor) — just system + user + assistant + tool
    const round2Seen = llm.calls[1].messages;
    expect(round2Seen).toHaveLength(4);
  });
});
// Reminder-channel contract: reminders are transient per-round system
// messages (never persisted), and the channel receives a LoopState snapshot
// (round, toolCallCount, lastCall, recentCalls window) each round.
describe('runPlannerLoop — reminderChannel', () => {
  it('injects reminders as transient system messages on the LLM call', async () => {
    const llm = new MockLlmClient().enqueueStop('done');
    const result = await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        reminderChannel: () => ['budget 80%', 'mission overdue'],
      },
      onToolCall: vi.fn(),
    });
    // The request messages the mock saw must include the reminders
    // AFTER the user turn, each wrapped in <reminder> tags.
    const seenByLlm = llm.calls[0].messages;
    expect(seenByLlm).toHaveLength(4); // system + user + 2 reminders
    expect(seenByLlm[0].role).toBe('system');
    expect(seenByLlm[0].content).toBe('s');
    expect(seenByLlm[1].role).toBe('user');
    expect(seenByLlm[2].role).toBe('system');
    expect(seenByLlm[2].content).toBe('<reminder>budget 80%</reminder>');
    expect(seenByLlm[3].role).toBe('system');
    expect(seenByLlm[3].content).toBe('<reminder>mission overdue</reminder>');
    // And the persisted history must NOT contain them.
    expect(result.messages.find((m) => m.content?.includes('<reminder>'))).toBeUndefined();
  });

  it('is called per round with fresh state — round 2 does not see round 1 reminders', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([{ name: 'list_things', args: {} }])
      .enqueueStop('done');
    const channelCalls: Array<{ round: number; reminders: string[] }> = [];
    // Each round emits one round-tagged reminder so the two rounds can be
    // told apart in the captured requests below.
    const channel = vi.fn((state) => {
      const reminders = [`round-${state.round}`];
      channelCalls.push({ round: state.round, reminders });
      return reminders;
    });
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        reminderChannel: channel,
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    expect(channel).toHaveBeenCalledTimes(2);
    expect(channelCalls).toEqual([
      { round: 1, reminders: ['round-1'] },
      { round: 2, reminders: ['round-2'] },
    ]);
    // Round 2's request must have ONLY round-2's reminder, not round-1's.
    const round2Seen = llm.calls[1].messages;
    const reminders = round2Seen.filter((m) => m.content?.includes('<reminder>'));
    expect(reminders).toHaveLength(1);
    expect(reminders[0].content).toBe('<reminder>round-2</reminder>');
  });

  it('exposes recentCalls as a sliding window, oldest-first', async () => {
    // 7 rounds, each with one tool call, so by round 7 we have 6 prior
    // results — the window must cap at LOOP_STATE_RECENT_CALLS_WINDOW = 5.
    const llm = new MockLlmClient();
    for (let i = 0; i < 7; i++) {
      llm.enqueueToolCalls([{ name: 'list_things', args: { i } }]);
    }
    llm.enqueueStop();
    // One snapshot of the window per round, projected down to (arg i, success).
    const windowsSeen: Array<Array<{ i: unknown; ok: boolean }>> = [];
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        maxRounds: 10,
        reminderChannel: (state) => {
          windowsSeen.push(
            state.recentCalls.map((ec) => ({
              i: ec.call.arguments.i,
              ok: ec.result.success,
            }))
          );
          return [];
        },
      },
      onToolCall: async (call) => ({
        success: true,
        message: `ok-${call.arguments.i}`,
      }),
    });
    // Round 1 → window empty
    expect(windowsSeen[0]).toEqual([]);
    // Round 2 → one prior call
    expect(windowsSeen[1]).toEqual([{ i: 0, ok: true }]);
    // Round 6 → five prior calls, oldest-first
    expect(windowsSeen[5]).toEqual([
      { i: 0, ok: true },
      { i: 1, ok: true },
      { i: 2, ok: true },
      { i: 3, ok: true },
      { i: 4, ok: true },
    ]);
    // Round 7 → window slides; i=0 drops off, i=5 is newest
    expect(windowsSeen[6]).toEqual([
      { i: 1, ok: true },
      { i: 2, ok: true },
      { i: 3, ok: true },
      { i: 4, ok: true },
      { i: 5, ok: true },
    ]);
  });

  it('surfaces loop state — toolCallCount and lastCall — to the channel', async () => {
    const llm = new MockLlmClient()
      .enqueueToolCalls([{ name: 'list_things', args: {} }])
      .enqueueToolCalls([{ name: 'create_thing', args: { title: 'x' } }])
      .enqueueStop('done');
    const snapshots: Array<{ round: number; toolCallCount: number; lastName?: string }> = [];
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        reminderChannel: (state) => {
          snapshots.push({
            round: state.round,
            toolCallCount: state.toolCallCount,
            lastName: state.lastCall?.call.name,
          });
          return [];
        },
      },
      onToolCall: async () => ({ success: true, message: 'ok' }),
    });
    // Counters reflect calls completed BEFORE each round's channel invocation.
    expect(snapshots).toEqual([
      { round: 1, toolCallCount: 0, lastName: undefined },
      { round: 2, toolCallCount: 1, lastName: 'list_things' },
      { round: 3, toolCallCount: 2, lastName: 'create_thing' },
    ]);
  });

  it('empty reminders array leaves the request unchanged', async () => {
    const llm = new MockLlmClient().enqueueStop('done');
    await runPlannerLoop({
      llm,
      input: {
        systemPrompt: 's',
        userPrompt: 'u',
        tools,
        model: 'm',
        reminderChannel: () => [],
      },
      onToolCall: vi.fn(),
    });
    const seenByLlm = llm.calls[0].messages;
    expect(seenByLlm).toHaveLength(2); // just system + user
  });
});