managarten/packages/shared-ai/src/planner/loop.ts

/**
 * Multi-turn tool-calling loop shared between the webapp runner and the
 * server-side mana-ai tick. Replaces the text-JSON planner pipeline:
 * we hand the LLM a tool catalog, it emits native tool_calls, we
 * execute them and feed the results back as tool-messages until the
 * LLM has nothing more to call (or we hit the round budget).
 *
 * Environment-specific concerns (HTTP transport, auth, actor
 * attribution) live in the caller-provided ``LlmClient`` and
 * ``onToolCall`` callback. The loop itself stays pure.
 */

import type { ToolSchema } from '../tools/schemas';
import type { ToolSpec } from '../tools/function-schema';
import { toolsToFunctionSchemas } from '../tools/function-schema';

// ─── Chat-message contract ──────────────────────────────────────────

export interface ToolCallRequest {
	readonly id: string;
	readonly name: string;
	readonly arguments: Record<string, unknown>;
}

export interface ToolResult {
	readonly success: boolean;
	readonly data?: unknown;
	readonly message: string;
}

export type ChatRole = 'system' | 'user' | 'assistant' | 'tool';

export interface ChatMessage {
	readonly role: ChatRole;
	readonly content?: string | null;
	readonly toolCalls?: readonly ToolCallRequest[];
	readonly toolCallId?: string;
}

// ─── LLM client contract ────────────────────────────────────────────

export interface LlmCompletionRequest {
	readonly messages: readonly ChatMessage[];
	readonly tools: readonly ToolSpec[];
	readonly model: string;
	readonly temperature?: number;
}

export type LlmFinishReason = 'stop' | 'tool_calls' | 'length' | 'content_filter';

export interface TokenUsage {
	readonly promptTokens: number;
	readonly completionTokens: number;
	readonly totalTokens: number;
}

export interface LlmCompletionResponse {
	readonly content: string | null;
	readonly toolCalls: readonly ToolCallRequest[];
	readonly finishReason: LlmFinishReason;
	/** Token counts for this one call — propagated from the provider
	 *  response when available. Summed across rounds in PlannerLoopResult. */
	readonly usage?: TokenUsage;
}

export interface LlmClient {
	complete(req: LlmCompletionRequest): Promise<LlmCompletionResponse>;
}

// ─── Loop input / result ────────────────────────────────────────────

/** Sliding-window size for `LoopState.recentCalls`. Capped so the
 *  reminder channel stays cheap and hint-producers can only reason
 *  over the last handful of calls, which is what retry-loop-style
 *  heuristics need. */
export const LOOP_STATE_RECENT_CALLS_WINDOW = 5;

/**
 * Transient loop state surfaced to the reminderChannel. The reminder
 * callback is pure — it reads this snapshot and returns hints; it does
 * not mutate anything.
 */
export interface LoopState {
	/** 1-based round index for the CURRENT LLM call (before it runs). */
	readonly round: number;
	/** Number of tool calls executed across all prior rounds. */
	readonly toolCallCount: number;
	/** Accumulated tokens reported by the provider, up to (but not
	 *  including) the current round's call. Zero when the provider
	 *  hasn't reported usage. */
	readonly usage: TokenUsage;
	/** The most recent ExecutedCall, or undefined in round 1. Handy for
	 *  "the last tool failed — warn the LLM" producers. */
	readonly lastCall?: ExecutedCall;
	/**
	 * Sliding window of the last N (= `LOOP_STATE_RECENT_CALLS_WINDOW`)
	 * ExecutedCalls in source order, oldest first. Used by producers
	 * that need more than the single-last signal — retry-loop detection
	 * (N consecutive failures), burst detection (many calls to the same
	 * tool), and similar. Empty in round 1; grows up to the cap.
	 */
	readonly recentCalls: readonly ExecutedCall[];
	/**
	 * Number of times the compactor has folded the message history in
	 * this loop run. Capped at 1 by the loop itself (fire-once policy),
	 * but still exposed as a count rather than a boolean so future
	 * policies (e.g. multi-compact cycles) don't need a breaking API
	 * change. A producer can use this to inject a "just compacted"
	 * reminder on the round immediately after compaction.
	 */
	readonly compactionsDone: number;
}

/**
 * Callback that yields transient system-message strings to attach to the
 * NEXT LLM request only. Returned strings are wrapped in `<reminder>…
 * </reminder>` tags and injected as system messages AFTER the persistent
 * `messages` history. They are NEVER written back to `messages[]` and
 * therefore NEVER appear in `PlannerLoopResult.messages`.
 *
 * This is the Claude-Code `<system-reminder>` pattern: steering the model
 * per-turn without polluting the persisted conversation log or
 * invalidating the provider's KV-cache on stable prefixes.
 */
export type ReminderChannel = (state: LoopState) => readonly string[];

export interface PlannerLoopInput {
	readonly systemPrompt: string;
	readonly userPrompt: string;
	/** Optional prior conversation turns inserted between the system
	 *  prompt and the new user turn. Used by the companion chat to
	 *  preserve multi-turn history; missions leave this empty. */
	readonly priorMessages?: readonly ChatMessage[];
	readonly tools: readonly ToolSchema[];
	readonly model: string;
	readonly temperature?: number;
	/** Hard ceiling on planner rounds. Each round = one LLM call plus
	 *  whatever tool executions its output triggered. Defaults to 5. */
	readonly maxRounds?: number;
	/** Optional per-round reminder producer — see ReminderChannel docs. */
	readonly reminderChannel?: ReminderChannel;
	/**
	 * Predicate that decides whether a tool is safe to execute in parallel
	 * with other tools of the same stripe. Claude-Code `gW5` pattern: when
	 * every tool_call in a round is parallel-safe, they run via Promise.all
	 * in batches of 10; if any call is NOT parallel-safe, the whole batch
	 * falls back to sequential (preserves ordering invariants for
	 * write-after-read chains).
	 *
	 * Default: `() => false` → fully sequential, matching pre-M1 behaviour.
	 *
	 * The predicate is called once per tool_call per round, so cheap
	 * constant-time lookups are expected (registry hit, name-prefix check).
	 */
	readonly isParallelSafe?: (toolName: string) => boolean;
	/**
	 * Context-window compactor wiring (Claude-Code `wU2` pattern).
	 *
	 * When set AND usage crosses the threshold, the loop replaces the
	 * middle of the message history with a compact summary before the
	 * next LLM call. The compact summary is persisted in the returned
	 * `messages` — unlike reminders, this IS part of the canonical
	 * history because raw turns got dropped.
	 *
	 * Contract:
	 *   - `maxContextTokens`: provider ceiling; compactor skips when unset
	 *     (matches `shouldCompact()`'s safe-bail behaviour).
	 *   - `compact`: async callback that performs the compaction. Pass
	 *     `compactHistory` from this package or an adapter that uses a
	 *     cheaper model (e.g. Haiku) for the compactor's LLM call.
	 *   - `threshold`: optional override, default 0.92.
	 *
	 * Compaction fires at MOST once per loop run — once a round has been
	 * compacted, we don't re-trigger until the next run, even if the
	 * fresh history hits the threshold again (defence-in-depth against
	 * a runaway tool that keeps bloating turns).
	 */
	readonly compactor?: {
		readonly maxContextTokens: number;
		readonly threshold?: number;
		readonly compact: (
			messages: readonly ChatMessage[]
		) => Promise<{ readonly messages: readonly ChatMessage[]; readonly compactedTurns: number }>;
	};
}

/** Max concurrent tool executions per round. Mirrors Claude Code's gW5
 *  ceiling. Keeps tail latency bounded when the LLM requests many reads
 *  at once and protects downstream services from unbounded fan-out. */
export const PARALLEL_TOOL_BATCH_SIZE = 10;

export interface ExecutedCall {
	readonly round: number;
	readonly call: ToolCallRequest;
	readonly result: ToolResult;
}

export type LoopStopReason = 'assistant-stop' | 'max-rounds' | 'no-tool-calls' | 'llm-error';

export interface PlannerLoopResult {
	readonly rounds: number;
	readonly executedCalls: readonly ExecutedCall[];
	/** Final assistant text when the LLM stopped instead of calling a
	 *  tool. ``null`` when the last turn was a tool-call burst that we
	 *  cut off via round budget. */
	readonly summary: string | null;
	readonly stopReason: LoopStopReason;
	/** Complete chat history for debug-log capture (system + user +
	 *  every assistant/tool turn). Never synced — contains decrypted
	 *  user content. */
	readonly messages: readonly ChatMessage[];
	/** Accumulated token usage across every LLM round. Zero counts when
	 *  the provider didn't report usage. Consumers use this for budget
	 *  tracking (mana-ai's per-agent daily limit) and cost telemetry. */
	readonly usage: TokenUsage;
}

// ─── The loop ───────────────────────────────────────────────────────

const DEFAULT_MAX_ROUNDS = 5;

export async function runPlannerLoop(opts: {
	readonly llm: LlmClient;
	readonly input: PlannerLoopInput;
	/** Execute a tool call and return the result that should be fed back
	 *  to the LLM as a tool-message. Must not throw — convert errors to
	 *  ``{ success: false, message }``. The loop injects the result
	 *  verbatim so the LLM can reason over failures (e.g. "vault locked
	 *  → ask user to unlock"). */
	readonly onToolCall: (call: ToolCallRequest) => Promise<ToolResult>;
}): Promise<PlannerLoopResult> {
	const { llm, input, onToolCall } = opts;
	const maxRounds = input.maxRounds ?? DEFAULT_MAX_ROUNDS;
	const toolSpecs = toolsToFunctionSchemas(input.tools);

	const messages: ChatMessage[] = [
		{ role: 'system', content: input.systemPrompt },
		...(input.priorMessages ?? []),
		{ role: 'user', content: input.userPrompt },
	];
	const executedCalls: ExecutedCall[] = [];
	let summary: string | null = null;
	let stopReason: LoopStopReason = 'max-rounds';
	let rounds = 0;
	let promptTokens = 0;
	let completionTokens = 0;
	let compactionsDone = 0;

	while (rounds < maxRounds) {
		rounds++;

		// Context-window compactor (Claude-Code `wU2`): check BEFORE the
		// next LLM call whether the previous round's usage crossed the
		// threshold; if so, replace the middle of `messages` with a
		// compact summary. Fire at most once per loop run so a runaway
		// tool can't keep re-triggering.
		if (input.compactor && compactionsDone === 0) {
			const total = promptTokens + completionTokens;
			const cap = input.compactor.maxContextTokens;
			const threshold = input.compactor.threshold ?? 0.92;
			if (cap > 0 && total > 0 && total / cap >= threshold) {
				const compactResult = await input.compactor.compact(messages);
				if (compactResult.compactedTurns > 0) {
					messages.length = 0;
					for (const m of compactResult.messages) messages.push(m);
					compactionsDone++;
				}
			}
		}

		// Per-round reminder injection: ask the channel for transient
		// hints, wrap each in <reminder> tags, and prepend them as system
		// messages to THIS request only. Nothing gets pushed to `messages`
		// — the reminders are ephemeral steering, not conversation.
		let requestMessages: readonly ChatMessage[] = messages;
		if (input.reminderChannel) {
			const recentCalls = executedCalls.slice(-LOOP_STATE_RECENT_CALLS_WINDOW);
			const state: LoopState = {
				round: rounds,
				toolCallCount: executedCalls.length,
				usage: {
					promptTokens,
					completionTokens,
					totalTokens: promptTokens + completionTokens,
				},
				lastCall: executedCalls[executedCalls.length - 1],
				recentCalls,
				compactionsDone,
			};
			const reminders = input.reminderChannel(state);
			if (reminders.length > 0) {
				const reminderMessages: ChatMessage[] = reminders.map((text) => ({
					role: 'system',
					content: `<reminder>${text}</reminder>`,
				}));
				requestMessages = [...messages, ...reminderMessages];
			}
		}

		const response = await llm.complete({
			messages: requestMessages,
			tools: toolSpecs,
			model: input.model,
			temperature: input.temperature,
		});

		if (response.usage) {
			promptTokens += response.usage.promptTokens;
			completionTokens += response.usage.completionTokens;
		}

		// Append the assistant turn to history before we execute any
		// tools — the LLM needs to see its own prior tool_calls alongside
		// the tool-message results in the next turn.
		messages.push({
			role: 'assistant',
			content: response.content,
			toolCalls: response.toolCalls.length > 0 ? response.toolCalls : undefined,
		});

		if (response.toolCalls.length === 0) {
			summary = response.content;
			stopReason = response.finishReason === 'stop' ? 'assistant-stop' : 'no-tool-calls';
			break;
		}

		// Tool execution.
		//
		// Sequential by default. When the caller supplies `isParallelSafe`
		// and EVERY call in this round passes it, we dispatch in batches
		// of PARALLEL_TOOL_BATCH_SIZE via Promise.all. A single unsafe
		// call in the batch downgrades the whole round to sequential —
		// this preserves semantics for write-after-read chains without
		// pushing the decision onto the model.
		//
		// In both modes we append to `messages` in the LLM's original
		// call order, not completion order, so the debug-log stays linear.
		const calls = response.toolCalls;
		const parallelSafePredicate = input.isParallelSafe;
		const allParallelSafe =
			!!parallelSafePredicate &&
			calls.length > 1 &&
			calls.every((c) => parallelSafePredicate(c.name));

		if (allParallelSafe) {
			for (let i = 0; i < calls.length; i += PARALLEL_TOOL_BATCH_SIZE) {
				const batch = calls.slice(i, i + PARALLEL_TOOL_BATCH_SIZE);
				const results = await Promise.all(batch.map((call) => onToolCall(call)));
				for (let j = 0; j < batch.length; j++) {
					const call = batch[j];
					const result = results[j];
					executedCalls.push({ round: rounds, call, result });
					messages.push({
						role: 'tool',
						toolCallId: call.id,
						content: JSON.stringify({
							success: result.success,
							message: result.message,
							...(result.data !== undefined ? { data: result.data } : {}),
						}),
					});
				}
			}
		} else {
			for (const call of calls) {
				const result = await onToolCall(call);
				executedCalls.push({ round: rounds, call, result });
				messages.push({
					role: 'tool',
					toolCallId: call.id,
					content: JSON.stringify({
						success: result.success,
						message: result.message,
						...(result.data !== undefined ? { data: result.data } : {}),
					}),
				});
			}
		}

		// If the round limit is about to hit, surface it as the reason —
		// the outer consumer can mark the iteration as incomplete.
		if (rounds >= maxRounds) {
			stopReason = 'max-rounds';
			break;
		}
	}

	return {
		rounds,
		executedCalls,
		summary,
		stopReason,
		messages,
		usage: {
			promptTokens,
			completionTokens,
			totalTokens: promptTokens + completionTokens,
		},
	};
}