feat(shared-ai): runPlannerLoop + compact system prompt for function calling

Introduces the new planner pipeline that both the webapp runner and the
mana-ai tick will swap onto in the next commits. Additive for now —
the legacy buildPlannerPrompt + parsePlannerResponse stay exported so
callers can migrate one at a time; they get removed once the last
consumer is gone.

- planner/loop.ts — runPlannerLoop orchestrates a multi-turn chat
  against a caller-supplied LlmClient. Tool-calls from the LLM are
  handed to an onToolCall callback and their results are fed back as
  tool-messages. Parallel tool-calls in one turn execute sequentially
  to keep the message log linear for debugging. Stops on assistant
  stop, empty tool_calls, or a hard max-rounds ceiling (default 5).
  See the usage sketch after this list.
- planner/system-prompt.ts — new buildSystemPrompt. ~40-line German
  system frame, no tool listing (the SDK-level tools field carries
  the schemas now), no JSON format example, no "please return JSON"
  plea. User frame renders mission + linked inputs + last 3
  iteration summaries, same as before.
- Five test cases covering the loop: immediate stop, single tool
  call with result feedback, parallel calls execute in order, tool
  failures propagate as tool-messages the LLM can react to, and
  maxRounds ceiling fires with the right stopReason.
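
A rough usage sketch of the new pipeline (hedged: llmClient,
toolSchemas, executeTool, mission, resolvedInputs and agent are
caller-side placeholders, and the import specifier is illustrative,
not part of this commit):

    import { buildSystemPrompt, runPlannerLoop } from 'shared-ai';

    // 1. Render the compact system/user frames from the mission state.
    const { systemPrompt, userPrompt } = buildSystemPrompt({
      mission,
      resolvedInputs,
      agentSystemPrompt: agent?.systemPrompt ?? null,
      agentMemory: agent?.memory ?? null,
    });

    // 2. Drive the tool-calling loop against any LlmClient implementation.
    const result = await runPlannerLoop({
      llm: llmClient, // e.g. an OpenAI-compatible chat-completions adapter
      input: {
        systemPrompt,
        userPrompt,
        tools: toolSchemas,           // ToolSchema[] from the tool registry
        model: 'provider/model-name', // placeholder model id
        maxRounds: 5,
      },
      // Must catch errors and return { success: false, message }
      // instead of throwing (see the doc comment in loop.ts).
      onToolCall: executeTool,
    });

    // result.stopReason: 'assistant-stop' | 'no-tool-calls' | 'max-rounds' | 'llm-error'
    // result.messages: full chat history for debug-log capture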

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Till JS 2026-04-20 15:31:01 +02:00
parent 2cf89ce26a
commit 4daca8970b
5 changed files with 537 additions and 1 deletion


@@ -60,12 +60,31 @@ export type {
AiPlanInput,
AiPlanOutput,
AvailableTool,
ChatMessage,
ChatRole,
ExecutedCall,
LlmClient,
LlmCompletionRequest,
LlmCompletionResponse,
LlmFinishReason,
LoopStopReason,
ParseResult,
PlannedStep,
PlannerLoopInput,
PlannerLoopResult,
PlannerMessages,
ResolvedInput,
SystemPromptInput,
SystemPromptOutput,
ToolCallRequest,
ToolResult,
} from './planner';
export {
buildPlannerPrompt,
buildSystemPrompt,
parsePlannerResponse,
runPlannerLoop,
} from './planner';
export { buildPlannerPrompt, parsePlannerResponse } from './planner';
export {
AI_PROPOSABLE_TOOL_NAMES,


@@ -3,3 +3,24 @@ export type { PlannerMessages } from './prompt';
export { parsePlannerResponse } from './parser';
export type { ParseResult } from './parser';
export type { AiPlanInput, AiPlanOutput, AvailableTool, PlannedStep, ResolvedInput } from './types';
// New function-calling pipeline (replaces the text-JSON planner above
// in Commits 5/6). Additive for now so the old and new callers can
// coexist within the atomic PR.
export { buildSystemPrompt } from './system-prompt';
export type { SystemPromptInput, SystemPromptOutput } from './system-prompt';
export { runPlannerLoop } from './loop';
export type {
ChatMessage,
ChatRole,
ExecutedCall,
LlmClient,
LlmCompletionRequest,
LlmCompletionResponse,
LlmFinishReason,
LoopStopReason,
PlannerLoopInput,
PlannerLoopResult,
ToolCallRequest,
ToolResult,
} from './loop';


@@ -0,0 +1,200 @@
import { describe, expect, it, vi } from 'vitest';
import {
runPlannerLoop,
type ChatMessage,
type LlmClient,
type LlmCompletionResponse,
type ToolCallRequest,
type ToolResult,
} from './loop';
import type { ToolSchema } from '../tools/schemas';
/**
* Scriptable mock LLM: each ``enqueue*`` call pushes one planned
* response onto a FIFO. The loop pulls responses in order. If the loop
* asks for more turns than we enqueued, the test fails loudly rather
* than hanging.
*/
class MockLlm implements LlmClient {
private queue: LlmCompletionResponse[] = [];
public calls: Array<{ messages: readonly ChatMessage[]; toolNames: string[] }> = [];
enqueueToolCalls(calls: Array<{ name: string; args: Record<string, unknown> }>): this {
this.queue.push({
content: null,
toolCalls: calls.map((c, i) => ({
id: `call_${this.queue.length}_${i}`,
name: c.name,
arguments: c.args,
})),
finishReason: 'tool_calls',
});
return this;
}
enqueueStop(content: string | null = null): this {
this.queue.push({ content, toolCalls: [], finishReason: 'stop' });
return this;
}
async complete(req: {
messages: readonly ChatMessage[];
tools: readonly unknown[];
}): Promise<LlmCompletionResponse> {
// Snapshot at call time — the loop mutates the same array afterwards,
// and we want to assert the state the LLM actually saw.
this.calls.push({
messages: [...req.messages],
toolNames: (req.tools as Array<{ function: { name: string } }>).map((t) => t.function.name),
});
const next = this.queue.shift();
if (!next) throw new Error('MockLlm: no more responses enqueued');
return next;
}
}
const tools: ToolSchema[] = [
{
name: 'list_things',
module: 'test',
description: 'list things',
defaultPolicy: 'auto',
parameters: [],
},
{
name: 'create_thing',
module: 'test',
description: 'create a thing',
defaultPolicy: 'propose',
parameters: [{ name: 'title', type: 'string', description: 'title', required: true }],
},
];
describe('runPlannerLoop', () => {
it('stops immediately when the LLM emits no tool_calls', async () => {
const llm = new MockLlm().enqueueStop('done');
const onToolCall = vi.fn();
const result = await runPlannerLoop({
llm,
input: {
systemPrompt: 's',
userPrompt: 'u',
tools,
model: 'test/model',
},
onToolCall,
});
expect(result.rounds).toBe(1);
expect(result.executedCalls).toHaveLength(0);
expect(result.summary).toBe('done');
expect(result.stopReason).toBe('assistant-stop');
expect(onToolCall).not.toHaveBeenCalled();
});
it('executes a single tool call and feeds the result back', async () => {
const llm = new MockLlm()
.enqueueToolCalls([{ name: 'list_things', args: {} }])
.enqueueStop('all done');
const onToolCall = vi.fn(
async (_call: ToolCallRequest): Promise<ToolResult> => ({
success: true,
data: ['a', 'b'],
message: '2 things',
})
);
const result = await runPlannerLoop({
llm,
input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
onToolCall,
});
expect(result.rounds).toBe(2);
expect(result.executedCalls).toHaveLength(1);
expect(result.executedCalls[0].call.name).toBe('list_things');
expect(result.summary).toBe('all done');
expect(result.stopReason).toBe('assistant-stop');
// Second LLM call must have seen the tool result in its messages.
expect(llm.calls[1].messages).toHaveLength(4); // system + user + assistant + tool
const toolMsg = llm.calls[1].messages[3];
expect(toolMsg.role).toBe('tool');
expect(toolMsg.content).toContain('2 things');
});
it('executes parallel tool calls sequentially', async () => {
const llm = new MockLlm()
.enqueueToolCalls([
{ name: 'create_thing', args: { title: 'a' } },
{ name: 'create_thing', args: { title: 'b' } },
{ name: 'create_thing', args: { title: 'c' } },
])
.enqueueStop();
const executedInOrder: string[] = [];
const onToolCall = async (call: ToolCallRequest): Promise<ToolResult> => {
executedInOrder.push(call.arguments.title as string);
return { success: true, message: 'ok' };
};
const result = await runPlannerLoop({
llm,
input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
onToolCall,
});
expect(executedInOrder).toEqual(['a', 'b', 'c']);
expect(result.executedCalls).toHaveLength(3);
});
it('propagates tool failures as tool-messages (LLM can react)', async () => {
const llm = new MockLlm()
.enqueueToolCalls([{ name: 'list_things', args: {} }])
.enqueueStop('ack');
const onToolCall = async (): Promise<ToolResult> => ({
success: false,
message: 'db locked',
});
const result = await runPlannerLoop({
llm,
input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
onToolCall,
});
const toolMsg = llm.calls[1].messages[3];
expect(toolMsg.content).toContain('db locked');
expect(toolMsg.content).toContain('"success":false');
expect(result.executedCalls[0].result.success).toBe(false);
});
it('honours the maxRounds ceiling', async () => {
const llm = new MockLlm();
// Seed enough tool-call turns to exceed the cap
for (let i = 0; i < 10; i++) {
llm.enqueueToolCalls([{ name: 'list_things', args: {} }]);
}
const onToolCall = async (): Promise<ToolResult> => ({
success: true,
message: 'ok',
});
const result = await runPlannerLoop({
llm,
input: {
systemPrompt: 's',
userPrompt: 'u',
tools,
model: 'm',
maxRounds: 3,
},
onToolCall,
});
expect(result.rounds).toBe(3);
expect(result.stopReason).toBe('max-rounds');
expect(result.executedCalls).toHaveLength(3);
});
});


@@ -0,0 +1,179 @@
/**
* Multi-turn tool-calling loop shared between the webapp runner and the
* server-side mana-ai tick. Replaces the text-JSON planner pipeline:
* we hand the LLM a tool catalog, it emits native tool_calls, we
* execute them and feed the results back as tool-messages until the
* LLM has nothing more to call (or we hit the round budget).
*
* Environment-specific concerns (HTTP transport, auth, actor
* attribution) live in the caller-provided ``LlmClient`` and
* ``onToolCall`` callback. The loop itself stays pure.
*/
import type { ToolSchema, ToolSpec } from '../tools/function-schema';
import { toolsToFunctionSchemas } from '../tools/function-schema';
// ─── Chat-message contract ──────────────────────────────────────────
export interface ToolCallRequest {
readonly id: string;
readonly name: string;
readonly arguments: Record<string, unknown>;
}
export interface ToolResult {
readonly success: boolean;
readonly data?: unknown;
readonly message: string;
}
export type ChatRole = 'system' | 'user' | 'assistant' | 'tool';
export interface ChatMessage {
readonly role: ChatRole;
readonly content?: string | null;
readonly toolCalls?: readonly ToolCallRequest[];
readonly toolCallId?: string;
}
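// A sketch of what the history typically looks like after one tool
// round, as the loop assembles it (contents abbreviated):
//   { role: 'system', content: systemPrompt }
//   { role: 'user', content: userPrompt }
//   { role: 'assistant', content: null, toolCalls: [{ id: 'call_0', name: 'list_things', arguments: {} }] }
//   { role: 'tool', toolCallId: 'call_0', content: '{"success":true,"message":"2 things","data":["a","b"]}' }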
// ─── LLM client contract ────────────────────────────────────────────
export interface LlmCompletionRequest {
readonly messages: readonly ChatMessage[];
readonly tools: readonly ToolSpec[];
readonly model: string;
readonly temperature?: number;
}
export type LlmFinishReason = 'stop' | 'tool_calls' | 'length' | 'content_filter';
export interface LlmCompletionResponse {
readonly content: string | null;
readonly toolCalls: readonly ToolCallRequest[];
readonly finishReason: LlmFinishReason;
}
export interface LlmClient {
complete(req: LlmCompletionRequest): Promise<LlmCompletionResponse>;
}
// ─── Loop input / result ────────────────────────────────────────────
export interface PlannerLoopInput {
readonly systemPrompt: string;
readonly userPrompt: string;
readonly tools: readonly ToolSchema[];
readonly model: string;
readonly temperature?: number;
/** Hard ceiling on planner rounds. Each round = one LLM call plus
* whatever tool executions its output triggered. Defaults to 5. */
readonly maxRounds?: number;
}
export interface ExecutedCall {
readonly round: number;
readonly call: ToolCallRequest;
readonly result: ToolResult;
}
export type LoopStopReason = 'assistant-stop' | 'max-rounds' | 'no-tool-calls' | 'llm-error';
export interface PlannerLoopResult {
readonly rounds: number;
readonly executedCalls: readonly ExecutedCall[];
/** Final assistant text when the LLM stopped instead of calling a
* tool. ``null`` when the last turn was a tool-call burst that we
* cut off via round budget. */
readonly summary: string | null;
readonly stopReason: LoopStopReason;
/** Complete chat history for debug-log capture (system + user +
* every assistant/tool turn). Never synced; it contains decrypted
* user content. */
readonly messages: readonly ChatMessage[];
}
// ─── The loop ───────────────────────────────────────────────────────
const DEFAULT_MAX_ROUNDS = 5;
export async function runPlannerLoop(opts: {
readonly llm: LlmClient;
readonly input: PlannerLoopInput;
/** Execute a tool call and return the result that should be fed back
* to the LLM as a tool-message. Must not throw; convert errors to
* ``{ success: false, message }``. The loop injects the result
* verbatim so the LLM can reason over failures (e.g. "vault locked"
* → ask the user to unlock). */
readonly onToolCall: (call: ToolCallRequest) => Promise<ToolResult>;
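// A conforming callback might look like this (sketch; dispatchTool is
// a hypothetical caller-owned dispatcher, not part of this module):
//
//   const onToolCall = async (call: ToolCallRequest): Promise<ToolResult> => {
//     try {
//       const data = await dispatchTool(call.name, call.arguments);
//       return { success: true, data, message: 'ok' };
//     } catch (err) {
//       return { success: false, message: err instanceof Error ? err.message : String(err) };
//     }
//   };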
}): Promise<PlannerLoopResult> {
const { llm, input, onToolCall } = opts;
const maxRounds = input.maxRounds ?? DEFAULT_MAX_ROUNDS;
const toolSpecs = toolsToFunctionSchemas(input.tools);
const messages: ChatMessage[] = [
{ role: 'system', content: input.systemPrompt },
{ role: 'user', content: input.userPrompt },
];
const executedCalls: ExecutedCall[] = [];
let summary: string | null = null;
let stopReason: LoopStopReason = 'max-rounds';
let rounds = 0;
while (rounds < maxRounds) {
rounds++;
const response = await llm.complete({
messages,
tools: toolSpecs,
model: input.model,
temperature: input.temperature,
});
// Append the assistant turn to history before we execute any
// tools — the LLM needs to see its own prior tool_calls alongside
// the tool-message results in the next turn.
messages.push({
role: 'assistant',
content: response.content,
toolCalls: response.toolCalls.length > 0 ? response.toolCalls : undefined,
});
if (response.toolCalls.length === 0) {
summary = response.content;
stopReason = response.finishReason === 'stop' ? 'assistant-stop' : 'no-tool-calls';
break;
}
// Execute each tool_call sequentially. Parallel execution is a
// perfectly valid optimisation for pure-read tools but we keep
// order here so the message log tells a linear story when the
// user debugs a failure.
for (const call of response.toolCalls) {
const result = await onToolCall(call);
executedCalls.push({ round: rounds, call, result });
messages.push({
role: 'tool',
toolCallId: call.id,
content: JSON.stringify({
success: result.success,
message: result.message,
...(result.data !== undefined ? { data: result.data } : {}),
}),
});
}
// If the round budget is now exhausted, surface it as the stop
// reason — the outer consumer can mark the iteration as incomplete.
if (rounds >= maxRounds) {
stopReason = 'max-rounds';
break;
}
}
return {
rounds,
executedCalls,
summary,
stopReason,
messages,
};
}


@@ -0,0 +1,117 @@
/**
* System-prompt builder for the function-calling planner.
*
* Radically smaller than the pre-migration text-JSON prompt: no tool
* listing (the LLM gets schemas via the native ``tools`` request
* field), no format example (the SDK enforces structured tool_calls),
* no "please return JSON" plea. We just tell the LLM what its job is,
* how to behave in a reasoning loop, and hand over control.
*
* The rendered prompt is ~400 tokens compared to the previous
* ~6000–8000: big savings on cost and, more importantly, on the
* signal-to-noise ratio the model has to filter.
*/
import type { Mission } from '../missions/types';
import type { ResolvedInput } from './types';
export interface SystemPromptInput {
readonly mission: Mission;
readonly resolvedInputs: readonly ResolvedInput[];
/** When set, included verbatim as the agent's persona frame. */
readonly agentSystemPrompt?: string | null;
/** When set, appended as the agent's persistent memory. */
readonly agentMemory?: string | null;
}
export interface SystemPromptOutput {
readonly systemPrompt: string;
readonly userPrompt: string;
}
export function buildSystemPrompt(input: SystemPromptInput): SystemPromptOutput {
const systemPrompt = buildSystemFrame(input);
const userPrompt = buildUserFrame(input);
return { systemPrompt, userPrompt };
}
function buildSystemFrame(input: SystemPromptInput): string {
const agentBlock = renderAgentContext(input);
return [
'Du arbeitest im Auftrag des Nutzers an einer langlebigen Mission.',
'',
'Dein Vorgehen:',
'1. Lies zuerst (Read-Tools liefern dir sofort Ergebnisse) — verstehe den Zustand, bevor du schreibst.',
'2. Führe anschließend die notwendigen Schreib-Tools aus, um das konkrete Ziel umzusetzen.',
'3. Wiederhole bis zu 5 Planungsrunden: nach jedem Tool-Aufruf bekommst du das Ergebnis zurück und kannst daraus den nächsten Schritt ableiten.',
'4. Stoppe, wenn das Ziel erreicht ist oder kein sinnvoller nächster Schritt bleibt.',
'5. Berücksichtige Feedback aus früheren Iterationen — wiederhole keinen Schritt, der zuvor fehlgeschlagen ist, ohne ihn zu ändern.',
'',
'Wichtig:',
'- Nutze ausschließlich die Tools, die dir als Function-Calls bereitgestellt werden. Nennungen in Prosa werden ignoriert.',
'- Wenn mehrere unabhängige Aktionen anstehen (z. B. "erstelle 8 Fragen"), gib sie in einem einzigen Turn als parallele Tool-Calls aus — das spart Runden.',
'- Wenn ein Tool einen Fehler zurückgibt, reagiere darauf (anderes Tool probieren oder stoppen) — ignoriere Fehler nicht.',
agentBlock,
]
.filter(Boolean)
.join('\n');
}
function renderAgentContext(input: SystemPromptInput): string {
const parts: string[] = [];
if (input.agentSystemPrompt?.trim()) {
parts.push('\n<agent_persona>');
parts.push(input.agentSystemPrompt.trim());
parts.push('</agent_persona>');
}
if (input.agentMemory?.trim()) {
parts.push('\n<agent_memory>');
parts.push(input.agentMemory.trim());
parts.push('</agent_memory>');
}
return parts.join('\n');
}
function buildUserFrame(input: SystemPromptInput): string {
const { mission, resolvedInputs } = input;
const inputsBlock =
resolvedInputs.length === 0
? '_(keine verlinkten Inputs)_'
: resolvedInputs
.map((r) => `### ${r.module}/${r.table}: ${r.title ?? r.id}\n${r.content}`)
.join('\n\n');
const iterationHistory =
mission.iterations.length === 0
? '_(erste Iteration)_'
: mission.iterations
.slice(-3)
.map((it) => {
const steps = it.plan.map((s) => ` - [${s.status}] ${s.summary}`).join('\n');
const feedback = it.userFeedback ? `\n Nutzer-Feedback: ${it.userFeedback}` : '';
const summary = it.summary ? `\n Summary: ${it.summary}` : '';
return `**${it.startedAt}** (${it.overallStatus}):${summary}\n${steps}${feedback}`;
})
.join('\n\n');
return [
`# Mission: ${mission.title}`,
'',
'## Konzept',
mission.conceptMarkdown || '_(leer)_',
'',
'## Konkretes Ziel',
mission.objective || '_(nicht gesetzt)_',
'',
'## Verlinkte Inputs',
inputsBlock,
'',
'## Letzte Iterationen (max. 3)',
iterationHistory,
'',
'---',
'',
'Beginne jetzt mit der nächsten Iteration. Rufe die nötigen Tools auf.',
].join('\n');
}