From 4daca8970b8926e551a1bc640dc484c11da9db5e Mon Sep 17 00:00:00 2001 From: Till JS Date: Mon, 20 Apr 2026 15:31:01 +0200 Subject: [PATCH] feat(shared-ai): runPlannerLoop + compact system prompt for function calling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces the new planner pipeline both the webapp runner and the mana-ai tick will swap onto in the next commits. Additive for now — the legacy buildPlannerPrompt + parsePlannerResponse stay exported so callers can migrate one at a time; they get removed once the last consumer is gone. - planner/loop.ts — runPlannerLoop orchestrates a multi-turn chat against a caller-supplied LlmClient. Tool-calls from the LLM are handed to an onToolCall callback and their results fed back as tool-messages. Parallel tool-calls in one turn execute sequentially to keep the message log linear for debugging. Stops on assistant stop, empty tool_calls, or a hard max-rounds ceiling (default 5). - planner/system-prompt.ts — new buildSystemPrompt. ~40-line German system frame, no tool listing (the SDK-level tools field carries the schemas now), no JSON format example, no "please return JSON" plea. User frame renders mission + linked inputs + last 3 iteration summaries, same as before. - Five test cases covering the loop: immediate stop, single tool call with result feedback, parallel calls execute in order, tool failures propagate as tool-messages the LLM can react to, and maxRounds ceiling fires with the right stopReason. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/shared-ai/src/index.ts | 21 +- packages/shared-ai/src/planner/index.ts | 21 ++ packages/shared-ai/src/planner/loop.test.ts | 200 ++++++++++++++++++ packages/shared-ai/src/planner/loop.ts | 179 ++++++++++++++++ .../shared-ai/src/planner/system-prompt.ts | 117 ++++++++++ 5 files changed, 537 insertions(+), 1 deletion(-) create mode 100644 packages/shared-ai/src/planner/loop.test.ts create mode 100644 packages/shared-ai/src/planner/loop.ts create mode 100644 packages/shared-ai/src/planner/system-prompt.ts diff --git a/packages/shared-ai/src/index.ts b/packages/shared-ai/src/index.ts index 310b06285..06dd5e2ad 100644 --- a/packages/shared-ai/src/index.ts +++ b/packages/shared-ai/src/index.ts @@ -60,12 +60,31 @@ export type { AiPlanInput, AiPlanOutput, AvailableTool, + ChatMessage, + ChatRole, + ExecutedCall, + LlmClient, + LlmCompletionRequest, + LlmCompletionResponse, + LlmFinishReason, + LoopStopReason, ParseResult, PlannedStep, + PlannerLoopInput, + PlannerLoopResult, PlannerMessages, ResolvedInput, + SystemPromptInput, + SystemPromptOutput, + ToolCallRequest, + ToolResult, +} from './planner'; +export { + buildPlannerPrompt, + buildSystemPrompt, + parsePlannerResponse, + runPlannerLoop, } from './planner'; -export { buildPlannerPrompt, parsePlannerResponse } from './planner'; export { AI_PROPOSABLE_TOOL_NAMES, diff --git a/packages/shared-ai/src/planner/index.ts b/packages/shared-ai/src/planner/index.ts index 5f70819f8..5e0924cd3 100644 --- a/packages/shared-ai/src/planner/index.ts +++ b/packages/shared-ai/src/planner/index.ts @@ -3,3 +3,24 @@ export type { PlannerMessages } from './prompt'; export { parsePlannerResponse } from './parser'; export type { ParseResult } from './parser'; export type { AiPlanInput, AiPlanOutput, AvailableTool, PlannedStep, ResolvedInput } from './types'; + +// New function-calling pipeline (replaces the text-JSON planner above +// in Commits 5/6). 
Additive for now so the old and new callers can +// coexist within the atomic PR. +export { buildSystemPrompt } from './system-prompt'; +export type { SystemPromptInput, SystemPromptOutput } from './system-prompt'; +export { runPlannerLoop } from './loop'; +export type { + ChatMessage, + ChatRole, + ExecutedCall, + LlmClient, + LlmCompletionRequest, + LlmCompletionResponse, + LlmFinishReason, + LoopStopReason, + PlannerLoopInput, + PlannerLoopResult, + ToolCallRequest, + ToolResult, +} from './loop'; diff --git a/packages/shared-ai/src/planner/loop.test.ts b/packages/shared-ai/src/planner/loop.test.ts new file mode 100644 index 000000000..0cbd45d08 --- /dev/null +++ b/packages/shared-ai/src/planner/loop.test.ts @@ -0,0 +1,200 @@ +import { describe, expect, it, vi } from 'vitest'; +import { + runPlannerLoop, + type ChatMessage, + type LlmClient, + type LlmCompletionResponse, + type ToolCallRequest, + type ToolResult, +} from './loop'; +import type { ToolSchema } from '../tools/schemas'; + +/** + * Scriptable mock LLM — each ``enqueue*`` call pushes one planned + * response onto a FIFO. The loop pulls responses in order. If the loop + * asks for more turns than we enqueued, the test fails loudly rather + * than hanging. + */ +class MockLlm implements LlmClient { + private queue: LlmCompletionResponse[] = []; + public calls: Array<{ messages: readonly ChatMessage[]; toolNames: string[] }> = []; + + enqueueToolCalls(calls: Array<{ name: string; args: Record }>): this { + this.queue.push({ + content: null, + toolCalls: calls.map((c, i) => ({ + id: `call_${this.queue.length}_${i}`, + name: c.name, + arguments: c.args, + })), + finishReason: 'tool_calls', + }); + return this; + } + + enqueueStop(content: string | null = null): this { + this.queue.push({ content, toolCalls: [], finishReason: 'stop' }); + return this; + } + + async complete(req: { + messages: readonly ChatMessage[]; + tools: readonly unknown[]; + }): Promise { + // Snapshot at call time — the loop mutates the same array after, + // and we want to assert the state the LLM actually saw. 
+ this.calls.push({ + messages: [...req.messages], + toolNames: (req.tools as Array<{ function: { name: string } }>).map((t) => t.function.name), + }); + const next = this.queue.shift(); + if (!next) throw new Error('MockLlm: no more responses enqueued'); + return next; + } +} + +const tools: ToolSchema[] = [ + { + name: 'list_things', + module: 'test', + description: 'list things', + defaultPolicy: 'auto', + parameters: [], + }, + { + name: 'create_thing', + module: 'test', + description: 'create a thing', + defaultPolicy: 'propose', + parameters: [{ name: 'title', type: 'string', description: 'title', required: true }], + }, +]; + +describe('runPlannerLoop', () => { + it('stops immediately when the LLM emits no tool_calls', async () => { + const llm = new MockLlm().enqueueStop('done'); + const onToolCall = vi.fn(); + const result = await runPlannerLoop({ + llm, + input: { + systemPrompt: 's', + userPrompt: 'u', + tools, + model: 'test/model', + }, + onToolCall, + }); + expect(result.rounds).toBe(1); + expect(result.executedCalls).toHaveLength(0); + expect(result.summary).toBe('done'); + expect(result.stopReason).toBe('assistant-stop'); + expect(onToolCall).not.toHaveBeenCalled(); + }); + + it('executes a single tool call and feeds the result back', async () => { + const llm = new MockLlm() + .enqueueToolCalls([{ name: 'list_things', args: {} }]) + .enqueueStop('all done'); + + const onToolCall = vi.fn( + async (_call: ToolCallRequest): Promise => ({ + success: true, + data: ['a', 'b'], + message: '2 things', + }) + ); + + const result = await runPlannerLoop({ + llm, + input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' }, + onToolCall, + }); + + expect(result.rounds).toBe(2); + expect(result.executedCalls).toHaveLength(1); + expect(result.executedCalls[0].call.name).toBe('list_things'); + expect(result.summary).toBe('all done'); + expect(result.stopReason).toBe('assistant-stop'); + + // Second LLM call must have seen the tool result in its messages. 
+ expect(llm.calls[1].messages).toHaveLength(4); // system + user + assistant + tool + const toolMsg = llm.calls[1].messages[3]; + expect(toolMsg.role).toBe('tool'); + expect(toolMsg.content).toContain('2 things'); + }); + + it('executes parallel tool calls sequentially', async () => { + const llm = new MockLlm() + .enqueueToolCalls([ + { name: 'create_thing', args: { title: 'a' } }, + { name: 'create_thing', args: { title: 'b' } }, + { name: 'create_thing', args: { title: 'c' } }, + ]) + .enqueueStop(); + + const executedInOrder: string[] = []; + const onToolCall = async (call: ToolCallRequest): Promise => { + executedInOrder.push(call.arguments.title as string); + return { success: true, message: 'ok' }; + }; + + const result = await runPlannerLoop({ + llm, + input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' }, + onToolCall, + }); + + expect(executedInOrder).toEqual(['a', 'b', 'c']); + expect(result.executedCalls).toHaveLength(3); + }); + + it('propagates tool failures as tool-messages (LLM can react)', async () => { + const llm = new MockLlm() + .enqueueToolCalls([{ name: 'list_things', args: {} }]) + .enqueueStop('ack'); + + const onToolCall = async (): Promise => ({ + success: false, + message: 'db locked', + }); + + const result = await runPlannerLoop({ + llm, + input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' }, + onToolCall, + }); + + const toolMsg = llm.calls[1].messages[3]; + expect(toolMsg.content).toContain('db locked'); + expect(toolMsg.content).toContain('"success":false'); + expect(result.executedCalls[0].result.success).toBe(false); + }); + + it('honours the maxRounds ceiling', async () => { + const llm = new MockLlm(); + // Seed enough tool-call turns to exceed the cap + for (let i = 0; i < 10; i++) { + llm.enqueueToolCalls([{ name: 'list_things', args: {} }]); + } + const onToolCall = async (): Promise => ({ + success: true, + message: 'ok', + }); + + const result = await runPlannerLoop({ + llm, + input: { + systemPrompt: 's', + userPrompt: 'u', + tools, + model: 'm', + maxRounds: 3, + }, + onToolCall, + }); + + expect(result.rounds).toBe(3); + expect(result.stopReason).toBe('max-rounds'); + expect(result.executedCalls).toHaveLength(3); + }); +}); diff --git a/packages/shared-ai/src/planner/loop.ts b/packages/shared-ai/src/planner/loop.ts new file mode 100644 index 000000000..0a12241e2 --- /dev/null +++ b/packages/shared-ai/src/planner/loop.ts @@ -0,0 +1,179 @@ +/** + * Multi-turn tool-calling loop shared between the webapp runner and the + * server-side mana-ai tick. Replaces the text-JSON planner pipeline: + * we hand the LLM a tool catalog, it emits native tool_calls, we + * execute them and feed the results back as tool-messages until the + * LLM has nothing more to call (or we hit the round budget). + * + * Environment-specific concerns (HTTP transport, auth, actor + * attribution) live in the caller-provided ``LlmClient`` and + * ``onToolCall`` callback. The loop itself stays pure. 
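+ *
+ * Illustrative wiring only (``openRouterClient`` and ``executeTool`` are
+ * placeholders for the caller's own LlmClient implementation and tool
+ * executor, not exports of this package; ``systemPrompt``, ``userPrompt``
+ * and ``tools`` are assumed to come from the caller, e.g. via
+ * buildSystemPrompt):
+ *
+ * @example
+ *   const result = await runPlannerLoop({
+ *     llm: openRouterClient,
+ *     input: { systemPrompt, userPrompt, tools, model: 'provider/model-id' },
+ *     onToolCall: async (call) => {
+ *       try {
+ *         return await executeTool(call.name, call.arguments);
+ *       } catch (err) {
+ *         // onToolCall must not throw; report failures as results instead
+ *         return { success: false, message: err instanceof Error ? err.message : String(err) };
+ *       }
+ *     },
+ *   });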
+ */
+
+import type { ToolSchema, ToolSpec } from '../tools/function-schema';
+import { toolsToFunctionSchemas } from '../tools/function-schema';
+
+// ─── Chat-message contract ──────────────────────────────────────────
+
+export interface ToolCallRequest {
+  readonly id: string;
+  readonly name: string;
+  readonly arguments: Record<string, unknown>;
+}
+
+export interface ToolResult {
+  readonly success: boolean;
+  readonly data?: unknown;
+  readonly message: string;
+}
+
+export type ChatRole = 'system' | 'user' | 'assistant' | 'tool';
+
+export interface ChatMessage {
+  readonly role: ChatRole;
+  readonly content?: string | null;
+  readonly toolCalls?: readonly ToolCallRequest[];
+  readonly toolCallId?: string;
+}
+
+// ─── LLM client contract ────────────────────────────────────────────
+
+export interface LlmCompletionRequest {
+  readonly messages: readonly ChatMessage[];
+  readonly tools: readonly ToolSpec[];
+  readonly model: string;
+  readonly temperature?: number;
+}
+
+export type LlmFinishReason = 'stop' | 'tool_calls' | 'length' | 'content_filter';
+
+export interface LlmCompletionResponse {
+  readonly content: string | null;
+  readonly toolCalls: readonly ToolCallRequest[];
+  readonly finishReason: LlmFinishReason;
+}
+
+export interface LlmClient {
+  complete(req: LlmCompletionRequest): Promise<LlmCompletionResponse>;
+}
+
+// ─── Loop input / result ────────────────────────────────────────────
+
+export interface PlannerLoopInput {
+  readonly systemPrompt: string;
+  readonly userPrompt: string;
+  readonly tools: readonly ToolSchema[];
+  readonly model: string;
+  readonly temperature?: number;
+  /** Hard ceiling on planner rounds. Each round = one LLM call plus
+   * whatever tool executions its output triggered. Defaults to 5. */
+  readonly maxRounds?: number;
+}
+
+export interface ExecutedCall {
+  readonly round: number;
+  readonly call: ToolCallRequest;
+  readonly result: ToolResult;
+}
+
+export type LoopStopReason = 'assistant-stop' | 'max-rounds' | 'no-tool-calls' | 'llm-error';
+
+export interface PlannerLoopResult {
+  readonly rounds: number;
+  readonly executedCalls: readonly ExecutedCall[];
+  /** Final assistant text when the LLM stopped instead of calling a
+   * tool. ``null`` when the last turn was a tool-call burst that we
+   * cut off via round budget. */
+  readonly summary: string | null;
+  readonly stopReason: LoopStopReason;
+  /** Complete chat history for debug-log capture (system + user +
+   * every assistant/tool turn). Never synced — contains decrypted
+   * user content. */
+  readonly messages: readonly ChatMessage[];
+}
+
+// ─── The loop ───────────────────────────────────────────────────────
+
+const DEFAULT_MAX_ROUNDS = 5;
+
+export async function runPlannerLoop(opts: {
+  readonly llm: LlmClient;
+  readonly input: PlannerLoopInput;
+  /** Execute a tool call and return the result that should be fed back
+   * to the LLM as a tool-message. Must not throw — convert errors to
+   * ``{ success: false, message }``. The loop injects the result
+   * verbatim so the LLM can reason over failures (e.g. "vault locked
+   * → ask user to unlock"). */
+  readonly onToolCall: (call: ToolCallRequest) => Promise<ToolResult>;
+}): Promise<PlannerLoopResult> {
+  const { llm, input, onToolCall } = opts;
+  const maxRounds = input.maxRounds ??
DEFAULT_MAX_ROUNDS; + const toolSpecs = toolsToFunctionSchemas(input.tools); + + const messages: ChatMessage[] = [ + { role: 'system', content: input.systemPrompt }, + { role: 'user', content: input.userPrompt }, + ]; + const executedCalls: ExecutedCall[] = []; + let summary: string | null = null; + let stopReason: LoopStopReason = 'max-rounds'; + let rounds = 0; + + while (rounds < maxRounds) { + rounds++; + const response = await llm.complete({ + messages, + tools: toolSpecs, + model: input.model, + temperature: input.temperature, + }); + + // Append the assistant turn to history before we execute any + // tools — the LLM needs to see its own prior tool_calls alongside + // the tool-message results in the next turn. + messages.push({ + role: 'assistant', + content: response.content, + toolCalls: response.toolCalls.length > 0 ? response.toolCalls : undefined, + }); + + if (response.toolCalls.length === 0) { + summary = response.content; + stopReason = response.finishReason === 'stop' ? 'assistant-stop' : 'no-tool-calls'; + break; + } + + // Execute each tool_call sequentially. Parallel execution is a + // perfectly valid optimisation for pure-read tools but we keep + // order here so the message log tells a linear story when the + // user debugs a failure. + for (const call of response.toolCalls) { + const result = await onToolCall(call); + executedCalls.push({ round: rounds, call, result }); + messages.push({ + role: 'tool', + toolCallId: call.id, + content: JSON.stringify({ + success: result.success, + message: result.message, + ...(result.data !== undefined ? { data: result.data } : {}), + }), + }); + } + + // If the round limit is about to hit, surface it as the reason — + // the outer consumer can mark the iteration as incomplete. + if (rounds >= maxRounds) { + stopReason = 'max-rounds'; + break; + } + } + + return { + rounds, + executedCalls, + summary, + stopReason, + messages, + }; +} diff --git a/packages/shared-ai/src/planner/system-prompt.ts b/packages/shared-ai/src/planner/system-prompt.ts new file mode 100644 index 000000000..67a3a7770 --- /dev/null +++ b/packages/shared-ai/src/planner/system-prompt.ts @@ -0,0 +1,117 @@ +/** + * System-prompt builder for the function-calling planner. + * + * Radically smaller than the pre-migration text-JSON prompt: no tool + * listing (the LLM gets schemas via the native ``tools`` request + * field), no format example (the SDK enforces structured tool_calls), + * no "please return JSON" plea. We just tell the LLM what its job is, + * how to behave in a reasoning loop, and hand over control. + * + * The rendered prompt is ~400 tokens compared to the previous + * ~6000–8000 — big savings on cost and, more importantly, on the + * signal-to-noise ratio the model has to filter. + */ + +import type { Mission } from '../missions/types'; +import type { ResolvedInput } from './types'; + +export interface SystemPromptInput { + readonly mission: Mission; + readonly resolvedInputs: readonly ResolvedInput[]; + /** When set, included verbatim as the agent's persona frame. */ + readonly agentSystemPrompt?: string | null; + /** When set, appended as the agent's persistent memory. 
*/ + readonly agentMemory?: string | null; +} + +export interface SystemPromptOutput { + readonly systemPrompt: string; + readonly userPrompt: string; +} + +export function buildSystemPrompt(input: SystemPromptInput): SystemPromptOutput { + const systemPrompt = buildSystemFrame(input); + const userPrompt = buildUserFrame(input); + return { systemPrompt, userPrompt }; +} + +function buildSystemFrame(input: SystemPromptInput): string { + const agentBlock = renderAgentContext(input); + return [ + 'Du arbeitest im Auftrag des Nutzers an einer langlebigen Mission.', + '', + 'Dein Vorgehen:', + '1. Lies zuerst (Read-Tools liefern dir sofort Ergebnisse) — verstehe den Zustand, bevor du schreibst.', + '2. Führe anschließend die notwendigen Schreib-Tools aus, um das konkrete Ziel umzusetzen.', + '3. Wiederhole bis zu 5 Planungsrunden: nach jedem Tool-Aufruf bekommst du das Ergebnis zurück und kannst daraus den nächsten Schritt ableiten.', + '4. Stoppe, wenn das Ziel erreicht ist oder kein sinnvoller nächster Schritt bleibt.', + '5. Berücksichtige Feedback aus früheren Iterationen — wiederhole keinen Schritt, der zuvor fehlgeschlagen ist, ohne ihn zu ändern.', + '', + 'Wichtig:', + '- Nutze ausschließlich die Tools, die dir als Function-Calls bereitgestellt werden. Nennungen in Prosa werden ignoriert.', + '- Wenn mehrere unabhängige Aktionen anstehen (z. B. "erstelle 8 Fragen"), gib sie in einem einzigen Turn als parallele Tool-Calls aus — das spart Runden.', + '- Wenn ein Tool einen Fehler zurückgibt, reagiere darauf (anderes Tool probieren oder stoppen) — ignoriere Fehler nicht.', + agentBlock, + ] + .filter(Boolean) + .join('\n'); +} + +function renderAgentContext(input: SystemPromptInput): string { + const parts: string[] = []; + if (input.agentSystemPrompt?.trim()) { + parts.push('\n'); + parts.push(input.agentSystemPrompt.trim()); + parts.push(''); + } + if (input.agentMemory?.trim()) { + parts.push('\n'); + parts.push(input.agentMemory.trim()); + parts.push(''); + } + return parts.join('\n'); +} + +function buildUserFrame(input: SystemPromptInput): string { + const { mission, resolvedInputs } = input; + + const inputsBlock = + resolvedInputs.length === 0 + ? '_(keine verlinkten Inputs)_' + : resolvedInputs + .map((r) => `### ${r.module}/${r.table}: ${r.title ?? r.id}\n${r.content}`) + .join('\n\n'); + + const iterationHistory = + mission.iterations.length === 0 + ? '_(erste Iteration)_' + : mission.iterations + .slice(-3) + .map((it) => { + const steps = it.plan.map((s) => ` - [${s.status}] ${s.summary}`).join('\n'); + const feedback = it.userFeedback ? `\n Nutzer-Feedback: ${it.userFeedback}` : ''; + const summary = it.summary ? `\n Summary: ${it.summary}` : ''; + return `**${it.startedAt}** (${it.overallStatus}):${summary}\n${steps}${feedback}`; + }) + .join('\n\n'); + + return [ + `# Mission: ${mission.title}`, + '', + '## Konzept', + mission.conceptMarkdown || '_(leer)_', + '', + '## Konkretes Ziel', + mission.objective || '_(nicht gesetzt)_', + '', + '## Verlinkte Inputs', + inputsBlock, + '', + '## Letzte Iterationen (max. 3)', + iterationHistory, + '', + '---', + '', + 'Beginne jetzt mit der nächsten Iteration. Rufe die nötigen Tools auf.', + ].join('\n'); +}
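
Usage sketch for reviewers (illustrative only: `plannerLlmClient`, `executeProposedTool`, and the surrounding variables are placeholders for the webapp/mana-ai wiring that lands in the follow-up commits; the import path is schematic, real consumers import from the package index):

    import { buildSystemPrompt, runPlannerLoop } from 'shared-ai'; // schematic path

    // mission, resolvedInputs, tools and model are assumed to be in scope.
    const { systemPrompt, userPrompt } = buildSystemPrompt({ mission, resolvedInputs });

    const result = await runPlannerLoop({
      llm: plannerLlmClient, // any object implementing LlmClient.complete()
      input: { systemPrompt, userPrompt, tools, model, maxRounds: 5 },
      // The callback owns transport/auth/attribution and must resolve even on failure.
      onToolCall: (call) => executeProposedTool(call.name, call.arguments),
    });

    if (result.stopReason === 'max-rounds') {
      // Round budget hit: mark the iteration incomplete and keep
      // result.messages as the debug log.
    }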