feat(shared-ai): runPlannerLoop + compact system prompt for function calling

Introduces the new planner pipeline that both the webapp runner and the
mana-ai tick will swap onto in the next commits. Additive for now —
the legacy buildPlannerPrompt + parsePlannerResponse stay exported so
callers can migrate one at a time; they get removed once the last
consumer is gone.

- planner/loop.ts — runPlannerLoop orchestrates a multi-turn chat
  against a caller-supplied LlmClient. Tool-calls from the LLM are
  handed to an onToolCall callback and their results are fed back as
  tool-messages. Parallel tool-calls in one turn execute sequentially
  to keep the message log linear for debugging. Stops on assistant
  stop, empty tool_calls, or a hard max-rounds ceiling (default 5).
  See the usage sketch after this list.
- planner/system-prompt.ts — new buildSystemPrompt. ~40-line German
  system frame, no tool listing (the SDK-level tools field carries
  the schemas now), no JSON format example, no "please return JSON"
  plea. User frame renders mission + linked inputs + last 3
  iteration summaries, same as before.
- Five test cases covering the loop: immediate stop, single tool
  call with result feedback, parallel calls execute in order, tool
  failures propagate as tool-messages the LLM can react to, and
  maxRounds ceiling fires with the right stopReason.
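
A rough usage sketch of the new pipeline (hedged: llmClient,
toolSchemas, executeTool, mission, resolvedInputs and agent are
caller-side placeholders, and the import specifier is illustrative,
not part of this commit):

    import { buildSystemPrompt, runPlannerLoop } from 'shared-ai';

    // 1. Render the compact system/user frames from the mission state.
    const { systemPrompt, userPrompt } = buildSystemPrompt({
      mission,
      resolvedInputs,
      agentSystemPrompt: agent?.systemPrompt ?? null,
      agentMemory: agent?.memory ?? null,
    });

    // 2. Drive the tool-calling loop against any LlmClient implementation.
    const result = await runPlannerLoop({
      llm: llmClient, // e.g. an OpenAI-compatible chat-completions adapter
      input: {
        systemPrompt,
        userPrompt,
        tools: toolSchemas,           // ToolSchema[] from the tool registry
        model: 'provider/model-name', // placeholder model id
        maxRounds: 5,
      },
      // Must catch errors and return { success: false, message }
      // instead of throwing (see the doc comment in loop.ts).
      onToolCall: executeTool,
    });

    // result.stopReason: 'assistant-stop' | 'no-tool-calls' | 'max-rounds' | 'llm-error'
    // result.messages: full chat history for debug-log capture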

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Till JS 2026-04-20 15:31:01 +02:00
parent 2cf89ce26a
commit 4daca8970b
5 changed files with 537 additions and 1 deletion


@@ -60,12 +60,31 @@ export type {
AiPlanInput,
AiPlanOutput,
AvailableTool,
ChatMessage,
ChatRole,
ExecutedCall,
LlmClient,
LlmCompletionRequest,
LlmCompletionResponse,
LlmFinishReason,
LoopStopReason,
ParseResult,
PlannedStep,
PlannerLoopInput,
PlannerLoopResult,
PlannerMessages,
ResolvedInput,
SystemPromptInput,
SystemPromptOutput,
ToolCallRequest,
ToolResult,
} from './planner';
export {
buildPlannerPrompt,
buildSystemPrompt,
parsePlannerResponse,
runPlannerLoop,
} from './planner';
export { buildPlannerPrompt, parsePlannerResponse } from './planner';
export {
AI_PROPOSABLE_TOOL_NAMES,


@@ -3,3 +3,24 @@ export type { PlannerMessages } from './prompt';
export { parsePlannerResponse } from './parser';
export type { ParseResult } from './parser';
export type { AiPlanInput, AiPlanOutput, AvailableTool, PlannedStep, ResolvedInput } from './types';
// New function-calling pipeline (replaces the text-JSON planner above
// in Commits 5/6). Additive for now so the old and new callers can
// coexist within the atomic PR.
export { buildSystemPrompt } from './system-prompt';
export type { SystemPromptInput, SystemPromptOutput } from './system-prompt';
export { runPlannerLoop } from './loop';
export type {
ChatMessage,
ChatRole,
ExecutedCall,
LlmClient,
LlmCompletionRequest,
LlmCompletionResponse,
LlmFinishReason,
LoopStopReason,
PlannerLoopInput,
PlannerLoopResult,
ToolCallRequest,
ToolResult,
} from './loop';


@@ -0,0 +1,200 @@
import { describe, expect, it, vi } from 'vitest';
import {
runPlannerLoop,
type ChatMessage,
type LlmClient,
type LlmCompletionResponse,
type ToolCallRequest,
type ToolResult,
} from './loop';
import type { ToolSchema } from '../tools/schemas';
/**
* Scriptable mock LLM: each ``enqueue*`` call pushes one planned
* response onto a FIFO. The loop pulls responses in order. If the loop
* asks for more turns than we enqueued, the test fails loudly rather
* than hanging.
*/
class MockLlm implements LlmClient {
private queue: LlmCompletionResponse[] = [];
public calls: Array<{ messages: readonly ChatMessage[]; toolNames: string[] }> = [];
enqueueToolCalls(calls: Array<{ name: string; args: Record<string, unknown> }>): this {
this.queue.push({
content: null,
toolCalls: calls.map((c, i) => ({
id: `call_${this.queue.length}_${i}`,
name: c.name,
arguments: c.args,
})),
finishReason: 'tool_calls',
});
return this;
}
enqueueStop(content: string | null = null): this {
this.queue.push({ content, toolCalls: [], finishReason: 'stop' });
return this;
}
async complete(req: {
messages: readonly ChatMessage[];
tools: readonly unknown[];
}): Promise<LlmCompletionResponse> {
// Snapshot at call time — the loop mutates the same array afterwards,
// and we want to assert the state the LLM actually saw.
this.calls.push({
messages: [...req.messages],
toolNames: (req.tools as Array<{ function: { name: string } }>).map((t) => t.function.name),
});
const next = this.queue.shift();
if (!next) throw new Error('MockLlm: no more responses enqueued');
return next;
}
}
const tools: ToolSchema[] = [
{
name: 'list_things',
module: 'test',
description: 'list things',
defaultPolicy: 'auto',
parameters: [],
},
{
name: 'create_thing',
module: 'test',
description: 'create a thing',
defaultPolicy: 'propose',
parameters: [{ name: 'title', type: 'string', description: 'title', required: true }],
},
];
describe('runPlannerLoop', () => {
it('stops immediately when the LLM emits no tool_calls', async () => {
const llm = new MockLlm().enqueueStop('done');
const onToolCall = vi.fn();
const result = await runPlannerLoop({
llm,
input: {
systemPrompt: 's',
userPrompt: 'u',
tools,
model: 'test/model',
},
onToolCall,
});
expect(result.rounds).toBe(1);
expect(result.executedCalls).toHaveLength(0);
expect(result.summary).toBe('done');
expect(result.stopReason).toBe('assistant-stop');
expect(onToolCall).not.toHaveBeenCalled();
});
it('executes a single tool call and feeds the result back', async () => {
const llm = new MockLlm()
.enqueueToolCalls([{ name: 'list_things', args: {} }])
.enqueueStop('all done');
const onToolCall = vi.fn(
async (_call: ToolCallRequest): Promise<ToolResult> => ({
success: true,
data: ['a', 'b'],
message: '2 things',
})
);
const result = await runPlannerLoop({
llm,
input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
onToolCall,
});
expect(result.rounds).toBe(2);
expect(result.executedCalls).toHaveLength(1);
expect(result.executedCalls[0].call.name).toBe('list_things');
expect(result.summary).toBe('all done');
expect(result.stopReason).toBe('assistant-stop');
// Second LLM call must have seen the tool result in its messages.
expect(llm.calls[1].messages).toHaveLength(4); // system + user + assistant + tool
const toolMsg = llm.calls[1].messages[3];
expect(toolMsg.role).toBe('tool');
expect(toolMsg.content).toContain('2 things');
});
it('executes parallel tool calls sequentially', async () => {
const llm = new MockLlm()
.enqueueToolCalls([
{ name: 'create_thing', args: { title: 'a' } },
{ name: 'create_thing', args: { title: 'b' } },
{ name: 'create_thing', args: { title: 'c' } },
])
.enqueueStop();
const executedInOrder: string[] = [];
const onToolCall = async (call: ToolCallRequest): Promise<ToolResult> => {
executedInOrder.push(call.arguments.title as string);
return { success: true, message: 'ok' };
};
const result = await runPlannerLoop({
llm,
input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
onToolCall,
});
expect(executedInOrder).toEqual(['a', 'b', 'c']);
expect(result.executedCalls).toHaveLength(3);
});
it('propagates tool failures as tool-messages (LLM can react)', async () => {
const llm = new MockLlm()
.enqueueToolCalls([{ name: 'list_things', args: {} }])
.enqueueStop('ack');
const onToolCall = async (): Promise<ToolResult> => ({
success: false,
message: 'db locked',
});
const result = await runPlannerLoop({
llm,
input: { systemPrompt: 's', userPrompt: 'u', tools, model: 'm' },
onToolCall,
});
const toolMsg = llm.calls[1].messages[3];
expect(toolMsg.content).toContain('db locked');
expect(toolMsg.content).toContain('"success":false');
expect(result.executedCalls[0].result.success).toBe(false);
});
it('honours the maxRounds ceiling', async () => {
const llm = new MockLlm();
// Seed enough tool-call turns to exceed the cap
for (let i = 0; i < 10; i++) {
llm.enqueueToolCalls([{ name: 'list_things', args: {} }]);
}
const onToolCall = async (): Promise<ToolResult> => ({
success: true,
message: 'ok',
});
const result = await runPlannerLoop({
llm,
input: {
systemPrompt: 's',
userPrompt: 'u',
tools,
model: 'm',
maxRounds: 3,
},
onToolCall,
});
expect(result.rounds).toBe(3);
expect(result.stopReason).toBe('max-rounds');
expect(result.executedCalls).toHaveLength(3);
});
});


@@ -0,0 +1,179 @@
/**
* Multi-turn tool-calling loop shared between the webapp runner and the
* server-side mana-ai tick. Replaces the text-JSON planner pipeline:
* we hand the LLM a tool catalog, it emits native tool_calls, we
* execute them and feed the results back as tool-messages until the
* LLM has nothing more to call (or we hit the round budget).
*
* Environment-specific concerns (HTTP transport, auth, actor
* attribution) live in the caller-provided ``LlmClient`` and
* ``onToolCall`` callback. The loop itself stays pure.
*/
import type { ToolSchema, ToolSpec } from '../tools/function-schema';
import { toolsToFunctionSchemas } from '../tools/function-schema';
// ─── Chat-message contract ──────────────────────────────────────────
export interface ToolCallRequest {
readonly id: string;
readonly name: string;
readonly arguments: Record<string, unknown>;
}
export interface ToolResult {
readonly success: boolean;
readonly data?: unknown;
readonly message: string;
}
export type ChatRole = 'system' | 'user' | 'assistant' | 'tool';
export interface ChatMessage {
readonly role: ChatRole;
readonly content?: string | null;
readonly toolCalls?: readonly ToolCallRequest[];
readonly toolCallId?: string;
}
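// A sketch of what the history typically looks like after one tool
// round, as the loop assembles it (contents abbreviated):
//   { role: 'system', content: systemPrompt }
//   { role: 'user', content: userPrompt }
//   { role: 'assistant', content: null, toolCalls: [{ id: 'call_0', name: 'list_things', arguments: {} }] }
//   { role: 'tool', toolCallId: 'call_0', content: '{"success":true,"message":"2 things","data":["a","b"]}' }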
// ─── LLM client contract ────────────────────────────────────────────
export interface LlmCompletionRequest {
readonly messages: readonly ChatMessage[];
readonly tools: readonly ToolSpec[];
readonly model: string;
readonly temperature?: number;
}
export type LlmFinishReason = 'stop' | 'tool_calls' | 'length' | 'content_filter';
export interface LlmCompletionResponse {
readonly content: string | null;
readonly toolCalls: readonly ToolCallRequest[];
readonly finishReason: LlmFinishReason;
}
export interface LlmClient {
complete(req: LlmCompletionRequest): Promise<LlmCompletionResponse>;
}
// ─── Loop input / result ────────────────────────────────────────────
export interface PlannerLoopInput {
readonly systemPrompt: string;
readonly userPrompt: string;
readonly tools: readonly ToolSchema[];
readonly model: string;
readonly temperature?: number;
/** Hard ceiling on planner rounds. Each round = one LLM call plus
* whatever tool executions its output triggered. Defaults to 5. */
readonly maxRounds?: number;
}
export interface ExecutedCall {
readonly round: number;
readonly call: ToolCallRequest;
readonly result: ToolResult;
}
export type LoopStopReason = 'assistant-stop' | 'max-rounds' | 'no-tool-calls' | 'llm-error';
export interface PlannerLoopResult {
readonly rounds: number;
readonly executedCalls: readonly ExecutedCall[];
/** Final assistant text when the LLM stopped instead of calling a
* tool. ``null`` when the last turn was a tool-call burst that we
* cut off via round budget. */
readonly summary: string | null;
readonly stopReason: LoopStopReason;
/** Complete chat history for debug-log capture (system + user +
* every assistant/tool turn). Never synced; it contains decrypted
* user content. */
readonly messages: readonly ChatMessage[];
}
// ─── The loop ───────────────────────────────────────────────────────
const DEFAULT_MAX_ROUNDS = 5;
export async function runPlannerLoop(opts: {
readonly llm: LlmClient;
readonly input: PlannerLoopInput;
/** Execute a tool call and return the result that should be fed back
* to the LLM as a tool-message. Must not throw; convert errors to
* ``{ success: false, message }``. The loop injects the result
* verbatim so the LLM can reason over failures (e.g. "vault locked"
* → ask the user to unlock). */
readonly onToolCall: (call: ToolCallRequest) => Promise<ToolResult>;
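// A conforming callback might look like this (sketch; dispatchTool is
// a hypothetical caller-owned dispatcher, not part of this module):
//
//   const onToolCall = async (call: ToolCallRequest): Promise<ToolResult> => {
//     try {
//       const data = await dispatchTool(call.name, call.arguments);
//       return { success: true, data, message: 'ok' };
//     } catch (err) {
//       return { success: false, message: err instanceof Error ? err.message : String(err) };
//     }
//   };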
}): Promise<PlannerLoopResult> {
const { llm, input, onToolCall } = opts;
const maxRounds = input.maxRounds ?? DEFAULT_MAX_ROUNDS;
const toolSpecs = toolsToFunctionSchemas(input.tools);
const messages: ChatMessage[] = [
{ role: 'system', content: input.systemPrompt },
{ role: 'user', content: input.userPrompt },
];
const executedCalls: ExecutedCall[] = [];
let summary: string | null = null;
let stopReason: LoopStopReason = 'max-rounds';
let rounds = 0;
while (rounds < maxRounds) {
rounds++;
const response = await llm.complete({
messages,
tools: toolSpecs,
model: input.model,
temperature: input.temperature,
});
// Append the assistant turn to history before we execute any
// tools — the LLM needs to see its own prior tool_calls alongside
// the tool-message results in the next turn.
messages.push({
role: 'assistant',
content: response.content,
toolCalls: response.toolCalls.length > 0 ? response.toolCalls : undefined,
});
if (response.toolCalls.length === 0) {
summary = response.content;
stopReason = response.finishReason === 'stop' ? 'assistant-stop' : 'no-tool-calls';
break;
}
// Execute each tool_call sequentially. Parallel execution is a
// perfectly valid optimisation for pure-read tools but we keep
// order here so the message log tells a linear story when the
// user debugs a failure.
for (const call of response.toolCalls) {
const result = await onToolCall(call);
executedCalls.push({ round: rounds, call, result });
messages.push({
role: 'tool',
toolCallId: call.id,
content: JSON.stringify({
success: result.success,
message: result.message,
...(result.data !== undefined ? { data: result.data } : {}),
}),
});
}
// If the round budget is now exhausted, surface it as the stop
// reason — the outer consumer can mark the iteration as incomplete.
if (rounds >= maxRounds) {
stopReason = 'max-rounds';
break;
}
}
return {
rounds,
executedCalls,
summary,
stopReason,
messages,
};
}


@@ -0,0 +1,117 @@
/**
* System-prompt builder for the function-calling planner.
*
* Radically smaller than the pre-migration text-JSON prompt: no tool
* listing (the LLM gets schemas via the native ``tools`` request
* field), no format example (the SDK enforces structured tool_calls),
* no "please return JSON" plea. We just tell the LLM what its job is,
* how to behave in a reasoning loop, and hand over control.
*
* The rendered prompt is ~400 tokens compared to the previous
* ~6000–8000: big savings on cost and, more importantly, on the
* signal-to-noise ratio the model has to filter.
*/
import type { Mission } from '../missions/types';
import type { ResolvedInput } from './types';
export interface SystemPromptInput {
readonly mission: Mission;
readonly resolvedInputs: readonly ResolvedInput[];
/** When set, included verbatim as the agent's persona frame. */
readonly agentSystemPrompt?: string | null;
/** When set, appended as the agent's persistent memory. */
readonly agentMemory?: string | null;
}
export interface SystemPromptOutput {
readonly systemPrompt: string;
readonly userPrompt: string;
}
export function buildSystemPrompt(input: SystemPromptInput): SystemPromptOutput {
const systemPrompt = buildSystemFrame(input);
const userPrompt = buildUserFrame(input);
return { systemPrompt, userPrompt };
}
function buildSystemFrame(input: SystemPromptInput): string {
const agentBlock = renderAgentContext(input);
return [
'Du arbeitest im Auftrag des Nutzers an einer langlebigen Mission.',
'',
'Dein Vorgehen:',
'1. Lies zuerst (Read-Tools liefern dir sofort Ergebnisse) — verstehe den Zustand, bevor du schreibst.',
'2. Führe anschließend die notwendigen Schreib-Tools aus, um das konkrete Ziel umzusetzen.',
'3. Wiederhole bis zu 5 Planungsrunden: nach jedem Tool-Aufruf bekommst du das Ergebnis zurück und kannst daraus den nächsten Schritt ableiten.',
'4. Stoppe, wenn das Ziel erreicht ist oder kein sinnvoller nächster Schritt bleibt.',
'5. Berücksichtige Feedback aus früheren Iterationen — wiederhole keinen Schritt, der zuvor fehlgeschlagen ist, ohne ihn zu ändern.',
'',
'Wichtig:',
'- Nutze ausschließlich die Tools, die dir als Function-Calls bereitgestellt werden. Nennungen in Prosa werden ignoriert.',
'- Wenn mehrere unabhängige Aktionen anstehen (z. B. "erstelle 8 Fragen"), gib sie in einem einzigen Turn als parallele Tool-Calls aus — das spart Runden.',
'- Wenn ein Tool einen Fehler zurückgibt, reagiere darauf (anderes Tool probieren oder stoppen) — ignoriere Fehler nicht.',
agentBlock,
]
.filter(Boolean)
.join('\n');
}
function renderAgentContext(input: SystemPromptInput): string {
const parts: string[] = [];
if (input.agentSystemPrompt?.trim()) {
parts.push('\n<agent_persona>');
parts.push(input.agentSystemPrompt.trim());
parts.push('</agent_persona>');
}
if (input.agentMemory?.trim()) {
parts.push('\n<agent_memory>');
parts.push(input.agentMemory.trim());
parts.push('</agent_memory>');
}
return parts.join('\n');
}
function buildUserFrame(input: SystemPromptInput): string {
const { mission, resolvedInputs } = input;
const inputsBlock =
resolvedInputs.length === 0
? '_(keine verlinkten Inputs)_'
: resolvedInputs
.map((r) => `### ${r.module}/${r.table}: ${r.title ?? r.id}\n${r.content}`)
.join('\n\n');
const iterationHistory =
mission.iterations.length === 0
? '_(erste Iteration)_'
: mission.iterations
.slice(-3)
.map((it) => {
const steps = it.plan.map((s) => ` - [${s.status}] ${s.summary}`).join('\n');
const feedback = it.userFeedback ? `\n Nutzer-Feedback: ${it.userFeedback}` : '';
const summary = it.summary ? `\n Summary: ${it.summary}` : '';
return `**${it.startedAt}** (${it.overallStatus}):${summary}\n${steps}${feedback}`;
})
.join('\n\n');
return [
`# Mission: ${mission.title}`,
'',
'## Konzept',
mission.conceptMarkdown || '_(leer)_',
'',
'## Konkretes Ziel',
mission.objective || '_(nicht gesetzt)_',
'',
'## Verlinkte Inputs',
inputsBlock,
'',
'## Letzte Iterationen (max. 3)',
iterationHistory,
'',
'---',
'',
'Beginne jetzt mit der nächsten Iteration. Rufe die nötigen Tools auf.',
].join('\n');
}