feat(ai): reasoning loop — agent chains auto tools before asking for approval

The runner was one-shot: one planner call per iteration, no feedback
from tool outputs. "Lies alle Notizen und tagge sie" needed two manual
runs (list_notes, then tagging) because the planner never saw the
list_notes output.

Now runMission loops up to MAX_REASONING_LOOP_ITERATIONS (5):

  loop: plan → classify steps by policy
        │
        ├─ auto  → execute inline, capture {message, data}, feed back
        │          as a synthetic ResolvedInput for the next planner call
        │
        ├─ propose → stage proposal, mark humanInLoop, EXIT after this round
        │            (human has to approve before we plan further — we don't
        │             know what they'll accept yet)
        │
        └─ none/0-steps → agent considers the task done, EXIT

Tool outputs become a ResolvedInput titled "Zwischenergebnisse (Runde N)"
so the planner sees them structured and labelled. StageOutcome gains
`autoData` + `autoMessage` so the loop can thread the executor's
payload back through without a second call.

AiDebugEntry now holds `plannerCalls[]` and `loopSteps[]` instead of a
single planner snapshot — so Debug-Panel shows every LLM round + every
auto-tool output, each collapsible. Summary chip shows "3× LLM · 4200ms
· 2× Auto-Tool" when a loop ran.

Side-effects for existing use cases:
- One-shot missions (single propose tool) behave identically (loop
  exits after round 1 with humanInLoop=true).
- "Tag all notes" missions now finish in a single run: loop iter 1
  runs list_notes auto, iter 2 stages N add_tag_to_note proposals,
  exits.
- Server-side mana-ai runner NOT touched — this is foreground-only
  for now; the server still runs one plan/tick.

All 8 runner.test.ts tests pass unchanged (the existing test suite
only exercises the single-step path, which is a subset of the loop).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-16 00:43:52 +02:00
parent e440f13867
commit 8299bf004d
3 changed files with 264 additions and 78 deletions

View file

@ -44,7 +44,14 @@
{:else if d.preStep.webResearch && !d.preStep.webResearch.ok}
· Web ❌
{/if}
{#if d.planner}· {Math.round(d.planner.latencyMs)}ms{/if}
{#if d.plannerCalls && d.plannerCalls.length > 0}
· {d.plannerCalls.length}× LLM · {Math.round(
d.plannerCalls.reduce((a, c) => a + c.latencyMs, 0)
)}ms
{/if}
{#if d.loopSteps && d.loopSteps.length > 0}
· {d.loopSteps.length}× Auto-Tool
{/if}
{#if d.plannerError}· Planner ❌{/if}
</span>
<button
@ -86,21 +93,41 @@
{/if}
</section>
{#if d.planner}
{#if d.loopSteps && d.loopSteps.length > 0}
<section>
<h5>System Prompt</h5>
<pre>{d.planner.systemPrompt}</pre>
</section>
<section>
<h5>User Prompt</h5>
<pre>{d.planner.userPrompt}</pre>
</section>
<section>
<h5>Raw LLM Response</h5>
<pre>{d.planner.rawResponse}</pre>
<h5>Auto-Tool-Ausgaben (Reasoning-Loop)</h5>
{#each d.loopSteps as ls, i (i)}
<details class="nested">
<summary>
<code>Runde {ls.loopIndex + 1}</code>
{ls.toolName}({JSON.stringify(ls.params)})
</summary>
<pre>{ls.outputPreview}</pre>
</details>
{/each}
</section>
{/if}
{#if d.plannerCalls && d.plannerCalls.length > 0}
{#each d.plannerCalls as call, i (i)}
<section>
<h5>LLM-Call {i + 1}/{d.plannerCalls.length} · {Math.round(call.latencyMs)}ms</h5>
<details class="nested">
<summary>System Prompt</summary>
<pre>{call.systemPrompt}</pre>
</details>
<details class="nested" open>
<summary>User Prompt</summary>
<pre>{call.userPrompt}</pre>
</details>
<details class="nested" open>
<summary>Raw LLM Response</summary>
<pre>{call.rawResponse}</pre>
</details>
</section>
{/each}
{/if}
{#if d.plannerError}
<section>
<h5>Planner Error</h5>

View file

@ -45,7 +45,22 @@ export interface AiDebugEntry {
webResearch?: { ok: true; sourceCount: number; summary: string } | { ok: false; error: string };
kontextInjected: boolean;
};
planner?: PlannerCallDebug;
/**
 * Array because the reasoning loop can call the planner multiple
 * times per run (one call per loop iteration, until a proposal is
 * staged or no more work is returned). Older single-call entries
 * written before the loop shipped still parse — readers that
 * haven't updated simply take `plannerCalls[0]`.
 */
plannerCalls?: PlannerCallDebug[];
/** Auto-executed tool outputs captured across loop steps — surfaces
 * what the agent "saw" when reasoning across multiple calls. */
loopSteps?: Array<{
loopIndex: number;
toolName: string;
params: Record<string, unknown>;
outputPreview: string;
}>;
plannerError?: string;
}

View file

@ -32,7 +32,7 @@ import { executeTool } from '../../tools/executor';
import { db } from '../../database';
import { decryptRecords } from '../../crypto';
import { discoverByQuery, searchFeeds } from '$lib/modules/news-research/api';
import { isAiDebugEnabled, recordAiDebug, type AiDebugEntry } from './debug';
import { isAiDebugEnabled, recordAiDebug, type AiDebugEntry, type PlannerCallDebug } from './debug';
import { makeAgentActor, LEGACY_AI_PRINCIPAL, type Actor } from '../../events/actor';
import { getAgent } from '../agents/store';
import { DEFAULT_AGENT_NAME } from '../agents/types';
@ -43,6 +43,14 @@ import type { AiPlanInput, AiPlanOutput, PlannedStep, ResolvedInput } from './pl
* web-research call. Keeps the trigger explicit so unrelated missions
* don't burn credits accidentally. */
const RESEARCH_TRIGGER = /\b(recherchier|research|news|finde|suche|aktuelle|neueste)/i;
/** Reasoning-loop budget. Each LOOP iteration = one planner call + its
* auto-tool executions. The loop exits early when a propose-policy
* step is staged (human must approve before progressing) or the
* planner returns zero steps (it considers this subtask done).
* 5 is generous for read-act-refine patterns ("list_notes → tag them")
* without running the LLM bill dry on stuck missions. */
const MAX_REASONING_LOOP_ITERATIONS = 5;
/** Singleton row id of the kontext doc kept in sync with
* `modules/kontext/types.ts` (KONTEXT_SINGLETON_ID). */
const KONTEXT_SINGLETON_ID = 'singleton';
@ -68,7 +76,16 @@ export interface MissionRunnerDeps {
}
export type StageOutcome =
| { readonly ok: true; readonly proposalId: string }
| {
readonly ok: true;
readonly proposalId: string;
/** Full tool-result payload when the step auto-executed (proposalId
* is empty). The reasoning loop reads this and feeds it back as
* context for the next planner call so the agent can reason over
* list/read outputs across steps. */
readonly autoData?: unknown;
readonly autoMessage?: string;
}
| { readonly ok: false; readonly error: string };
/** Default step-staging implementation: policy-gated executor under AI actor. */
@ -86,8 +103,9 @@ export const defaultStageStep: Required<MissionRunnerDeps>['stageStep'] = async
const data = result.data as { proposalId?: string } | undefined;
if (data?.proposalId) return { ok: true, proposalId: data.proposalId };
// Policy resolved to 'auto' — no proposal row was created, the tool
// ran directly. Treat as ok but without a proposal id to thread back.
return { ok: true, proposalId: '' };
// ran directly. Return the payload so the reasoning loop can feed it
// back into the next planner call.
return { ok: true, proposalId: '', autoData: result.data, autoMessage: result.message };
};
export interface RunMissionResult {
@ -225,31 +243,153 @@ export async function runMission(
const availableTools = getAvailableToolsForAi(aiActor);
await checkCancel();
// ── Phase: calling-llm ─────────────────────────────────
await enterPhase('calling-llm', 'frage Planner an');
let plan: AiPlanOutput;
try {
plan = await deps.plan({ mission: mission!, resolvedInputs, availableTools });
} catch (err) {
// Capture even the failure for debug visibility before re-throwing.
if (isAiDebugEnabled()) {
void recordAiDebug({
iterationId,
missionId: mission!.id,
missionTitle: mission!.title,
missionObjective: mission!.objective,
capturedAt: new Date().toISOString(),
resolvedInputs,
preStep,
plannerError: err instanceof Error ? err.message : String(err),
// ── Reasoning loop ─────────────────────────────────────
// Each pass: call planner → stage steps. Auto-tools run inline
// and their outputs become new ResolvedInputs so the NEXT planner
// call can reason over them (e.g. list_notes → see titles →
// stage add_tag_to_note per note). Loop exits when:
// • planner returns 0 steps → agent is done
// • any step requires user approval (propose) → user in the loop
// • budget exhausted (MAX_REASONING_LOOP_ITERATIONS)
// • a step fails hard (not tool-error; executor error)
const stage = deps.stageStep ?? defaultStageStep;
const loopInputs: ResolvedInput[] = [...resolvedInputs];
const recordedSteps: PlanStep[] = [];
const plannerCalls: PlannerCallDebug[] = [];
const loopStepLog: NonNullable<AiDebugEntry['loopSteps']> = [];
let stagedCount = 0;
let failedCount = 0;
let lastPlanSummary = '';
let totalStepCount = 0;
let loopIndex = 0;
let stepCounter = 0;
let humanInLoop = false;
while (loopIndex < MAX_REASONING_LOOP_ITERATIONS) {
// ── Phase: calling-llm ─────────────────────────────
await enterPhase(
'calling-llm',
loopIndex === 0
? 'frage Planner an'
: `Planner Runde ${loopIndex + 1}/${MAX_REASONING_LOOP_ITERATIONS}`
);
let plan: AiPlanOutput;
try {
plan = await deps.plan({ mission: mission!, resolvedInputs: loopInputs, availableTools });
} catch (err) {
if (isAiDebugEnabled()) {
void recordAiDebug({
iterationId,
missionId: mission!.id,
missionTitle: mission!.title,
missionObjective: mission!.objective,
capturedAt: new Date().toISOString(),
resolvedInputs: loopInputs,
preStep,
plannerCalls,
loopSteps: loopStepLog,
plannerError: err instanceof Error ? err.message : String(err),
});
}
throw err;
}
await checkCancel();
if (plan.debug) plannerCalls.push(plan.debug);
lastPlanSummary = plan.summary;
totalStepCount += plan.steps.length;
if (plan.steps.length === 0) {
// Planner has nothing more to do — agent considers this done.
break;
}
// ── Phase: parsing-response ────────────────────────
await enterPhase('parsing-response', `${plan.steps.length} Step(s) erhalten`);
await checkCancel();
// ── Phase: staging-proposals ───────────────────────
const roundOutputs: Array<{ step: PlannedStep; message: string; data: unknown }> = [];
for (const [i, ps] of plan.steps.entries()) {
await enterPhase(
'staging-proposals',
`Runde ${loopIndex + 1} · Step ${i + 1}/${plan.steps.length}`
);
await checkCancel();
const outcome = await stage(ps, aiActor);
const stepId = `${iterationId}-${stepCounter++}`;
if (!outcome.ok) {
failedCount++;
recordedSteps.push({
id: stepId,
summary: ps.summary,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
status: 'failed',
});
continue;
}
stagedCount++;
if (outcome.proposalId) {
// Propose-policy: human must approve. Exit the loop after
// this round so we don't stage proposals for hypothetical
// follow-up steps that depend on the approval outcome.
humanInLoop = true;
recordedSteps.push({
id: stepId,
summary: ps.summary,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
proposalId: outcome.proposalId,
status: 'staged',
});
} else {
// Auto-policy: ran inline. Collect output for the next
// planner call.
recordedSteps.push({
id: stepId,
summary: ps.summary,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
status: 'approved',
});
roundOutputs.push({
step: ps,
message: outcome.autoMessage ?? '(ohne message)',
data: outcome.autoData,
});
}
}
// Log loop outputs for debug-panel visibility.
for (const o of roundOutputs) {
loopStepLog.push({
loopIndex,
toolName: o.step.toolName,
params: o.step.params,
outputPreview: formatToolOutputPreview(o.message, o.data),
});
}
throw err;
}
await checkCancel();
// Persist debug capture if enabled. Off by default in production
// (toggle via Settings or `localStorage.setItem('mana.ai.debug','1')`).
if (humanInLoop) break;
if (roundOutputs.length === 0) {
// Every step either failed or was proposed — nothing new to
// reason over. Prevents an infinite loop when the planner
// only suggests proposable tools that keep failing.
break;
}
// Feed tool outputs into the next planner call as a synthetic
// ResolvedInput so the agent can chain its reasoning.
loopInputs.push({
id: `loop-outputs-${loopIndex}`,
module: 'reasoning-loop',
table: 'tool-outputs',
title: `Zwischenergebnisse (Runde ${loopIndex + 1})`,
content: formatToolOutputsForPrompt(roundOutputs),
});
loopIndex++;
}
if (isAiDebugEnabled()) {
void recordAiDebug({
iterationId,
@ -257,54 +397,20 @@ export async function runMission(
missionTitle: mission!.title,
missionObjective: mission!.objective,
capturedAt: new Date().toISOString(),
resolvedInputs,
resolvedInputs: loopInputs,
preStep,
planner: plan.debug,
plannerCalls,
loopSteps: loopStepLog,
});
}
// ── Phase: parsing-response ────────────────────────────
await enterPhase('parsing-response', `${plan.steps.length} Step(s) erhalten`);
await checkCancel();
// ── Phase: staging-proposals ───────────────────────────
const stage = deps.stageStep ?? defaultStageStep;
const recordedSteps: PlanStep[] = [];
let stagedCount = 0;
let failedCount = 0;
for (const [i, ps] of plan.steps.entries()) {
await enterPhase('staging-proposals', `Step ${i + 1} von ${plan.steps.length}`);
await checkCancel();
const outcome = await stage(ps, aiActor);
if (outcome.ok) {
stagedCount++;
recordedSteps.push({
id: `${iterationId}-${i}`,
summary: ps.summary,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
proposalId: outcome.proposalId || undefined,
status: outcome.proposalId ? 'staged' : 'approved',
});
} else {
failedCount++;
recordedSteps.push({
id: `${iterationId}-${i}`,
summary: ps.summary,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
status: 'failed',
});
}
}
await enterPhase('finalizing');
return {
recordedSteps,
stagedCount,
failedCount,
planSummary: plan.summary,
planStepCount: plan.steps.length,
planSummary: lastPlanSummary,
planStepCount: totalStepCount,
};
}
@ -411,6 +517,44 @@ async function loadKontextAsResolvedInput(): Promise<ResolvedInput | null> {
/** Run the deep-research pipeline against the mission objective and
* collapse its summary + sources into one ResolvedInput formatted so
* the planner can copy URLs into save_news_article calls. */
/** Stringify tool-output payloads for the reasoning loop's next
 * prompt. Kept deliberately compact — LLM context windows are finite
 * and a raw JSON.stringify of a 200-row Dexie dump wastes tokens. */
function formatToolOutputsForPrompt(
  outputs: Array<{ step: PlannedStep; message: string; data: unknown }>
): string {
  // Fixed German preamble instructing the planner how to use the data.
  const header = [
    'Ausgaben der zuletzt ausgeführten Auto-Tools. Nutze diese Daten um die Mission weiterzuführen — z.B. für jede gelistete Notiz einen add_tag_to_note Aufruf pro Notiz.',
    '',
  ];
  // One markdown section per tool call: heading, human message, optional JSON payload.
  const sections = outputs.flatMap((entry) => {
    const block = [
      `### ${entry.step.toolName}(${JSON.stringify(entry.step.params)})`,
      entry.message,
    ];
    if (entry.data !== undefined && entry.data !== null) {
      // safeStringify caps the payload at 4000 chars to protect the context window.
      block.push('```json', safeStringify(entry.data, 4000), '```');
    }
    block.push('');
    return block;
  });
  return [...header, ...sections].join('\n');
}
/** Short form of one tool output for the debug-panel loopSteps log. */
function formatToolOutputPreview(message: string, data: unknown): string {
  // Without a payload the message alone is the preview.
  if (data == null) return message;
  // Cap the payload at 400 chars — the panel only needs a glimpse.
  return `${message}\n${safeStringify(data, 400)}`;
}
/** JSON-stringify `value` (2-space pretty), truncating the result to
 * `limit` characters with a "… (truncated)" marker. Falls back to
 * `String(value)` when the value is not JSON-serializable.
 *
 * Note: JSON.stringify returns `undefined` (not a string) for
 * `undefined`, functions and symbols — the original code relied on the
 * resulting TypeError being caught; handle that case explicitly. */
function safeStringify(value: unknown, limit: number): string {
  try {
    const s = JSON.stringify(value, null, 2);
    // Unserializable top-level values (undefined, function, symbol).
    if (s === undefined) return String(value);
    return s.length > limit ? s.slice(0, limit) + '\n… (truncated)' : s;
  } catch {
    // Circular references, BigInt, throwing toJSON, etc.
    return String(value);
  }
}
interface WebResearchOutcome {
input: ResolvedInput;
sourceCount: number;