feat(ai): reasoning loop — agent chains auto tools before asking for approval

The runner was one-shot: one planner call per iteration, no feedback
from tool outputs. "Lies alle Notizen und tagge sie" needed two manual
runs (list_notes, then tagging) because the planner never saw the
list_notes output.

Now runMission loops up to MAX_REASONING_LOOP_ITERATIONS (5):

  loop: plan → classify steps by policy
        │
        ├─ auto  → execute inline, capture {message, data}, feed back
        │          as a synthetic ResolvedInput for the next planner call
        │
        ├─ propose → stage proposal, mark humanInLoop, EXIT after this round
        │            (human has to approve before we plan further — we don't
        │             know what they'll accept yet)
        │
        └─ none/0-steps → agent considers the task done, EXIT

Tool outputs become a ResolvedInput titled "Zwischenergebnisse (Runde N)"
so the planner sees them structured and labelled. StageOutcome gains
`autoData` + `autoMessage` so the loop can thread the executor's
payload back through without a second call.

AiDebugEntry now holds `plannerCalls[]` and `loopSteps[]` instead of a
single planner snapshot — so Debug-Panel shows every LLM round + every
auto-tool output, each collapsible. Summary chip shows "3× LLM · 4200ms
· 2× Auto-Tool" when a loop ran.

Side-effects for existing use cases:
- One-shot missions (single propose tool) behave identically (loop
  exits after round 1 with humanInLoop=true).
- "Tag all notes" missions now finish in a single run: loop iter 1
  runs list_notes auto, iter 2 stages N add_tag_to_note proposals,
  exits.
- Server-side mana-ai runner NOT touched — this is foreground-only
  for now; the server still runs one plan/tick.

All 8 runner.test.ts tests pass unchanged (the existing test suite
only exercises the single-step path, which is a subset of the loop).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-16 00:43:52 +02:00
parent e440f13867
commit 8299bf004d
3 changed files with 264 additions and 78 deletions

View file

@ -44,7 +44,14 @@
{:else if d.preStep.webResearch && !d.preStep.webResearch.ok}
· Web ❌
{/if}
{#if d.planner}· {Math.round(d.planner.latencyMs)}ms{/if}
{#if d.plannerCalls && d.plannerCalls.length > 0}
· {d.plannerCalls.length}× LLM · {Math.round(
d.plannerCalls.reduce((a, c) => a + c.latencyMs, 0)
)}ms
{/if}
{#if d.loopSteps && d.loopSteps.length > 0}
· {d.loopSteps.length}× Auto-Tool
{/if}
{#if d.plannerError}· Planner ❌{/if}
</span>
<button
@ -86,21 +93,41 @@
{/if}
</section>
{#if d.planner}
{#if d.loopSteps && d.loopSteps.length > 0}
<section>
<h5>System Prompt</h5>
<pre>{d.planner.systemPrompt}</pre>
</section>
<section>
<h5>User Prompt</h5>
<pre>{d.planner.userPrompt}</pre>
</section>
<section>
<h5>Raw LLM Response</h5>
<pre>{d.planner.rawResponse}</pre>
<h5>Auto-Tool-Ausgaben (Reasoning-Loop)</h5>
{#each d.loopSteps as ls, i (i)}
<details class="nested">
<summary>
<code>Runde {ls.loopIndex + 1}</code>
{ls.toolName}({JSON.stringify(ls.params)})
</summary>
<pre>{ls.outputPreview}</pre>
</details>
{/each}
</section>
{/if}
{#if d.plannerCalls && d.plannerCalls.length > 0}
{#each d.plannerCalls as call, i (i)}
<section>
<h5>LLM-Call {i + 1}/{d.plannerCalls.length} · {Math.round(call.latencyMs)}ms</h5>
<details class="nested">
<summary>System Prompt</summary>
<pre>{call.systemPrompt}</pre>
</details>
<details class="nested" open>
<summary>User Prompt</summary>
<pre>{call.userPrompt}</pre>
</details>
<details class="nested" open>
<summary>Raw LLM Response</summary>
<pre>{call.rawResponse}</pre>
</details>
</section>
{/each}
{/if}
{#if d.plannerError}
<section>
<h5>Planner Error</h5>

View file

@ -45,7 +45,22 @@ export interface AiDebugEntry {
webResearch?: { ok: true; sourceCount: number; summary: string } | { ok: false; error: string };
kontextInjected: boolean;
};
planner?: PlannerCallDebug;
/**
 * Array because the reasoning loop can call the planner multiple
 * times per run (one call per loop iteration, until a proposal is
 * staged or no more work is returned). Older single-call entries
 * written before the loop shipped still parse — readers that
 * haven't updated simply take `plannerCalls[0]`.
 */
plannerCalls?: PlannerCallDebug[];
/** Auto-executed tool outputs captured across loop steps — surfaces
 * what the agent "saw" when reasoning across multiple calls. */
loopSteps?: Array<{
loopIndex: number;
toolName: string;
params: Record<string, unknown>;
outputPreview: string;
}>;
plannerError?: string;
}

View file

@ -32,7 +32,7 @@ import { executeTool } from '../../tools/executor';
import { db } from '../../database';
import { decryptRecords } from '../../crypto';
import { discoverByQuery, searchFeeds } from '$lib/modules/news-research/api';
import { isAiDebugEnabled, recordAiDebug, type AiDebugEntry } from './debug';
import { isAiDebugEnabled, recordAiDebug, type AiDebugEntry, type PlannerCallDebug } from './debug';
import { makeAgentActor, LEGACY_AI_PRINCIPAL, type Actor } from '../../events/actor';
import { getAgent } from '../agents/store';
import { DEFAULT_AGENT_NAME } from '../agents/types';
@ -43,6 +43,14 @@ import type { AiPlanInput, AiPlanOutput, PlannedStep, ResolvedInput } from './pl
* web-research call. Keeps the trigger explicit so unrelated missions
* don't burn credits accidentally. */
const RESEARCH_TRIGGER = /\b(recherchier|research|news|finde|suche|aktuelle|neueste)/i;
/** Reasoning-loop budget. Each LOOP iteration = one planner call + its
* auto-tool executions. The loop exits early when a propose-policy
* step is staged (human must approve before progressing) or the
* planner returns zero steps (it considers this subtask done).
* 5 is generous for read-act-refine patterns ("list_notes → tag them")
* without running the LLM bill dry on stuck missions. */
const MAX_REASONING_LOOP_ITERATIONS = 5;
/** Singleton row id of the kontext doc kept in sync with
* `modules/kontext/types.ts` (KONTEXT_SINGLETON_ID). */
const KONTEXT_SINGLETON_ID = 'singleton';
@ -68,7 +76,16 @@ export interface MissionRunnerDeps {
}
export type StageOutcome =
| { readonly ok: true; readonly proposalId: string }
| {
readonly ok: true;
readonly proposalId: string;
/** Full tool-result payload when the step auto-executed (proposalId
* is empty). The reasoning loop reads this and feeds it back as
* context for the next planner call so the agent can reason over
* list/read outputs across steps. */
readonly autoData?: unknown;
readonly autoMessage?: string;
}
| { readonly ok: false; readonly error: string };
/** Default step-staging implementation: policy-gated executor under AI actor. */
@ -86,8 +103,9 @@ export const defaultStageStep: Required<MissionRunnerDeps>['stageStep'] = async
const data = result.data as { proposalId?: string } | undefined;
if (data?.proposalId) return { ok: true, proposalId: data.proposalId };
// Policy resolved to 'auto' — no proposal row was created, the tool
// ran directly. Treat as ok but without a proposal id to thread back.
return { ok: true, proposalId: '' };
// ran directly. Return the payload so the reasoning loop can feed it
// back into the next planner call.
return { ok: true, proposalId: '', autoData: result.data, autoMessage: result.message };
};
export interface RunMissionResult {
@ -225,31 +243,153 @@ export async function runMission(
const availableTools = getAvailableToolsForAi(aiActor);
await checkCancel();
// ── Phase: calling-llm ─────────────────────────────────
await enterPhase('calling-llm', 'frage Planner an');
let plan: AiPlanOutput;
try {
plan = await deps.plan({ mission: mission!, resolvedInputs, availableTools });
} catch (err) {
// Capture even the failure for debug visibility before re-throwing.
if (isAiDebugEnabled()) {
void recordAiDebug({
iterationId,
missionId: mission!.id,
missionTitle: mission!.title,
missionObjective: mission!.objective,
capturedAt: new Date().toISOString(),
resolvedInputs,
preStep,
plannerError: err instanceof Error ? err.message : String(err),
// ── Reasoning loop ─────────────────────────────────────
// Each pass: call planner → stage steps. Auto-tools run inline
// and their outputs become new ResolvedInputs so the NEXT planner
// call can reason over them (e.g. list_notes → see titles →
// stage add_tag_to_note per note). Loop exits when:
// • planner returns 0 steps → agent is done
// • any step requires user approval (propose) → user in the loop
// • budget exhausted (MAX_REASONING_LOOP_ITERATIONS)
// • a step fails hard (not tool-error; executor error)
const stage = deps.stageStep ?? defaultStageStep;
const loopInputs: ResolvedInput[] = [...resolvedInputs];
const recordedSteps: PlanStep[] = [];
const plannerCalls: PlannerCallDebug[] = [];
const loopStepLog: NonNullable<AiDebugEntry['loopSteps']> = [];
let stagedCount = 0;
let failedCount = 0;
let lastPlanSummary = '';
let totalStepCount = 0;
let loopIndex = 0;
let stepCounter = 0;
let humanInLoop = false;
while (loopIndex < MAX_REASONING_LOOP_ITERATIONS) {
// ── Phase: calling-llm ─────────────────────────────
await enterPhase(
'calling-llm',
loopIndex === 0
? 'frage Planner an'
: `Planner Runde ${loopIndex + 1}/${MAX_REASONING_LOOP_ITERATIONS}`
);
let plan: AiPlanOutput;
try {
plan = await deps.plan({ mission: mission!, resolvedInputs: loopInputs, availableTools });
} catch (err) {
if (isAiDebugEnabled()) {
void recordAiDebug({
iterationId,
missionId: mission!.id,
missionTitle: mission!.title,
missionObjective: mission!.objective,
capturedAt: new Date().toISOString(),
resolvedInputs: loopInputs,
preStep,
plannerCalls,
loopSteps: loopStepLog,
plannerError: err instanceof Error ? err.message : String(err),
});
}
throw err;
}
await checkCancel();
if (plan.debug) plannerCalls.push(plan.debug);
lastPlanSummary = plan.summary;
totalStepCount += plan.steps.length;
if (plan.steps.length === 0) {
// Planner has nothing more to do — agent considers this done.
break;
}
// ── Phase: parsing-response ────────────────────────
await enterPhase('parsing-response', `${plan.steps.length} Step(s) erhalten`);
await checkCancel();
// ── Phase: staging-proposals ───────────────────────
const roundOutputs: Array<{ step: PlannedStep; message: string; data: unknown }> = [];
for (const [i, ps] of plan.steps.entries()) {
await enterPhase(
'staging-proposals',
`Runde ${loopIndex + 1} · Step ${i + 1}/${plan.steps.length}`
);
await checkCancel();
const outcome = await stage(ps, aiActor);
const stepId = `${iterationId}-${stepCounter++}`;
if (!outcome.ok) {
failedCount++;
recordedSteps.push({
id: stepId,
summary: ps.summary,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
status: 'failed',
});
continue;
}
stagedCount++;
if (outcome.proposalId) {
// Propose-policy: human must approve. Exit the loop after
// this round so we don't stage proposals for hypothetical
// follow-up steps that depend on the approval outcome.
humanInLoop = true;
recordedSteps.push({
id: stepId,
summary: ps.summary,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
proposalId: outcome.proposalId,
status: 'staged',
});
} else {
// Auto-policy: ran inline. Collect output for the next
// planner call.
recordedSteps.push({
id: stepId,
summary: ps.summary,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
status: 'approved',
});
roundOutputs.push({
step: ps,
message: outcome.autoMessage ?? '(ohne message)',
data: outcome.autoData,
});
}
}
// Log loop outputs for debug-panel visibility.
for (const o of roundOutputs) {
loopStepLog.push({
loopIndex,
toolName: o.step.toolName,
params: o.step.params,
outputPreview: formatToolOutputPreview(o.message, o.data),
});
}
throw err;
}
await checkCancel();
// Persist debug capture if enabled. Off by default in production
// (toggle via Settings or `localStorage.setItem('mana.ai.debug','1')`).
if (humanInLoop) break;
if (roundOutputs.length === 0) {
// Every step either failed or was proposed — nothing new to
// reason over. Prevents an infinite loop when the planner
// only suggests proposable tools that keep failing.
break;
}
// Feed tool outputs into the next planner call as a synthetic
// ResolvedInput so the agent can chain its reasoning.
loopInputs.push({
id: `loop-outputs-${loopIndex}`,
module: 'reasoning-loop',
table: 'tool-outputs',
title: `Zwischenergebnisse (Runde ${loopIndex + 1})`,
content: formatToolOutputsForPrompt(roundOutputs),
});
loopIndex++;
}
if (isAiDebugEnabled()) {
void recordAiDebug({
iterationId,
@ -257,54 +397,20 @@ export async function runMission(
missionTitle: mission!.title,
missionObjective: mission!.objective,
capturedAt: new Date().toISOString(),
resolvedInputs,
resolvedInputs: loopInputs,
preStep,
planner: plan.debug,
plannerCalls,
loopSteps: loopStepLog,
});
}
// ── Phase: parsing-response ────────────────────────────
await enterPhase('parsing-response', `${plan.steps.length} Step(s) erhalten`);
await checkCancel();
// ── Phase: staging-proposals ───────────────────────────
const stage = deps.stageStep ?? defaultStageStep;
const recordedSteps: PlanStep[] = [];
let stagedCount = 0;
let failedCount = 0;
for (const [i, ps] of plan.steps.entries()) {
await enterPhase('staging-proposals', `Step ${i + 1} von ${plan.steps.length}`);
await checkCancel();
const outcome = await stage(ps, aiActor);
if (outcome.ok) {
stagedCount++;
recordedSteps.push({
id: `${iterationId}-${i}`,
summary: ps.summary,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
proposalId: outcome.proposalId || undefined,
status: outcome.proposalId ? 'staged' : 'approved',
});
} else {
failedCount++;
recordedSteps.push({
id: `${iterationId}-${i}`,
summary: ps.summary,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
status: 'failed',
});
}
}
await enterPhase('finalizing');
return {
recordedSteps,
stagedCount,
failedCount,
planSummary: plan.summary,
planStepCount: plan.steps.length,
planSummary: lastPlanSummary,
planStepCount: totalStepCount,
};
}
@ -411,6 +517,44 @@ async function loadKontextAsResolvedInput(): Promise<ResolvedInput | null> {
/** Run the deep-research pipeline against the mission objective and
* collapse its summary + sources into one ResolvedInput formatted so
* the planner can copy URLs into save_news_article calls. */
/** Stringify tool-output payloads for the reasoning loop's next
 * prompt. Kept deliberately compact — LLM context windows are finite
 * and a raw JSON.stringify of a 200-row Dexie dump wastes tokens. */
function formatToolOutputsForPrompt(
  outputs: Array<{ step: PlannedStep; message: string; data: unknown }>
): string {
  // Fixed German preamble instructing the planner how to use the data.
  const header = [
    'Ausgaben der zuletzt ausgeführten Auto-Tools. Nutze diese Daten um die Mission weiterzuführen — z.B. für jede gelistete Notiz einen add_tag_to_note Aufruf pro Notiz.',
    '',
  ];
  // One markdown section per tool call: heading, human message, optional JSON payload.
  const sections = outputs.flatMap((entry) => {
    const block = [
      `### ${entry.step.toolName}(${JSON.stringify(entry.step.params)})`,
      entry.message,
    ];
    if (entry.data !== undefined && entry.data !== null) {
      // safeStringify caps the payload at 4000 chars to protect the context window.
      block.push('```json', safeStringify(entry.data, 4000), '```');
    }
    block.push('');
    return block;
  });
  return [...header, ...sections].join('\n');
}
/** Short form of one tool output for the debug-panel loopSteps log. */
function formatToolOutputPreview(message: string, data: unknown): string {
  // Without a payload the message alone is the preview.
  if (data == null) return message;
  // Cap the payload at 400 chars — the panel only needs a glimpse.
  return `${message}\n${safeStringify(data, 400)}`;
}
/** JSON-stringify `value` (2-space pretty), truncating the result to
 * `limit` characters with a "… (truncated)" marker. Falls back to
 * `String(value)` when the value is not JSON-serializable.
 *
 * Note: JSON.stringify returns `undefined` (not a string) for
 * `undefined`, functions and symbols — the original code relied on the
 * resulting TypeError being caught; handle that case explicitly. */
function safeStringify(value: unknown, limit: number): string {
  try {
    const s = JSON.stringify(value, null, 2);
    // Unserializable top-level values (undefined, function, symbol).
    if (s === undefined) return String(value);
    return s.length > limit ? s.slice(0, limit) + '\n… (truncated)' : s;
  } catch {
    // Circular references, BigInt, throwing toJSON, etc.
    return String(value);
  }
}
interface WebResearchOutcome {
input: ResolvedInput;
sourceCount: number;