feat(shared-ai): LLM-facing task tool wrapper for runSubAgent (M3.2)

Exposes runSubAgent() as a tool the planner LLM can call natively, matching Claude Code's `Task` tool shape: { subagent_type, description, prompt } -> single-string summary. New exports from @mana/shared-ai: - TASK_TOOL_NAME = 'task' - TASK_TOOL_SCHEMA — ToolSchema ready to drop into a runPlannerLoop `tools` array. subagent_type enum = research|plan|general; description+prompt required; defaultPolicy: 'auto' (control-flow, not a user-data write). - createTaskToolHandler(opts) — factory returning: - handle(call): structured ToolResult with the sub-agent's summary as message + data {subAgentType, toolsCalled, rounds, stopReason, usage, description} - cumulativeUsage(): rolled-up TokenUsage across all sub-agent invocations — parent budget accounting reads from here - invocationCount(): metric-ready counter Why not in mana-tool-registry: `task` is a loop-internal control-flow primitive, not a user-data operation. Registry is for habits/notes/etc. where MCP exposure and space-scoping matter. task never touches mana-sync and never crosses the MCP boundary. Recursion guard is defense-in-depth: the primitive throws SubAgentRecursionError, this handler catches parentDepth >= MAX_SUB_AGENT_DEPTH up front and returns a structured ToolResult instead so the LLM sees it as regular tool-feedback. Exceptions from the sub-agent (provider down, network) get wrapped as `{ success: false, message: 'Sub-agent failed: ...' }`. The parent loop's round continues. 14 new tests covering schema shape, recursion rejection, argument validation (4 cases), happy path with tool dispatch, cumulative usage tracking across multiple invocations, exception wrapping, and parent-dispatcher routing. 107 shared-ai tests green total (was 93). M3.3 consumer wiring follows. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 20:21:09 +02:00 · 2026-04-23 19:05:09 +02:00 · 2026-04-23 19:05:09 +02:00 · 101af462a8
commit 101af462a8
parent 7e3f53f8a5
4 changed files with 489 additions and 0 deletions
--- a/packages/shared-ai/src/index.ts
+++ b/packages/shared-ai/src/index.ts
@ -90,6 +90,7 @@ export {
 	DEFAULT_COMPACT_KEEP_RECENT,
 	DEFAULT_COMPACT_MODEL,
 	DEFAULT_COMPACT_THRESHOLD,
 	createTaskToolHandler,
 	MAX_SUB_AGENT_DEPTH,
 	MockLlmClient,
 	parseCompactSummary,
@ -99,6 +100,8 @@ export {
 	runSubAgent,
 	shouldCompact,
 	SubAgentRecursionError,
 	TASK_TOOL_NAME,
 	TASK_TOOL_SCHEMA,
 } from './planner';
 export type {
 	CompactHistoryOptions,
@ -107,6 +110,8 @@ export type {
 	RunSubAgentInput,
 	SubAgentResult,
 	SubAgentType,
 	TaskToolHandler,
 	TaskToolHandlerOptions,
 } from './planner';
 export {
--- a/packages/shared-ai/src/planner/index.ts
+++ b/packages/shared-ai/src/planner/index.ts
@ -23,6 +23,8 @@ export {
 export type { CompactHistoryOptions, CompactHistoryResult, CompactSummary } from './compact';
 export { MAX_SUB_AGENT_DEPTH, SubAgentRecursionError, runSubAgent } from './sub-agent';
 export type { RunSubAgentInput, SubAgentResult, SubAgentType } from './sub-agent';
 export { TASK_TOOL_NAME, TASK_TOOL_SCHEMA, createTaskToolHandler } from './task-tool';
 export type { TaskToolHandler, TaskToolHandlerOptions } from './task-tool';
 export { MockLlmClient } from './mock-llm';
 export type { MockLlmTurn } from './mock-llm';
 export type {
--- a/packages/shared-ai/src/planner/task-tool.test.ts
+++ b/packages/shared-ai/src/planner/task-tool.test.ts
@ -0,0 +1,264 @@
 import { describe, expect, it, vi } from 'vitest';
 import { TASK_TOOL_NAME, TASK_TOOL_SCHEMA, createTaskToolHandler } from './task-tool';
 import { MAX_SUB_AGENT_DEPTH } from './sub-agent';
 import { MockLlmClient } from './mock-llm';
 import type { ToolCallRequest, ToolResult } from './loop';
 import type { ToolSchema } from '../tools/schemas';
 // Minimal parent tool catalog shared by every test in this file: one
 // parameterless tool with policy 'auto' (described as a read) and one tool
 // with policy 'propose' (described as a write) that requires a `title`.
 const parentTools: ToolSchema[] = [
 	{
 		name: 'list_things',
 		module: 'test',
 		description: 'read',
 		defaultPolicy: 'auto',
 		parameters: [],
 	},
 	{
 		name: 'create_thing',
 		module: 'test',
 		description: 'write',
 		defaultPolicy: 'propose',
 		parameters: [{ name: 'title', type: 'string', description: 't', required: true }],
 	},
 ];
 /** Wraps `args` in a ToolCallRequest addressed to the task tool, always using the id 'tc-1'. */
 function makeCall(args: Record<string, unknown>): ToolCallRequest {
 	const request: ToolCallRequest = {
 		id: 'tc-1',
 		name: TASK_TOOL_NAME,
 		arguments: args,
 	};
 	return request;
 }
 // ─── Schema shape ──────────────────────────────────────────────────
 describe('TASK_TOOL_SCHEMA', () => {
 	it('is named "task"', () => {
 		// The schema's own name and the standalone constant must agree.
 		expect(TASK_TOOL_SCHEMA.name).toBe('task');
 		expect(TASK_TOOL_NAME).toBe('task');
 	});
 	it('carries subagent_type enum with research/plan/general', () => {
 		const subagentTypeParam = TASK_TOOL_SCHEMA.parameters.find(
 			({ name }) => name === 'subagent_type'
 		);
 		expect(subagentTypeParam).toBeDefined();
 		// Optional chaining is enough here: the assertion above already fails on a missing param.
 		expect(subagentTypeParam?.enum).toEqual(['research', 'plan', 'general']);
 	});
 	it('requires description + prompt + subagent_type', () => {
 		// Collect required parameter names in declaration order.
 		const requiredNames: string[] = [];
 		for (const param of TASK_TOOL_SCHEMA.parameters) {
 			if (param.required) requiredNames.push(param.name);
 		}
 		expect(requiredNames).toEqual(['subagent_type', 'description', 'prompt']);
 	});
 	it('defaultPolicy is auto (control-flow primitive, not a write)', () => {
 		const { defaultPolicy } = TASK_TOOL_SCHEMA;
 		expect(defaultPolicy).toBe('auto');
 	});
 });
 // ─── Recursion rejection ───────────────────────────────────────────
 describe('createTaskToolHandler — recursion', () => {
 	it('refuses when parentDepth is at the cap (structured error, not throw)', async () => {
 		// The parent already sits at the maximum depth, so the call must come
 		// back as a failed ToolResult rather than as a rejected promise.
 		const noopDispatch = async (): Promise<ToolResult> => ({ success: true, message: '' });
 		const cappedHandler = createTaskToolHandler({
 			llm: new MockLlmClient(),
 			model: 'x/y',
 			parentDepth: MAX_SUB_AGENT_DEPTH,
 			parentTools,
 			parentOnToolCall: noopDispatch,
 		});
 		const nestedCall = makeCall({
 			subagent_type: 'research',
 			description: 'nested',
 			prompt: 'do it',
 		});
 		const outcome = await cappedHandler.handle(nestedCall);
 		expect(outcome.success).toBe(false);
 		expect(outcome.message).toContain('nicht verschachtelt');
 	});
 });
 // ─── Input validation ──────────────────────────────────────────────
 describe('createTaskToolHandler — argument validation', () => {
 	// Every test gets its own top-level (depth 0) handler.
 	const freshHandler = () =>
 		createTaskToolHandler({
 			llm: new MockLlmClient(),
 			model: 'x/y',
 			parentDepth: 0,
 			parentTools,
 			parentOnToolCall: async () => ({ success: true, message: '' }),
 		});
 	/** Dispatches `call` and asserts a failed result whose message contains `fragment`. */
 	async function expectRejected(call: ToolCallRequest, fragment: string): Promise<void> {
 		const outcome = await freshHandler().handle(call);
 		expect(outcome.success).toBe(false);
 		expect(outcome.message).toContain(fragment);
 	}
 	it('rejects non-object args', async () => {
 		const nullArgsCall: ToolCallRequest = {
 			id: 't',
 			name: 'task',
 			arguments: null as unknown as Record<string, unknown>,
 		};
 		await expectRejected(nullArgsCall, 'object');
 	});
 	it('rejects invalid subagent_type', async () => {
 		await expectRejected(
 			makeCall({ subagent_type: 'evil', description: 'x', prompt: 'y' }),
 			'research|plan|general'
 		);
 	});
 	it('rejects empty description', async () => {
 		await expectRejected(
 			makeCall({ subagent_type: 'research', description: '', prompt: 'y' }),
 			'description'
 		);
 	});
 	it('rejects empty prompt', async () => {
 		await expectRejected(
 			makeCall({ subagent_type: 'research', description: 'x', prompt: '' }),
 			'prompt'
 		);
 	});
 });
 // ─── Happy path ────────────────────────────────────────────────────
 describe('createTaskToolHandler — happy path', () => {
 	it('spawns a sub-agent and returns its summary as ToolResult.message', async () => {
 		// Scripted sub-agent: one tool round, then a final answer.
 		const llm = new MockLlmClient()
 			.enqueueToolCalls([{ name: 'list_things', args: {} }])
 			.enqueueStop('Found 2 items: a, b');
 		const parentDispatch = vi.fn(
 			async (_c: ToolCallRequest): Promise<ToolResult> => ({
 				success: true,
 				data: ['a', 'b'],
 				message: '2 items',
 			})
 		);
 		const handler = createTaskToolHandler({
 			llm,
 			model: 'google/gemini-2.5-flash-lite',
 			parentDepth: 0,
 			parentTools,
 			parentOnToolCall: parentDispatch,
 		});
 		const res = await handler.handle(
 			makeCall({
 				subagent_type: 'research',
 				description: 'scan things',
 				prompt: 'list everything and report back',
 			})
 		);
 		expect(res.success).toBe(true);
 		expect(res.message).toBe('Found 2 items: a, b');
 		const data = res.data as {
 			subAgentType: string;
 			toolsCalled: number;
 			rounds: number;
 			stopReason: string;
 		};
 		expect(data.subAgentType).toBe('research');
 		expect(data.toolsCalled).toBe(1);
 		expect(data.rounds).toBeGreaterThanOrEqual(2);
 		expect(parentDispatch).toHaveBeenCalledTimes(1);
 	});
 	it('tracks cumulative usage across multiple invocations', async () => {
 		const llm = new MockLlmClient();
 		// Two sub-agent runs, each reports usage. The turns are pushed straight
 		// onto the mock's internal queue so that each one carries a `usage`
 		// record. NOTE(review): presumably the public enqueue helpers cannot
 		// attach usage — confirm against MockLlmClient.
 		for (let i = 0; i < 2; i++) {
 			(llm as unknown as { queue: unknown[] }).queue.push({
 				content: `summary-${i}`,
 				toolCalls: [],
 				finishReason: 'stop',
 				usage: { promptTokens: 100, completionTokens: 30, totalTokens: 130 },
 			});
 		}
 		const handler = createTaskToolHandler({
 			llm,
 			model: 'google/gemini-2.5-flash-lite',
 			parentDepth: 0,
 			parentTools,
 			parentOnToolCall: async () => ({ success: true, message: '' }),
 		});
 		await handler.handle(makeCall({ subagent_type: 'plan', description: 'a', prompt: 'one' }));
 		await handler.handle(makeCall({ subagent_type: 'plan', description: 'b', prompt: 'two' }));
 		expect(handler.invocationCount()).toBe(2);
 		const usage = handler.cumulativeUsage();
 		expect(usage.promptTokens).toBe(200);
 		expect(usage.completionTokens).toBe(60);
 		expect(usage.totalTokens).toBe(260);
 	});
 	it('counts zero usage if no successful sub-agent ran', async () => {
 		const handler = createTaskToolHandler({
 			llm: new MockLlmClient(),
 			model: 'x/y',
 			parentDepth: 0,
 			parentTools,
 			parentOnToolCall: async () => ({ success: true, message: '' }),
 		});
 		// Drive one call that is rejected during argument validation. The
 		// handler returns before a sub-agent is started, so the counters must
 		// stay untouched — without this call the test would only check the
 		// handler's initial state.
 		const rejected = await handler.handle(
 			makeCall({ subagent_type: 'evil', description: 'x', prompt: 'y' })
 		);
 		expect(rejected.success).toBe(false);
 		expect(handler.invocationCount()).toBe(0);
 		expect(handler.cumulativeUsage()).toEqual({
 			promptTokens: 0,
 			completionTokens: 0,
 			totalTokens: 0,
 		});
 	});
 	it('wraps sub-agent exceptions as structured ToolResult failures', async () => {
 		// Stand-in LLM client whose only method always throws.
 		const llm = {
 			async complete() {
 				throw new Error('provider is down');
 			},
 		};
 		const handler = createTaskToolHandler({
 			llm,
 			model: 'x/y',
 			parentDepth: 0,
 			parentTools,
 			parentOnToolCall: async () => ({ success: true, message: '' }),
 		});
 		const res = await handler.handle(
 			makeCall({ subagent_type: 'general', description: 'x', prompt: 'y' })
 		);
 		expect(res.success).toBe(false);
 		expect(res.message).toContain('Sub-agent failed');
 		expect(res.message).toContain('provider is down');
 		// A run that threw contributes neither to the invocation counter nor
 		// to the rolled-up usage.
 		expect(handler.invocationCount()).toBe(0);
 		expect(handler.cumulativeUsage()).toEqual({
 			promptTokens: 0,
 			completionTokens: 0,
 			totalTokens: 0,
 		});
 	});
 });
 // ─── Tool-routing through parent dispatcher ────────────────────────
 describe('createTaskToolHandler — tool routing', () => {
 	it('sub-agent tool calls route through parent dispatcher (policy/audit stays reused)', async () => {
 		// Scripted sub-agent: one call to a parent tool, then a final answer.
 		const scriptedLlm = new MockLlmClient()
 			.enqueueToolCalls([{ name: 'list_things', args: {} }])
 			.enqueueStop('summary');
 		// Spy standing in for the parent's dispatcher.
 		const parentDispatch = vi.fn(
 			async (_c: ToolCallRequest): Promise<ToolResult> => ({
 				success: true,
 				message: 'from parent',
 			})
 		);
 		const { handle } = createTaskToolHandler({
 			llm: scriptedLlm,
 			model: 'x/y',
 			parentDepth: 0,
 			parentTools,
 			parentOnToolCall: parentDispatch,
 		});
 		await handle(makeCall({ subagent_type: 'research', description: 'd', prompt: 'p' }));
 		expect(parentDispatch).toHaveBeenCalled();
 	});
 });
--- a/packages/shared-ai/src/planner/task-tool.ts
+++ b/packages/shared-ai/src/planner/task-tool.ts
@ -0,0 +1,218 @@
 /**
 * LLM-facing wrapper for `runSubAgent`.
 *
 * Claude Code exposes sub-agent launching to the model as a `Task` tool
 * — the model writes `{ subagent_type, description, prompt }`, the
 * harness spins up the sub-agent and returns a single summary string as
 * the tool-result. This module provides the same surface, typed and
 * testable, ready to be plugged into any `runPlannerLoop` caller's
 * `tools` array and `onToolCall` dispatcher.
 *
 * Why this lives in shared-ai, not in `@mana/tool-registry`:
 *   - `task` is a **loop-internal control-flow tool**, not a user-data
 *     operation. It never writes to mana-sync, never carries a spaceId
 *     beyond the parent's, never appears in MCP.
 *   - Every caller of `runPlannerLoop` needs the same wiring: drop the
 *     schema into `tools`, branch `onToolCall` on `name === 'task'`,
 *     route to `taskToolHandler`. Registry would be overkill.
 *   - The `task` schema has a dynamically-filtered enum for
 *     `subagent_type` that depends on the caller's deployment — not a
 *     good fit for a static registry export.
 *
 * Telemetry: callers can read `cumulativeUsage()` from the handler the
 * factory returns to attribute sub-agent tokens to their own budget
 * counters (mana-ai's per-agent daily cap, companion's session cost).
 */
 import type { ToolSchema } from '../tools/schemas';
 import { runSubAgent, MAX_SUB_AGENT_DEPTH } from './sub-agent';
 import type { LlmClient, ReminderChannel, TokenUsage, ToolCallRequest, ToolResult } from './loop';
 import type { SubAgentType } from './sub-agent';
 /**
 * Name under which the sub-agent launcher is exposed as a tool.
 * Exported separately so an `onToolCall` dispatcher can compare
 * `call.name` against it without pulling in the full schema object.
 */
 export const TASK_TOOL_NAME = 'task' as const;
 /**
 * Ready-made ToolSchema for the `task` tool, meant to be placed in the
 * `tools` array handed to `runPlannerLoop`. The policy is 'auto': the
 * tool is a control-flow primitive the LLM may invoke mid-reasoning,
 * not a destructive or user-visible write.
 *
 * To expose only some SubAgentTypes, copy this schema and shorten the
 * `enum` of the `subagent_type` parameter before registering the copy.
 */
 export const TASK_TOOL_SCHEMA: ToolSchema = {
 	name: TASK_TOOL_NAME,
 	module: '_agent',
 	// Sentence fragments are joined with single spaces into one string.
 	description: [
 		'Launch a context-isolated sub-agent to handle a focused task.',
 		'Use this when a sub-step would add a lot of noise to the main conversation (large search, detailed investigation, long planning).',
 		'The sub-agent runs with fresh conversation state and a restricted tool set,',
 		'then returns a single-string summary. You do NOT see its individual tool calls.',
 		'Cannot be called recursively — sub-agents can NOT launch further sub-agents.',
 	].join(' '),
 	defaultPolicy: 'auto',
 	parameters: [
 		{
 			name: 'subagent_type',
 			type: 'string',
 			required: true,
 			description: [
 				"Archetype: 'research' for read-only fact-finding,",
 				"'plan' for a thinking pass with minimal tools,",
 				"'general' for heterogeneous work including writes.",
 			].join(' '),
 			enum: ['research', 'plan', 'general'] as const,
 		},
 		{
 			name: 'description',
 			type: 'string',
 			required: true,
 			description: 'Short title for logging (≤ 80 chars). Not shown to the sub-agent.',
 		},
 		{
 			name: 'prompt',
 			type: 'string',
 			required: true,
 			description:
 				'The actual task for the sub-agent. Be explicit about what you want in the returned summary.',
 		},
 	],
 } as const;
 /**
 * Hand-written validation of the raw `arguments` payload of a `task`
 * tool call. Deliberately lightweight so this module does not need a
 * schema library.
 *
 * @param raw - Untrusted arguments as received from the LLM.
 * @returns The typed `{ type, description, prompt }` triple on success,
 *   otherwise a human-readable error string that the caller relays to
 *   the LLM as a tool failure. `description` and `prompt` are returned
 *   exactly as received (not trimmed).
 */
 function parseTaskArgs(
 	raw: unknown
 ): { type: SubAgentType; description: string; prompt: string } | string {
 	// Arrays also report `typeof 'object'` but cannot carry named arguments,
 	// so they are rejected with the same message as other non-objects.
 	if (!raw || typeof raw !== 'object' || Array.isArray(raw)) {
 		return 'arguments must be an object';
 	}
 	const { subagent_type: type, description, prompt } = raw as Record<string, unknown>;
 	if (type !== 'research' && type !== 'plan' && type !== 'general') {
 		return `subagent_type must be research|plan|general, got ${JSON.stringify(type)}`;
 	}
 	// Whitespace-only strings count as missing: launching a sub-agent with a
 	// blank task would spend a model call on nothing.
 	if (typeof description !== 'string' || description.trim().length === 0) {
 		return 'description is required';
 	}
 	if (typeof prompt !== 'string' || prompt.trim().length === 0) {
 		return 'prompt is required';
 	}
 	return { type, description, prompt };
 }
 /**
 * Parent-loop context that `createTaskToolHandler` binds into the
 * handler. Apart from the recursion check on `parentDepth`, every field
 * is forwarded unchanged to `runSubAgent` on each handled call.
 */
 export interface TaskToolHandlerOptions {
 	/** LLM client the sub-agent runs against; forwarded to `runSubAgent`. */
 	readonly llm: LlmClient;
 	/** Model the sub-agent calls through. Callers typically route this
 	 *  to a cheaper tier (Haiku/flash-lite) since sub-agents are by
 	 *  construction short + summarisation-heavy. */
 	readonly model: string;
 	/** Current recursion depth in the parent loop. Pass 0 for a
 	 *  top-level call; the handler returns a failure result without
 	 *  starting a sub-agent once this is >= MAX_SUB_AGENT_DEPTH. */
 	readonly parentDepth: number;
 	/** Parent's full tool catalog, handed to `runSubAgent` as-is.
 	 *  NOTE(review): narrowing per subagent_type presumably happens
 	 *  inside `runSubAgent` — confirm there. */
 	readonly parentTools: readonly ToolSchema[];
 	/** The parent's own tool dispatcher. It is passed verbatim (not
 	 *  wrapped) as the sub-agent's `onToolCall`, so sub-agent tool
 	 *  calls go through the same function the parent uses. */
 	readonly parentOnToolCall: (call: ToolCallRequest) => Promise<ToolResult>;
 	/** Optional reminder channel for the sub-agent. Usually narrower
 	 *  than the parent's. */
 	readonly reminderChannel?: ReminderChannel;
 }
 /**
 * Object returned by `createTaskToolHandler`: the dispatcher branch for
 * the `task` tool plus two read-only counters over its past runs.
 */
 export interface TaskToolHandler {
 	/** The actual `onToolCall` branch. Returns a ToolResult whose
 	 *  `message` is the sub-agent's summary (or the failure reason).
 	 *  Rejections and sub-agent errors are reported through
 	 *  `success: false` rather than by throwing. */
 	readonly handle: (call: ToolCallRequest) => Promise<ToolResult>;
 	/** Rolled-up usage from every sub-agent run that returned a result,
 	 *  so the parent can attribute tokens to its budget. Rejected calls
 	 *  and runs that threw add nothing. `totalTokens` is the sum of the
 	 *  prompt and completion counters. Zeros until the first
 	 *  successful run. */
 	readonly cumulativeUsage: () => TokenUsage;
 	/** How many sub-agent runs have returned a result so far. Calls
 	 *  rejected up front and runs that threw are not counted. Useful
 	 *  for metrics dashboards. */
 	readonly invocationCount: () => number;
 }
 /**
 * Builds a `task` tool handler bound to one parent-loop context, so the
 * consumer's `onToolCall` branch only has to call `handler.handle(call)`
 * instead of re-supplying llm/model/tools each time.
 *
 * @param opts - Parent context; read on every handled call.
 * @returns The handler together with its usage and invocation counters.
 *   `handle` never rejects for recursion, bad arguments, or sub-agent
 *   errors — each of those comes back as `{ success: false, message }`.
 */
 export function createTaskToolHandler(opts: TaskToolHandlerOptions): TaskToolHandler {
 	// Running totals over every sub-agent run that returned a result.
 	const totals = { prompt: 0, completion: 0, runs: 0 };
 	// All failure paths share the same `{ success: false, message }` shape.
 	const reject = (message: string): ToolResult => ({ success: false, message });
 	async function handle(call: ToolCallRequest): Promise<ToolResult> {
 		// Depth is checked here before anything runs, so the LLM receives the
 		// refusal as ordinary tool feedback instead of an exception.
 		if (opts.parentDepth >= MAX_SUB_AGENT_DEPTH) {
 			return reject(
 				[
 					`Sub-Agents duerfen nicht verschachtelt werden. Parent-Depth ${opts.parentDepth}`,
 					`>= MAX_SUB_AGENT_DEPTH ${MAX_SUB_AGENT_DEPTH}.`,
 					`Fuehre die Aufgabe stattdessen im aktuellen Loop aus oder brich ab.`,
 				].join(' ')
 			);
 		}
 		const args = parseTaskArgs(call.arguments);
 		if (typeof args === 'string') {
 			return reject(`Invalid task args: ${args}`);
 		}
 		try {
 			const run = await runSubAgent({
 				llm: opts.llm,
 				model: opts.model,
 				type: args.type,
 				task: args.prompt,
 				parentTools: opts.parentTools,
 				onToolCall: opts.parentOnToolCall,
 				parentDepth: opts.parentDepth,
 				reminderChannel: opts.reminderChannel,
 			});
 			// Counters are updated before the result object is assembled.
 			totals.prompt += run.usage.promptTokens;
 			totals.completion += run.usage.completionTokens;
 			totals.runs += 1;
 			const { rawResult } = run;
 			return {
 				success: true,
 				message: run.summary,
 				data: {
 					subAgentType: run.type,
 					toolsCalled: rawResult.executedCalls.length,
 					rounds: rawResult.rounds,
 					stopReason: rawResult.stopReason,
 					usage: run.usage,
 					description: args.description,
 				},
 			};
 		} catch (cause) {
 			// Anything thrown while running the sub-agent becomes a failed tool result.
 			const reason = cause instanceof Error ? cause.message : String(cause);
 			return reject(`Sub-agent failed: ${reason}`);
 		}
 	}
 	function cumulativeUsage(): TokenUsage {
 		return {
 			promptTokens: totals.prompt,
 			completionTokens: totals.completion,
 			totalTokens: totals.prompt + totals.completion,
 		};
 	}
 	function invocationCount(): number {
 		return totals.runs;
 	}
 	return { handle, cumulativeUsage, invocationCount };
 }