diff --git a/packages/shared-ai/src/index.ts b/packages/shared-ai/src/index.ts
index 6444f6c3f..93361efce 100644
--- a/packages/shared-ai/src/index.ts
+++ b/packages/shared-ai/src/index.ts
@@ -90,6 +90,7 @@ export {
 	DEFAULT_COMPACT_KEEP_RECENT,
 	DEFAULT_COMPACT_MODEL,
 	DEFAULT_COMPACT_THRESHOLD,
+	createTaskToolHandler,
 	MAX_SUB_AGENT_DEPTH,
 	MockLlmClient,
 	parseCompactSummary,
@@ -99,6 +100,8 @@ export {
 	runSubAgent,
 	shouldCompact,
 	SubAgentRecursionError,
+	TASK_TOOL_NAME,
+	TASK_TOOL_SCHEMA,
 } from './planner';
 export type {
 	CompactHistoryOptions,
@@ -107,6 +110,8 @@ export type {
 	RunSubAgentInput,
 	SubAgentResult,
 	SubAgentType,
+	TaskToolHandler,
+	TaskToolHandlerOptions,
 } from './planner';
 
 export {
diff --git a/packages/shared-ai/src/planner/index.ts b/packages/shared-ai/src/planner/index.ts
index fe10a2b89..abfcb5884 100644
--- a/packages/shared-ai/src/planner/index.ts
+++ b/packages/shared-ai/src/planner/index.ts
@@ -23,6 +23,8 @@ export {
 export type { CompactHistoryOptions, CompactHistoryResult, CompactSummary } from './compact';
 export { MAX_SUB_AGENT_DEPTH, SubAgentRecursionError, runSubAgent } from './sub-agent';
 export type { RunSubAgentInput, SubAgentResult, SubAgentType } from './sub-agent';
+export { TASK_TOOL_NAME, TASK_TOOL_SCHEMA, createTaskToolHandler } from './task-tool';
+export type { TaskToolHandler, TaskToolHandlerOptions } from './task-tool';
 export { MockLlmClient } from './mock-llm';
 export type { MockLlmTurn } from './mock-llm';
 export type {
diff --git a/packages/shared-ai/src/planner/task-tool.test.ts b/packages/shared-ai/src/planner/task-tool.test.ts
new file mode 100644
index 000000000..7055fe18d
--- /dev/null
+++ b/packages/shared-ai/src/planner/task-tool.test.ts
@@ -0,0 +1,264 @@
+import { describe, expect, it, vi } from 'vitest';
+import { TASK_TOOL_NAME, TASK_TOOL_SCHEMA, createTaskToolHandler } from './task-tool';
+import { MAX_SUB_AGENT_DEPTH } from './sub-agent';
+import { MockLlmClient } from './mock-llm';
+import type { ToolCallRequest, ToolResult } from './loop';
+import type { ToolSchema } from '../tools/schemas';
+
+const parentTools: ToolSchema[] = [
+	{
+		name: 'list_things',
+		module: 'test',
+		description: 'read',
+		defaultPolicy: 'auto',
+		parameters: [],
+	},
+	{
+		name: 'create_thing',
+		module: 'test',
+		description: 'write',
+		defaultPolicy: 'propose',
+		parameters: [{ name: 'title', type: 'string', description: 't', required: true }],
+	},
+];
+
+function makeCall(args: Record<string, unknown>): ToolCallRequest {
+	return { id: 'tc-1', name: TASK_TOOL_NAME, arguments: args };
+}
+
+// ─── Schema shape ──────────────────────────────────────────────────
+
+describe('TASK_TOOL_SCHEMA', () => {
+	it('is named "task"', () => {
+		expect(TASK_TOOL_SCHEMA.name).toBe('task');
+		expect(TASK_TOOL_NAME).toBe('task');
+	});
+
+	it('carries subagent_type enum with research/plan/general', () => {
+		const typeParam = TASK_TOOL_SCHEMA.parameters.find((p) => p.name === 'subagent_type');
+		expect(typeParam).toBeDefined();
+		expect(typeParam!.enum).toEqual(['research', 'plan', 'general']);
+	});
+
+	it('requires description + prompt + subagent_type', () => {
+		const required = TASK_TOOL_SCHEMA.parameters.filter((p) => p.required).map((p) => p.name);
+		expect(required).toEqual(['subagent_type', 'description', 'prompt']);
+	});
+
+	it('defaultPolicy is auto (control-flow primitive, not a write)', () => {
+		expect(TASK_TOOL_SCHEMA.defaultPolicy).toBe('auto');
+	});
+});
+
+// ─── Recursion rejection ───────────────────────────────────────────
+
+describe('createTaskToolHandler — recursion', () => {
+	it('refuses when parentDepth is at the cap (structured error, not throw)', async () => {
+		const handler = createTaskToolHandler({
+			llm: new MockLlmClient(),
+			model: 'x/y',
+			parentDepth: MAX_SUB_AGENT_DEPTH,
+			parentTools,
+			parentOnToolCall: async () => ({ success: true, message: '' }),
+		});
+
+		const res = await handler.handle(
+			makeCall({ subagent_type: 'research', description: 'nested', prompt: 'do it' })
+		);
+		expect(res.success).toBe(false);
+		expect(res.message).toContain('nicht verschachtelt');
+	});
+});
+
+// ─── Input validation ──────────────────────────────────────────────
+
+describe('createTaskToolHandler — argument validation', () => {
+	function make() {
+		return createTaskToolHandler({
+			llm: new MockLlmClient(),
+			model: 'x/y',
+			parentDepth: 0,
+			parentTools,
+			parentOnToolCall: async () => ({ success: true, message: '' }),
+		});
+	}
+
+	it('rejects non-object args', async () => {
+		const res = await make().handle({
+			id: 't',
+			name: 'task',
+			arguments: null as unknown as Record<string, unknown>,
+		});
+		expect(res.success).toBe(false);
+		expect(res.message).toContain('object');
+	});
+
+	it('rejects invalid subagent_type', async () => {
+		const res = await make().handle(
+			makeCall({ subagent_type: 'evil', description: 'x', prompt: 'y' })
+		);
+		expect(res.success).toBe(false);
+		expect(res.message).toContain('research|plan|general');
+	});
+
+	it('rejects empty description', async () => {
+		const res = await make().handle(
+			makeCall({ subagent_type: 'research', description: '', prompt: 'y' })
+		);
+		expect(res.success).toBe(false);
+		expect(res.message).toContain('description');
+	});
+
+	it('rejects empty prompt', async () => {
+		const res = await make().handle(
+			makeCall({ subagent_type: 'research', description: 'x', prompt: '' })
+		);
+		expect(res.success).toBe(false);
+		expect(res.message).toContain('prompt');
+	});
+});
+
+// ─── Happy path ────────────────────────────────────────────────────
+
+describe('createTaskToolHandler — happy path', () => {
+	it('spawns a sub-agent and returns its summary as ToolResult.message', async () => {
+		const llm = new MockLlmClient()
+			.enqueueToolCalls([{ name: 'list_things', args: {} }])
+			.enqueueStop('Found 2 items: a, b');
+
+		const parentDispatch = vi.fn(
+			async (_c: ToolCallRequest): Promise<ToolResult> => ({
+				success: true,
+				data: ['a', 'b'],
+				message: '2 items',
+			})
+		);
+
+		const handler = createTaskToolHandler({
+			llm,
+			model: 'google/gemini-2.5-flash-lite',
+			parentDepth: 0,
+			parentTools,
+			parentOnToolCall: parentDispatch,
+		});
+
+		const res = await handler.handle(
+			makeCall({
+				subagent_type: 'research',
+				description: 'scan things',
+				prompt: 'list everything and report back',
+			})
+		);
+
+		expect(res.success).toBe(true);
+		expect(res.message).toBe('Found 2 items: a, b');
+
+		const data = res.data as {
+			subAgentType: string;
+			toolsCalled: number;
+			rounds: number;
+			stopReason: string;
+		};
+		expect(data.subAgentType).toBe('research');
+		expect(data.toolsCalled).toBe(1);
+		expect(data.rounds).toBeGreaterThanOrEqual(2);
+		expect(parentDispatch).toHaveBeenCalledTimes(1);
+	});
+
+	it('tracks cumulative usage across multiple invocations', async () => {
+		const llm = new MockLlmClient();
+		// Two sub-agent runs, each reports usage (queued via a cast so the turn can carry `usage`).
+		for (let i = 0; i < 2; i++) {
+			(llm as unknown as { queue: unknown[] }).queue.push({
+				content: `summary-${i}`,
+				toolCalls: [],
+				finishReason: 'stop',
+				usage: { promptTokens: 100, completionTokens: 30, totalTokens: 130 },
+			});
+		}
+
+		const handler = createTaskToolHandler({
+			llm,
+			model: 'google/gemini-2.5-flash-lite',
+			parentDepth: 0,
+			parentTools,
+			parentOnToolCall: async () => ({ success: true, message: '' }),
+		});
+
+		await handler.handle(makeCall({ subagent_type: 'plan', description: 'a', prompt: 'one' }));
+		await handler.handle(makeCall({ subagent_type: 'plan', description: 'b', prompt: 'two' }));
+
+		expect(handler.invocationCount()).toBe(2);
+		const usage = handler.cumulativeUsage();
+		expect(usage.promptTokens).toBe(200);
+		expect(usage.completionTokens).toBe(60);
+		expect(usage.totalTokens).toBe(260);
+	});
+
+	it('counts zero usage if no successful sub-agent ran', async () => {
+		const handler = createTaskToolHandler({
+			llm: new MockLlmClient(),
+			model: 'x/y',
+			parentDepth: 0,
+			parentTools,
+			parentOnToolCall: async () => ({ success: true, message: '' }),
+		});
+		expect(handler.invocationCount()).toBe(0);
+		expect(handler.cumulativeUsage()).toEqual({
+			promptTokens: 0,
+			completionTokens: 0,
+			totalTokens: 0,
+		});
+	});
+
+	it('wraps sub-agent exceptions as structured ToolResult failures', async () => {
+		const llm = {
+			async complete() {
+				throw new Error('provider is down');
+			},
+		};
+
+		const handler = createTaskToolHandler({
+			llm,
+			model: 'x/y',
+			parentDepth: 0,
+			parentTools,
+			parentOnToolCall: async () => ({ success: true, message: '' }),
+		});
+
+		const res = await handler.handle(
+			makeCall({ subagent_type: 'general', description: 'x', prompt: 'y' })
+		);
+		expect(res.success).toBe(false);
+		expect(res.message).toContain('Sub-agent failed');
+		expect(res.message).toContain('provider is down');
+	});
+});
+
+// ─── Tool-routing through parent dispatcher ────────────────────────
+
+describe('createTaskToolHandler — tool routing', () => {
+	it('sub-agent tool calls route through parent dispatcher (policy/audit stays reused)', async () => {
+		const llm = new MockLlmClient()
+			.enqueueToolCalls([{ name: 'list_things', args: {} }])
+			.enqueueStop('summary');
+
+		let parentCalled = false;
+		const parentDispatch = async (_c: ToolCallRequest): Promise<ToolResult> => {
+			parentCalled = true;
+			return { success: true, message: 'from parent' };
+		};
+
+		const handler = createTaskToolHandler({
+			llm,
+			model: 'x/y',
+			parentDepth: 0,
+			parentTools,
+			parentOnToolCall: parentDispatch,
+		});
+
+		await handler.handle(makeCall({ subagent_type: 'research', description: 'd', prompt: 'p' }));
+
+		expect(parentCalled).toBe(true);
+	});
+});
diff --git a/packages/shared-ai/src/planner/task-tool.ts b/packages/shared-ai/src/planner/task-tool.ts
new file mode 100644
index 000000000..36f0b8291
--- /dev/null
+++ b/packages/shared-ai/src/planner/task-tool.ts
@@ -0,0 +1,218 @@
+/**
+ * LLM-facing wrapper for `runSubAgent`.
+ *
+ * Claude Code exposes sub-agent launching to the model as a `Task` tool
+ * — the model writes `{ subagent_type, description, prompt }`, the
+ * harness spins up the sub-agent and returns a single summary string as
+ * the tool-result. This module provides the same surface, typed and
+ * testable, ready to be plugged into any `runPlannerLoop` caller's
+ * `tools` array and `onToolCall` dispatcher.
+ *
+ * Why this lives in shared-ai, not in `@mana/tool-registry`:
+ * - `task` is a **loop-internal control-flow tool**, not a user-data
+ *   operation. It never writes to mana-sync, never carries a spaceId
+ *   beyond the parent's, never appears in MCP.
+ * - Every caller of `runPlannerLoop` needs the same wiring: drop the
+ *   schema into `tools`, branch `onToolCall` on `name === 'task'`,
+ *   route to `handler.handle(call)`. Registry would be overkill.
+ * - The `subagent_type` enum is meant to be narrowed per caller
+ *   (clone the schema, filter the enum) depending on the deployment
+ *   — not a good fit for a static registry export.
+ *
+ * Telemetry: callers can read `cumulativeUsage()` on the factory's
+ * returned handler to attribute sub-agent tokens to their own budget
+ * counters (mana-ai's per-agent daily cap, companion's session cost).
+ */
+
+import type { ToolSchema } from '../tools/schemas';
+import { runSubAgent, MAX_SUB_AGENT_DEPTH } from './sub-agent';
+import type { LlmClient, ReminderChannel, TokenUsage, ToolCallRequest, ToolResult } from './loop';
+import type { SubAgentType } from './sub-agent';
+
+/**
+ * Canonical tool name. Static so consumers can `if (call.name ===
+ * TASK_TOOL_NAME)` in their onToolCall branch without importing the
+ * whole schema.
+ */
+export const TASK_TOOL_NAME = 'task';
+
+/**
+ * Default ToolSchema for the `task` tool, compatible with
+ * `runPlannerLoop`'s `tools` input. `defaultPolicy` is 'auto' because
+ * the LLM may legitimately call this mid-reasoning — it is NOT a
+ * destructive or user-visible write, it's a control-flow primitive.
+ *
+ * Consumers that want to restrict which SubAgentTypes are exposable
+ * can clone this schema and narrow the `enum` on `subagent_type`
+ * before dropping it into their `tools` array.
+ */
+export const TASK_TOOL_SCHEMA: ToolSchema = {
+	name: TASK_TOOL_NAME,
+	module: '_agent',
+	description:
+		'Launch a context-isolated sub-agent to handle a focused task. ' +
+		'Use this when a sub-step would add a lot of noise to the main conversation (large search, detailed investigation, long planning). ' +
+		'The sub-agent runs with fresh conversation state and a restricted tool set, ' +
+		'then returns a single-string summary. You do NOT see its individual tool calls. ' +
+		'Cannot be called recursively — sub-agents can NOT launch further sub-agents.',
+	defaultPolicy: 'auto',
+	parameters: [
+		{
+			name: 'subagent_type',
+			type: 'string',
+			required: true,
+			description:
+				"Archetype: 'research' for read-only fact-finding, " +
+				"'plan' for a thinking pass with minimal tools, " +
+				"'general' for heterogeneous work including writes.",
+			enum: ['research', 'plan', 'general'] as const, // keep in sync with the literals parseTaskArgs accepts
+		},
+		{
+			name: 'description',
+			type: 'string',
+			required: true,
+			description: 'Short title for logging (≤ 80 chars). Not shown to the sub-agent.', // the 80-char hint is not enforced by parseTaskArgs
+		},
+		{
+			name: 'prompt',
+			type: 'string',
+			required: true,
+			description:
+				'The actual task for the sub-agent. Be explicit about what you want in the returned summary.',
+		},
+	],
+} as const;
+
+/**
+ * Zod-ish input validation. We stay lightweight so this module can
+ * avoid a zod dependency — the loop already re-validates through the
+ * tool-schema, and any parse error falls through as a tool-failure
+ * that the LLM can react to.
+ * Returns the parsed fields, or a reason string for the first failed check. */
+function parseTaskArgs(
+	raw: unknown
+): { type: SubAgentType; description: string; prompt: string } | string {
+	if (!raw || typeof raw !== 'object' || Array.isArray(raw)) return 'arguments must be an object';
+	const o = raw as Record<string, unknown>;
+	const type = o.subagent_type;
+	const description = o.description;
+	const prompt = o.prompt;
+	if (type !== 'research' && type !== 'plan' && type !== 'general') {
+		return `subagent_type must be research|plan|general, got ${JSON.stringify(type)}`;
+	}
+	if (typeof description !== 'string' || description.trim().length === 0) {
+		return 'description is required';
+	}
+	if (typeof prompt !== 'string' || prompt.trim().length === 0) {
+		return 'prompt is required';
+	}
+	return { type, description, prompt };
+}
+
+export interface TaskToolHandlerOptions {
+	readonly llm: LlmClient;
+	/** Model the sub-agent calls through. Callers typically route this
+	 *  to a cheaper tier (Haiku/flash-lite) since sub-agents are by
+	 *  construction short + summarisation-heavy. */
+	readonly model: string;
+	/** Current recursion depth in the parent loop. Pass 0 for a
+	 *  top-level call; the handler refuses at depth >= MAX_SUB_AGENT_DEPTH. */
+	readonly parentDepth: number;
+	/** Parent's full tool catalog, handed to `runSubAgent` unchanged;
+	 *  narrowing per subagent_type is left to `runSubAgent`. */
+	readonly parentTools: readonly ToolSchema[];
+	/** The parent's own tool dispatcher. Sub-agent tool calls get
+	 *  routed through here — it is passed to `runSubAgent` as-is so the
+	 *  parent's executor, policy gate, and audit trail are reused. */
+	readonly parentOnToolCall: (call: ToolCallRequest) => Promise<ToolResult>;
+	/** Optional reminder channel for the sub-agent. Usually narrower
+	 *  than the parent's. */
+	readonly reminderChannel?: ReminderChannel;
+}
+
+export interface TaskToolHandler {
+	/** The actual `onToolCall` branch. Returns a ToolResult whose
+	 *  `message` is the sub-agent's summary (or the failure reason). */
+	readonly handle: (call: ToolCallRequest) => Promise<ToolResult>;
+	/** Rolled-up usage from every successful sub-agent run so far, so
+	 *  the parent can attribute tokens to its budget. Zeros until the
+	 *  first successful run; `totalTokens` is prompt + completion. */
+	readonly cumulativeUsage: () => TokenUsage;
+	/** How many sub-agent runs have completed successfully so far
+	 *  (rejected or failed calls are not counted). Useful for metrics. */
+	readonly invocationCount: () => number;
+}
+
+/**
+ * Factory: bind the handler to a parent context so the consumer's
+ * `onToolCall` branch can just call `handler.handle(call)`. Recursion, invalid
+ * arguments and sub-agent errors come back as `success: false` results, not throws.
+ */
+export function createTaskToolHandler(opts: TaskToolHandlerOptions): TaskToolHandler {
+	let promptTokens = 0;
+	let completionTokens = 0;
+	let invocations = 0;
+
+	const handle = async (call: ToolCallRequest): Promise<ToolResult> => {
+		// Defence-in-depth: the primitive throws too, but returning a
+		// structured ToolResult lets the LLM see the rejection as
+		// regular tool-feedback instead of bubbling up an exception.
+		if (opts.parentDepth >= MAX_SUB_AGENT_DEPTH) {
+			return {
+				success: false,
+				message:
+					`Sub-Agents duerfen nicht verschachtelt werden. Parent-Depth ${opts.parentDepth} ` +
+					`>= MAX_SUB_AGENT_DEPTH ${MAX_SUB_AGENT_DEPTH}. ` +
+					`Fuehre die Aufgabe stattdessen im aktuellen Loop aus oder brich ab.`,
+			};
+		}
+
+		const parsed = parseTaskArgs(call.arguments);
+		if (typeof parsed === 'string') {
+			return { success: false, message: `Invalid task args: ${parsed}` };
+		}
+
+		try {
+			const result = await runSubAgent({
+				llm: opts.llm,
+				model: opts.model,
+				type: parsed.type,
+				task: parsed.prompt,
+				parentTools: opts.parentTools,
+				onToolCall: opts.parentOnToolCall,
+				parentDepth: opts.parentDepth,
+				reminderChannel: opts.reminderChannel,
+			});
+
+			promptTokens += result.usage.promptTokens;
+			completionTokens += result.usage.completionTokens;
+			invocations++; // only reached when runSubAgent resolved, so failures are not counted
+
+			return {
+				success: true,
+				message: result.summary,
+				data: {
+					subAgentType: result.type,
+					toolsCalled: result.rawResult.executedCalls.length,
+					rounds: result.rawResult.rounds,
+					stopReason: result.rawResult.stopReason,
+					usage: result.usage,
+					description: parsed.description,
+				},
+			};
+		} catch (err) {
+			const msg = err instanceof Error ? err.message : String(err);
+			return { success: false, message: `Sub-agent failed: ${msg}` };
+		}
+	};
+
+	return {
+		handle,
+		cumulativeUsage: () => ({
+			promptTokens,
+			completionTokens,
+			totalTokens: promptTokens + completionTokens,
+		}),
+		invocationCount: () => invocations,
+	};
+}