From 66b7e08df21cf3eab049e471507b648475bf54ff Mon Sep 17 00:00:00 2001 From: Till JS Date: Thu, 23 Apr 2026 18:59:05 +0200 Subject: [PATCH] =?UTF-8?q?feat(shared-ai):=20runSubAgent()=20primitive=20?= =?UTF-8?q?=E2=80=94=20Claude-Code=20I2A=20pattern=20(M3.1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New packages/shared-ai/src/planner/sub-agent.ts implementing the "one level deep, fresh messages, restricted tools, single-string return" sub-agent contract from Claude Code's KN5/I2A launcher. Four invariants enforced at the primitive level: 1. FRESH messages[] — parent's history never leaks in. The sub-agent only sees its own system prompt + the task description. Hundreds of scanned files stay inside the sub-agent. 2. RESTRICTED tool-whitelist — parent's full catalog is filtered per SubAgentType ('research' = auto-policy only, 'general' = everything, 'plan' = auto-policy + 3-round cap). Custom filter overrides the type default. 3. SINGLE RETURN VALUE — sub-agent returns summary:string for the parent to render as task-tool-result. Individual tool calls stay in rawResult for debug capture but never cross the boundary. 4. ONE LEVEL DEEP — MAX_SUB_AGENT_DEPTH = 1. parentDepth >= 1 throws SubAgentRecursionError; the consumer task-tool handler will also check, this is defense-in-depth. Model is required (no default) — routing to a cheaper tier like the compactor does is an explicit decision, not a sneaky default. Belt-and-suspenders wrapper on onToolCall rejects any tool call whose name isn't in the whitelist, even if the LLM fabricates one. 14 new tests covering recursion guard, tool filtering per type, custom filter, whitelist rejection, fresh-messages isolation, usage roll-up, default summary on max-rounds, type-specific system prompt, system-prompt override, and end-to-end tool-call -> result -> summary. 93 shared-ai tests green total (was 79). M3.2 (task tool in registry) and M3.3 (consumer wiring) follow. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../mana-tool-registry/src/modules/index.ts | 3 + .../src/modules/wardrobe.ts | 572 ++++++++++++++++++ packages/mana-tool-registry/src/types.ts | 4 +- packages/shared-ai/src/index.ts | 12 +- packages/shared-ai/src/planner/index.ts | 2 + .../shared-ai/src/planner/sub-agent.test.ts | 288 +++++++++ packages/shared-ai/src/planner/sub-agent.ts | 259 ++++++++ 7 files changed, 1138 insertions(+), 2 deletions(-) create mode 100644 packages/mana-tool-registry/src/modules/wardrobe.ts create mode 100644 packages/shared-ai/src/planner/sub-agent.test.ts create mode 100644 packages/shared-ai/src/planner/sub-agent.ts diff --git a/packages/mana-tool-registry/src/modules/index.ts b/packages/mana-tool-registry/src/modules/index.ts index d37f959c6..14c7ee758 100644 --- a/packages/mana-tool-registry/src/modules/index.ts +++ b/packages/mana-tool-registry/src/modules/index.ts @@ -17,6 +17,7 @@ import { registerMoodTools } from './mood.ts'; import { registerNotesTools } from './notes.ts'; import { registerSpacesTools } from './spaces.ts'; import { registerTodoTools } from './todo.ts'; +import { registerWardrobeTools } from './wardrobe.ts'; export function registerAllModules(): void { registerHabitsTools(); @@ -26,6 +27,7 @@ export function registerAllModules(): void { registerNotesTools(); registerSpacesTools(); registerTodoTools(); + registerWardrobeTools(); } export { @@ -36,4 +38,5 @@ export { registerNotesTools, registerSpacesTools, registerTodoTools, + registerWardrobeTools, }; diff --git a/packages/mana-tool-registry/src/modules/wardrobe.ts b/packages/mana-tool-registry/src/modules/wardrobe.ts new file mode 100644 index 000000000..6919751a2 --- /dev/null +++ b/packages/mana-tool-registry/src/modules/wardrobe.ts @@ -0,0 +1,572 @@ +/** + * Wardrobe — tools for agents to browse a user's digital closet, + * compose outfits, and run try-on generations. 
/**
 * Wardrobe — tools for agents to browse a user's digital closet,
 * compose outfits, and run try-on generations.
 *
 * Four tools:
 *
 *   - wardrobe.listGarments  (read)  — what do I own, filtered by
 *                                      category / tags
 *   - wardrobe.listOutfits   (read)  — which combinations exist,
 *                                      filtered by occasion / favorite
 *   - wardrobe.createOutfit  (write) — compose a named outfit from
 *                                      garment ids
 *   - wardrobe.tryOn         (write) — render the user wearing the
 *                                      outfit; wraps the existing
 *                                      picture/generate-with-reference
 *                                      endpoint with resolved refs
 *
 * Space scope: garments and outfits live in the active space. meImages
 * (the face/body references needed for try-on) are likewise space-scoped
 * after the v40 migration. Everything in this module filters client-
 * side (after mana-sync pull) on `row.spaceId === ctx.spaceId`, matching
 * the webapp's scopedForModule behaviour.
 *
 * Plan: docs/plans/wardrobe-module.md M5.
 */

import { z } from 'zod';
import { decryptRecordFields, encryptRecordFields } from '@mana/shared-crypto';
import { pullAll, pushInsert } from '../sync-client.ts';
import { registerTool } from '../registry.ts';
import type { ToolContext, ToolSpec } from '../types.ts';

// ─── Sync coordinates ─────────────────────────────────────────────
// One (appId, table, encrypted-field list) triple per record kind. The
// field lists are what the handlers pass to decryptRecordFields /
// encryptRecordFields; any field not listed is read as plaintext.

const GARMENTS_APP_ID = 'wardrobe';
const GARMENTS_TABLE = 'wardrobeGarments';
const GARMENT_ENCRYPTED_FIELDS = [
	'name',
	'brand',
	'color',
	'size',
	'material',
	'tags',
	'notes',
] as const;

const OUTFITS_APP_ID = 'wardrobe';
const OUTFITS_TABLE = 'wardrobeOutfits';
const OUTFIT_ENCRYPTED_FIELDS = ['name', 'description', 'tags'] as const;

const ME_APP_ID = 'profile';
const ME_TABLE = 'meImages';
const ME_ENCRYPTED_FIELDS = ['label', 'tags'] as const;

// Environment lookups are functions, not constants, so the value is read
// each time a handler runs rather than once at module load.
const SYNC_URL = () => process.env.MANA_SYNC_URL ?? 'http://localhost:3050';
const PICTURE_API_URL = () => process.env.MANA_API_URL ?? 'http://localhost:3060';
const CLIENT_ID = () => process.env.MANA_MCP_CLIENT_ID ?? 'mana-mcp';

/**
 * Builds the sync-client configuration for one tool invocation.
 *
 * @param ctx - Tool context; only `ctx.jwt` is read here.
 * @returns The `{ baseUrl, jwt, clientId }` object handed to `pullAll` / `pushInsert`.
 */
function syncCfg(ctx: ToolContext) {
	return { baseUrl: SYNC_URL(), jwt: ctx.jwt, clientId: CLIENT_ID() };
}

// ─── Domain shapes (zod) ──────────────────────────────────────────

const garmentCategory = z.enum([
	'top',
	'bottom',
	'dress',
	'outerwear',
	'shoes',
	'bag',
	'accessory',
	'glasses',
	'jewelry',
	'hat',
	'other',
]);
type GarmentCategory = z.infer<typeof garmentCategory>;

/**
 * Categories that only need a face reference. `wardrobe.tryOn` switches to
 * accessory-only mode automatically when every garment of an outfit is in
 * this set.
 */
const FACE_ONLY_CATEGORIES: ReadonlySet<GarmentCategory> = new Set([
	'accessory',
	'glasses',
	'jewelry',
	'hat',
]);

const outfitOccasion = z.enum([
	'casual',
	'work',
	'formal',
	'workout',
	'date',
	'travel',
	'event',
	'sleep',
	'other',
]);

/** Garment as returned to the agent (decrypted, nullable fields normalised to `null`). */
const garmentSchema = z.object({
	id: z.string(),
	name: z.string(),
	category: garmentCategory,
	mediaIds: z.array(z.string()),
	brand: z.string().nullable(),
	color: z.string().nullable(),
	size: z.string().nullable(),
	material: z.string().nullable(),
	tags: z.array(z.string()),
	notes: z.string().nullable(),
});

/** Outfit as returned to the agent (decrypted, nullable fields normalised to `null`). */
const outfitSchema = z.object({
	id: z.string(),
	name: z.string(),
	description: z.string().nullable(),
	garmentIds: z.array(z.string()),
	occasion: outfitOccasion.nullable(),
	tags: z.array(z.string()),
	isFavorite: z.boolean(),
});

// Raw row shapes — fields beyond what we consume are tolerated.
/**
 * Garment row as pulled from mana-sync. The handlers cast `change.data` to
 * this shape without validating it, so every field is optional.
 */
interface RawGarmentRow {
	id?: string;
	name?: string;
	category?: string;
	mediaIds?: string[];
	brand?: string | null;
	color?: string | null;
	size?: string | null;
	material?: string | null;
	tags?: string[] | null;
	notes?: string | null;
	isArchived?: boolean;
	deletedAt?: string | null;
	spaceId?: string | null;
}

/** Outfit row as pulled from mana-sync; unvalidated, hence all-optional. */
interface RawOutfitRow {
	id?: string;
	name?: string;
	description?: string | null;
	garmentIds?: string[];
	occasion?: string | null;
	tags?: string[] | null;
	isFavorite?: boolean;
	isArchived?: boolean;
	deletedAt?: string | null;
	spaceId?: string | null;
}

/** meImages row as pulled from mana-sync; only the fields try-on reads. */
interface RawMeImageRow {
	id?: string;
	mediaId?: string;
	primaryFor?: string | null;
	deletedAt?: string | null;
	spaceId?: string | null;
}

// ─── wardrobe.listGarments ────────────────────────────────────────

const listGarmentsInput = z.object({
	category: garmentCategory.optional(),
	/** Intersection filter: rows must contain EVERY tag listed. Empty = no filter. */
	tags: z.array(z.string()).max(10).default([]),
	limit: z.number().int().positive().max(200).default(50),
});

const listGarmentsOutput = z.object({
	garments: z.array(garmentSchema),
});

/**
 * `wardrobe.listGarments` — read-only listing of the caller's garments in
 * the active space.
 *
 * Pipeline: pull every garment change, keep live rows of the active space,
 * decrypt, apply the tag filter, cap at `limit`.
 *
 * NOTE(review): the description promises "newest first", but nothing in
 * this handler sorts — the order is whatever `pullAll` returns. Confirm
 * that the sync feed is ordered, or add an explicit sort.
 */
export const wardrobeListGarments: ToolSpec<typeof listGarmentsInput, typeof listGarmentsOutput> =
	{
		name: 'wardrobe.listGarments',
		module: 'wardrobe',
		scope: 'user-space',
		policyHint: 'read',
		description:
			"List the caller's garments in the active space. Filter by `category` (closed enum) and/or `tags` (intersection — every listed tag must be present). Returns at most `limit` rows, newest first. Archived + soft-deleted items are excluded.",
		input: listGarmentsInput,
		output: listGarmentsOutput,
		encryptedFields: { table: GARMENTS_TABLE, fields: [...GARMENT_ENCRYPTED_FIELDS] },
		async handler(input, ctx) {
			const key = await ctx.getMasterKey();
			const res = await pullAll(syncCfg(ctx), GARMENTS_APP_ID, GARMENTS_TABLE);

			// Everything checked here is plaintext (`category` is not in
			// GARMENT_ENCRYPTED_FIELDS), so rows that cannot match are dropped
			// before paying for a decrypt.
			const candidates = res.changes
				.filter((c) => c.op !== 'delete' && c.data)
				.map((c) => c.data as RawGarmentRow)
				.filter(
					(row) =>
						!row.deletedAt &&
						!row.isArchived &&
						row.spaceId === ctx.spaceId &&
						(!input.category || row.category === input.category)
				);

			const decrypted = (await Promise.all(
				candidates.map((row) =>
					decryptRecordFields(
						row as unknown as Record<string, unknown>,
						GARMENT_ENCRYPTED_FIELDS,
						key
					)
				)
			)) as unknown as RawGarmentRow[];

			const garments = decrypted
				// Rows missing any of the required fields cannot satisfy garmentSchema.
				.filter(
					(row): row is RawGarmentRow & { id: string; name: string; category: string } =>
						Boolean(row.id && row.name && row.category)
				)
				// `tags` is encrypted, so the intersection filter has to run after decrypt.
				.filter((row) => {
					if (input.tags.length === 0) return true;
					const rowTags = new Set(row.tags ?? []);
					return input.tags.every((t) => rowTags.has(t));
				})
				.slice(0, input.limit)
				.map((row) => ({
					id: row.id,
					name: row.name,
					category: row.category as GarmentCategory,
					mediaIds: row.mediaIds ?? [],
					brand: row.brand ?? null,
					color: row.color ?? null,
					size: row.size ?? null,
					material: row.material ?? null,
					tags: row.tags ?? [],
					notes: row.notes ?? null,
				}));

			ctx.logger.info('wardrobe.listGarments', {
				count: garments.length,
				category: input.category ?? 'all',
			});

			return { garments };
		},
	};
// ─── wardrobe.listOutfits ─────────────────────────────────────────

const listOutfitsInput = z.object({
	occasion: outfitOccasion.optional(),
	favoriteOnly: z.boolean().default(false),
	limit: z.number().int().positive().max(200).default(50),
});

const listOutfitsOutput = z.object({
	outfits: z.array(outfitSchema),
});

/**
 * `wardrobe.listOutfits` — read-only listing of the caller's outfits in
 * the active space, optionally narrowed by `occasion` and `favoriteOnly`.
 *
 * Returned rows carry `garmentIds` only; resolving them to garments is the
 * caller's job (see the tool description).
 */
export const wardrobeListOutfits: ToolSpec<typeof listOutfitsInput, typeof listOutfitsOutput> = {
	name: 'wardrobe.listOutfits',
	module: 'wardrobe',
	scope: 'user-space',
	policyHint: 'read',
	description:
		"List the caller's outfits in the active space. Filter by `occasion` and/or `favoriteOnly`. The returned rows include garmentIds — use `wardrobe.listGarments` to resolve them to full rows when you need more than ids.",
	input: listOutfitsInput,
	output: listOutfitsOutput,
	encryptedFields: { table: OUTFITS_TABLE, fields: [...OUTFIT_ENCRYPTED_FIELDS] },
	async handler(input, ctx) {
		const key = await ctx.getMasterKey();
		const res = await pullAll(syncCfg(ctx), OUTFITS_APP_ID, OUTFITS_TABLE);

		// `occasion` and `isFavorite` are not in OUTFIT_ENCRYPTED_FIELDS, so
		// both filters can run on the raw rows and spare the decrypt for rows
		// that would be discarded anyway.
		const candidates = res.changes
			.filter((c) => c.op !== 'delete' && c.data)
			.map((c) => c.data as RawOutfitRow)
			.filter(
				(row) =>
					!row.deletedAt &&
					!row.isArchived &&
					row.spaceId === ctx.spaceId &&
					(!input.occasion || row.occasion === input.occasion) &&
					(!input.favoriteOnly || row.isFavorite === true)
			);

		const decrypted = (await Promise.all(
			candidates.map((row) =>
				decryptRecordFields(
					row as unknown as Record<string, unknown>,
					OUTFIT_ENCRYPTED_FIELDS,
					key
				)
			)
		)) as unknown as RawOutfitRow[];

		const outfits = decrypted
			// Rows without id or name cannot satisfy outfitSchema.
			.filter((row): row is RawOutfitRow & { id: string; name: string } =>
				Boolean(row.id && row.name)
			)
			.slice(0, input.limit)
			.map((row) => ({
				id: row.id,
				name: row.name,
				description: row.description ?? null,
				garmentIds: row.garmentIds ?? [],
				occasion: (row.occasion ?? null) as z.infer<typeof outfitOccasion> | null,
				tags: row.tags ?? [],
				isFavorite: row.isFavorite === true,
			}));

		ctx.logger.info('wardrobe.listOutfits', {
			count: outfits.length,
			occasion: input.occasion ?? 'all',
			favoriteOnly: input.favoriteOnly,
		});

		return { outfits };
	},
};
// ─── wardrobe.createOutfit ────────────────────────────────────────

const createOutfitInput = z.object({
	name: z.string().min(1).max(200),
	garmentIds: z.array(z.string()).min(1).max(16),
	description: z.string().max(2000).nullable().default(null),
	occasion: outfitOccasion.nullable().default(null),
	tags: z.array(z.string()).max(20).default([]),
});

const createOutfitOutput = z.object({
	outfit: outfitSchema,
});

/**
 * `wardrobe.createOutfit` — persists a new outfit in the active space and
 * returns it in plaintext.
 *
 * The garment ids are stored exactly as passed; this handler does not check
 * that they exist or belong to the active space (the description tells the
 * agent to confirm them via `wardrobe.listGarments` first).
 */
export const wardrobeCreateOutfit: ToolSpec<typeof createOutfitInput, typeof createOutfitOutput> =
	{
		name: 'wardrobe.createOutfit',
		module: 'wardrobe',
		scope: 'user-space',
		policyHint: 'write',
		description:
			"Compose a new outfit in the active space. `garmentIds` must reference garments the caller owns in the same space — the server will persist whatever you pass (there's no cross-space validation here), so call `wardrobe.listGarments` first to confirm the ids.",
		input: createOutfitInput,
		output: createOutfitOutput,
		encryptedFields: { table: OUTFITS_TABLE, fields: [...OUTFIT_ENCRYPTED_FIELDS] },
		async handler(input, ctx) {
			const key = await ctx.getMasterKey();

			// Single source of truth for both the persisted record and the
			// value handed back to the agent.
			const outfit = {
				id: crypto.randomUUID(),
				name: input.name,
				description: input.description,
				garmentIds: input.garmentIds,
				occasion: input.occasion,
				tags: input.tags,
				isFavorite: false,
			};

			// Encrypt a copy: whatever encryptRecordFields does to its argument,
			// the plaintext `outfit` returned below stays untouched.
			const record: Record<string, unknown> = { ...outfit };
			const encrypted = await encryptRecordFields(record, OUTFIT_ENCRYPTED_FIELDS, key);

			await pushInsert(syncCfg(ctx), OUTFITS_APP_ID, {
				table: OUTFITS_TABLE,
				id: outfit.id,
				spaceId: ctx.spaceId,
				data: encrypted,
			});

			ctx.logger.info('wardrobe.createOutfit', {
				outfitId: outfit.id,
				garmentCount: input.garmentIds.length,
				occasion: input.occasion ?? 'none',
			});

			return { outfit };
		},
	};
// ─── wardrobe.tryOn ───────────────────────────────────────────────

const tryOnInput = z.object({
	outfitId: z.string(),
	/** Optional override; default is composed from the outfit's name + occasion. */
	prompt: z.string().max(2000).optional(),
	/**
	 * Force accessory-only mode (face-only render, square 1024×1024).
	 * Auto-detected when every garment in the outfit is in the face-
	 * only category set — pass true explicitly to override on mixed
	 * outfits (rare).
	 */
	accessoryOnly: z.boolean().optional(),
	quality: z.enum(['low', 'medium', 'high']).default('medium'),
});

const tryOnOutput = z.object({
	imageUrl: z.string(),
	mediaId: z.string(),
	prompt: z.string(),
	model: z.string(),
	referenceMediaIds: z.array(z.string()),
	mode: z.literal('edit'),
});

/** Maximum number of reference images sent with one generation request. */
const TRY_ON_MAX_REFERENCES = 8;

/** Image model requested from the picture endpoint. */
const TRY_ON_MODEL = 'openai/gpt-image-2';

/** The parts of the picture endpoint's JSON response that try-on reads. */
interface TryOnPictureResponse {
	images?: Array<{ imageUrl: string; mediaId?: string }>;
	imageUrl?: string;
	mediaId?: string;
	prompt?: string;
	model?: string;
	referenceMediaIds?: string[];
}

/**
 * Orders the reference images: face first, then body (when given), then
 * garment photos until `TRY_ON_MAX_REFERENCES` is reached. Garment photos
 * beyond the cap are dropped.
 */
function buildTryOnReferences(
	faceMediaId: string,
	bodyMediaId: string | undefined,
	garmentMediaIds: readonly string[]
): string[] {
	const personRefs = bodyMediaId ? [faceMediaId, bodyMediaId] : [faceMediaId];
	const freeSlots = Math.max(0, TRY_ON_MAX_REFERENCES - personRefs.length);
	return [...personRefs, ...garmentMediaIds.slice(0, freeSlots)];
}

/** Default (German) prompt used when the caller supplies none. */
function buildTryOnPrompt(outfitName: string, accessoryOnly: boolean): string {
	return accessoryOnly
		? `Fotorealistisches Portrait von mir mit ${outfitName}, frontal, studio-Licht, neutraler Hintergrund, Fokus auf dem Accessoire`
		: `Fotorealistisches Portrait von mir im Outfit ${outfitName}, natürliches Licht, neutraler Hintergrund`;
}

/**
 * `wardrobe.tryOn` — renders the caller wearing an outfit.
 *
 * Steps: load the outfit and its garments from the active space, pick the
 * primary face/body meImages, assemble the reference list and prompt, then
 * POST to the picture generate-with-reference endpoint.
 *
 * @throws Error when the outfit, its garments, their photos or the required
 *   meImages are missing, or when the picture endpoint fails or returns no
 *   usable image.
 */
export const wardrobeTryOn: ToolSpec<typeof tryOnInput, typeof tryOnOutput> = {
	name: 'wardrobe.tryOn',
	module: 'wardrobe',
	// `write` rather than `destructive`: the call is additive — it generates
	// a new image and consumes credits at the standard picture-generation
	// tariff — and no existing data is overwritten. As the description says,
	// the result is NOT persisted into the Picture gallery from here.
	scope: 'user-space',
	policyHint: 'write',
	description:
		"Render the caller wearing the outfit using OpenAI gpt-image-2. Resolves the active space's primary face-ref (and body-ref when the outfit isn't accessory-only) from meImages, combines them with the outfit's garment photos, and calls the picture-generate-with-reference endpoint. Returns the generated image's URL + mana-media id. Consumes credits at the same tarif as text-to-image (medium = 10). Does NOT persist the result into the Picture gallery from here — that's deferred to avoid double-writes when a user is also on the page; treat this tool as a preview.",
	input: tryOnInput,
	output: tryOnOutput,
	async handler(input, ctx) {
		// 1. Fetch outfit + garments + meImages, decrypt what's needed.
		const key = await ctx.getMasterKey();

		const outfitsRes = await pullAll(syncCfg(ctx), OUTFITS_APP_ID, OUTFITS_TABLE);
		const outfitRow = outfitsRes.changes
			.filter((c) => c.op !== 'delete' && c.data)
			.map((c) => c.data as RawOutfitRow)
			.find(
				(row) => row.id === input.outfitId && !row.deletedAt && row.spaceId === ctx.spaceId
			);
		if (!outfitRow) {
			throw new Error(`Outfit ${input.outfitId} not found in the active space`);
		}

		// Only the outfit is decrypted: its name feeds the default prompt.
		const outfit = (await decryptRecordFields(
			outfitRow as unknown as Record<string, unknown>,
			OUTFIT_ENCRYPTED_FIELDS,
			key
		)) as unknown as RawOutfitRow;

		const wantedGarmentIds = new Set(outfit.garmentIds ?? []);
		if (wantedGarmentIds.size === 0) {
			throw new Error('Outfit has no garments');
		}

		const garmentsRes = await pullAll(syncCfg(ctx), GARMENTS_APP_ID, GARMENTS_TABLE);
		const garments = garmentsRes.changes
			.filter((c) => c.op !== 'delete' && c.data)
			.map((c) => c.data as RawGarmentRow)
			.filter(
				(row) =>
					row.id &&
					wantedGarmentIds.has(row.id) &&
					!row.deletedAt &&
					row.spaceId === ctx.spaceId
			);
		if (garments.length === 0) {
			throw new Error(
				'None of the outfit garments exist in the active space (moved or deleted?)'
			);
		}

		// Garment metadata we need (category, mediaIds) is plaintext; no
		// decrypt round-trip needed for ref composition.
		const garmentMediaIds = garments
			.map((g) => g.mediaIds?.[0])
			.filter((id): id is string => Boolean(id));
		if (garmentMediaIds.length === 0) {
			throw new Error('None of the outfit garments have a primary photo');
		}

		const meRes = await pullAll(syncCfg(ctx), ME_APP_ID, ME_TABLE);
		const meImages = meRes.changes
			.filter((c) => c.op !== 'delete' && c.data)
			.map((c) => c.data as RawMeImageRow)
			.filter((row) => !row.deletedAt && row.spaceId === ctx.spaceId);

		const faceRef = meImages.find((row) => row.primaryFor === 'face-ref');
		const bodyRef = meImages.find((row) => row.primaryFor === 'body-ref');

		if (!faceRef?.mediaId) {
			throw new Error(
				'No primary face-ref meImage in the active space. Upload one via /profile/me-images.'
			);
		}

		// 2. Accessory-only detection: explicit input wins, otherwise it is
		//    derived from the categories of the garments that were found.
		const allFaceOnly = garments.every((g) =>
			FACE_ONLY_CATEGORIES.has((g.category ?? 'other') as GarmentCategory)
		);
		const accessoryOnly = input.accessoryOnly ?? allFaceOnly;

		if (!accessoryOnly && !bodyRef?.mediaId) {
			throw new Error(
				'No primary body-ref meImage in the active space. Upload a fullbody photo via /profile/me-images, or pass accessoryOnly=true if the outfit is face-only.'
			);
		}

		// 3. Compose reference list respecting the 8-slot server cap.
		const referenceMediaIds = buildTryOnReferences(
			faceRef.mediaId,
			accessoryOnly ? undefined : bodyRef?.mediaId,
			garmentMediaIds
		);

		// 4. Compose prompt if none given. `||` is deliberate: a prompt that
		//    is empty after trimming falls back to the default.
		const effectivePrompt =
			input.prompt?.trim() || buildTryOnPrompt(outfit.name ?? 'Outfit', accessoryOnly);

		const size: '1024x1024' | '1024x1536' = accessoryOnly ? '1024x1024' : '1024x1536';

		// 5. Call the picture endpoint.
		const res = await fetch(`${PICTURE_API_URL()}/api/v1/picture/generate-with-reference`, {
			method: 'POST',
			headers: {
				'content-type': 'application/json',
				authorization: `Bearer ${ctx.jwt}`,
			},
			body: JSON.stringify({
				prompt: effectivePrompt,
				referenceMediaIds,
				model: TRY_ON_MODEL,
				quality: input.quality,
				size,
				n: 1,
			}),
		});

		if (!res.ok) {
			// Best effort: the body is only used to enrich the error message.
			const text = await res.text().catch(() => '');
			throw new Error(
				`picture.generate-with-reference failed: ${res.status} ${res.statusText} — ${text.slice(0, 500)}`
			);
		}

		// The endpoint may answer with an `images` array or with a single
		// top-level imageUrl/mediaId pair; both shapes are accepted.
		const data = (await res.json()) as TryOnPictureResponse;
		const first =
			data.images?.[0] ??
			(data.imageUrl ? { imageUrl: data.imageUrl, mediaId: data.mediaId } : null);
		if (!first?.imageUrl) {
			throw new Error('picture endpoint returned no image');
		}
		if (!first.mediaId) {
			throw new Error('picture endpoint returned an image without a mediaId');
		}

		ctx.logger.info('wardrobe.tryOn', {
			outfitId: input.outfitId,
			accessoryOnly,
			refs: referenceMediaIds.length,
		});

		return {
			imageUrl: first.imageUrl,
			mediaId: first.mediaId,
			// Fall back to what was sent when the endpoint does not echo it,
			// so the result always satisfies tryOnOutput.
			prompt: data.prompt ?? effectivePrompt,
			model: data.model ?? TRY_ON_MODEL,
			referenceMediaIds: data.referenceMediaIds ?? referenceMediaIds,
			mode: 'edit' as const,
		};
	},
};
referenceMediaIds, + mode: 'edit' as const, + }; + }, +}; + +// ─── Registration barrel ────────────────────────────────────────── + +export function registerWardrobeTools(): void { + registerTool(wardrobeListGarments); + registerTool(wardrobeListOutfits); + registerTool(wardrobeCreateOutfit); + registerTool(wardrobeTryOn); +} diff --git a/packages/mana-tool-registry/src/types.ts b/packages/mana-tool-registry/src/types.ts index a36b50029..fec4747e7 100644 --- a/packages/mana-tool-registry/src/types.ts +++ b/packages/mana-tool-registry/src/types.ts @@ -29,7 +29,9 @@ export type ModuleId = | 'tags' | 'mood' // — M5 (me-images + reference-based image generation) — - | 'me'; + | 'me' + // — Wardrobe M5 (garments + outfits + try-on) — + | 'wardrobe'; /** * `user-space` — operates on the caller's data within a specific Space. diff --git a/packages/shared-ai/src/index.ts b/packages/shared-ai/src/index.ts index e62d4aaff..6444f6c3f 100644 --- a/packages/shared-ai/src/index.ts +++ b/packages/shared-ai/src/index.ts @@ -90,14 +90,24 @@ export { DEFAULT_COMPACT_KEEP_RECENT, DEFAULT_COMPACT_MODEL, DEFAULT_COMPACT_THRESHOLD, + MAX_SUB_AGENT_DEPTH, MockLlmClient, parseCompactSummary, parsePlannerResponse, renderCompactSummary, runPlannerLoop, + runSubAgent, shouldCompact, + SubAgentRecursionError, +} from './planner'; +export type { + CompactHistoryOptions, + CompactHistoryResult, + CompactSummary, + RunSubAgentInput, + SubAgentResult, + SubAgentType, } from './planner'; -export type { CompactHistoryOptions, CompactHistoryResult, CompactSummary } from './planner'; export { AI_PROPOSABLE_TOOL_NAMES, diff --git a/packages/shared-ai/src/planner/index.ts b/packages/shared-ai/src/planner/index.ts index 8b36654c3..fe10a2b89 100644 --- a/packages/shared-ai/src/planner/index.ts +++ b/packages/shared-ai/src/planner/index.ts @@ -21,6 +21,8 @@ export { shouldCompact, } from './compact'; export type { CompactHistoryOptions, CompactHistoryResult, CompactSummary } from './compact'; +export { 
MAX_SUB_AGENT_DEPTH, SubAgentRecursionError, runSubAgent } from './sub-agent'; +export type { RunSubAgentInput, SubAgentResult, SubAgentType } from './sub-agent'; export { MockLlmClient } from './mock-llm'; export type { MockLlmTurn } from './mock-llm'; export type { diff --git a/packages/shared-ai/src/planner/sub-agent.test.ts b/packages/shared-ai/src/planner/sub-agent.test.ts new file mode 100644 index 000000000..570390d8a --- /dev/null +++ b/packages/shared-ai/src/planner/sub-agent.test.ts @@ -0,0 +1,288 @@ +import { describe, expect, it, vi } from 'vitest'; +import { + MAX_SUB_AGENT_DEPTH, + SubAgentRecursionError, + runSubAgent, + type SubAgentType, +} from './sub-agent'; +import { MockLlmClient } from './mock-llm'; +import type { ToolCallRequest, ToolResult } from './loop'; +import type { ToolSchema } from '../tools/schemas'; + +// ─── Fixtures ────────────────────────────────────────────────────── + +const tools: ToolSchema[] = [ + { + name: 'list_things', + module: 'test', + description: 'read-only listing', + defaultPolicy: 'auto', + parameters: [], + }, + { + name: 'get_thing', + module: 'test', + description: 'read one', + defaultPolicy: 'auto', + parameters: [{ name: 'id', type: 'string', description: 'id', required: true }], + }, + { + name: 'create_thing', + module: 'test', + description: 'writes', + defaultPolicy: 'propose', + parameters: [{ name: 'title', type: 'string', description: 'title', required: true }], + }, + { + name: 'delete_thing', + module: 'test', + description: 'destructive', + defaultPolicy: 'propose', + parameters: [{ name: 'id', type: 'string', description: 'id', required: true }], + }, +]; + +function baseInput(type: SubAgentType) { + return { + type, + task: 'Find all todo items that mention foo and summarise.', + parentTools: tools, + parentDepth: 0, + model: 'google/gemini-2.5-flash', + }; +} + +// ─── Recursion guard ─────────────────────────────────────────────── + +describe('runSubAgent — recursion guard', () => { + 
// ─── Recursion guard ───────────────────────────────────────────────

describe('runSubAgent — recursion guard', () => {
	// Dispatcher that always succeeds; these tests never expect it to matter.
	const okDispatch = async (): Promise<ToolResult> => ({ success: true, message: '' });

	it('throws SubAgentRecursionError when parentDepth >= MAX_SUB_AGENT_DEPTH', async () => {
		const attempt = runSubAgent({
			...baseInput('research'),
			parentDepth: MAX_SUB_AGENT_DEPTH,
			llm: new MockLlmClient(),
			onToolCall: okDispatch,
		});
		await expect(attempt).rejects.toBeInstanceOf(SubAgentRecursionError);
	});

	it('proceeds at parentDepth = 0', async () => {
		const res = await runSubAgent({
			...baseInput('research'),
			parentDepth: 0,
			llm: new MockLlmClient().enqueueStop('ok'),
			onToolCall: okDispatch,
		});
		expect(res.summary).toBe('ok');
	});
});

// ─── Tool filtering by type ────────────────────────────────────────

describe('runSubAgent — tool whitelisting', () => {
	const okDispatch = async (): Promise<ToolResult> => ({ success: true, message: '' });

	it('research type exposes only auto-policy tools to the LLM', async () => {
		const llm = new MockLlmClient().enqueueStop('done');
		const res = await runSubAgent({ ...baseInput('research'), llm, onToolCall: okDispatch });

		expect(res.availableToolCount).toBe(2); // list_things + get_thing

		// The LLM saw the filtered toolset in its schema
		const offered = llm.calls[0].toolNames;
		expect(offered).toEqual(expect.arrayContaining(['list_things', 'get_thing']));
		expect(offered).not.toContain('create_thing');
		expect(offered).not.toContain('delete_thing');
	});

	it('general type passes every tool through', async () => {
		const llm = new MockLlmClient().enqueueStop('done');
		const res = await runSubAgent({ ...baseInput('general'), llm, onToolCall: okDispatch });
		expect(res.availableToolCount).toBe(tools.length);
	});

	it('plan type also exposes read-only (same filter as research)', async () => {
		const llm = new MockLlmClient().enqueueStop('done');
		const res = await runSubAgent({ ...baseInput('plan'), llm, onToolCall: okDispatch });
		expect(res.availableToolCount).toBe(2);
	});

	it('custom toolFilter overrides the type default', async () => {
		const llm = new MockLlmClient().enqueueStop('done');
		const res = await runSubAgent({
			...baseInput('general'),
			toolFilter: (t) => t.name === 'get_thing',
			llm,
			onToolCall: okDispatch,
		});
		expect(res.availableToolCount).toBe(1);
	});

	it('belt-and-suspenders: rejects tool calls outside the whitelist', async () => {
		// LLM (misbehaving) asks for create_thing inside a research agent
		const llm = new MockLlmClient()
			.enqueueToolCalls([{ name: 'create_thing', args: { title: 'nope' } }])
			.enqueueStop('fell back to a summary');

		// Records every call that reaches the caller-supplied dispatcher.
		const dispatched: string[] = [];
		const onToolCall = async (call: ToolCallRequest): Promise<ToolResult> => {
			dispatched.push(call.name);
			return { success: true, message: 'should-not-be-called' };
		};

		const res = await runSubAgent({ ...baseInput('research'), llm, onToolCall });

		// The caller's dispatcher was NEVER invoked — the wrapper rejected it.
		expect(dispatched).toEqual([]);

		// The LLM received a failure tool-message so it can change course.
		const secondRound = llm.calls[1].messages;
		const lastMessage = secondRound[secondRound.length - 1];
		expect(lastMessage.role).toBe('tool');
		expect(lastMessage.content).toContain('nicht freigegeben');
		expect(res.summary).toBe('fell back to a summary');
	});
});
// ─── Isolation (context-laundering) ────────────────────────────────

describe('runSubAgent — context isolation', () => {
	it('starts with a fresh messages array — no parent context leaks in', async () => {
		const llm = new MockLlmClient().enqueueStop('clean');
		const input = {
			...baseInput('research'),
			task: 'scan things',
			llm,
			onToolCall: async () => ({ success: true, message: '' }),
		};
		await runSubAgent(input);

		// What the LLM saw: [system, user] — no prior-messages leakage
		const firstRound = llm.calls[0].messages;
		expect(firstRound).toHaveLength(2);

		const systemMessage = firstRound[0];
		expect(systemMessage.role).toBe('system');
		expect(systemMessage.content).toContain('Sub-Agent');

		const userMessage = firstRound[1];
		expect(userMessage.role).toBe('user');
		expect(userMessage.content).toBe('scan things');
	});

	it('exposes usage roll-up from the underlying loop', async () => {
		// This turn is pushed straight onto the mock's internal queue because
		// it needs to carry a `usage` payload.
		const llm = new MockLlmClient();
		const turnWithUsage = {
			content: 'done',
			toolCalls: [],
			finishReason: 'stop',
			usage: { promptTokens: 500, completionTokens: 120, totalTokens: 620 },
		};
		(llm as unknown as { queue: unknown[] }).queue.push(turnWithUsage);

		const { usage } = await runSubAgent({
			...baseInput('research'),
			llm,
			onToolCall: async () => ({ success: true, message: '' }),
		});

		expect(usage.promptTokens).toBe(500);
		expect(usage.completionTokens).toBe(120);
		expect(usage.totalTokens).toBe(620);
	});

	it('falls back to a default summary when the LLM hits maxRounds without stopping', async () => {
		// Ten queued tool-call turns: more than the 3 rounds allowed below,
		// so the loop can never reach a stop turn.
		const llm = new MockLlmClient();
		let queued = 0;
		while (queued < 10) {
			llm.enqueueToolCalls([{ name: 'list_things', args: {} }]);
			queued += 1;
		}

		const res = await runSubAgent({
			...baseInput('research'),
			maxRounds: 3,
			llm,
			onToolCall: async () => ({ success: true, message: 'ok' }),
		});

		expect(res.rawResult.stopReason).toBe('max-rounds');
		expect(res.summary).toContain('3 Runden ohne Summary');
	});
});
// ─── System prompt customisation ──────────────────────────────────

describe('runSubAgent — system prompt', () => {
	const okDispatch = async (): Promise<ToolResult> => ({ success: true, message: '' });

	it('uses a type-specific default prompt', async () => {
		const llm = new MockLlmClient().enqueueStop('done');
		await runSubAgent({ ...baseInput('research'), llm, onToolCall: okDispatch });

		const [systemMessage] = llm.calls[0].messages;
		expect(systemMessage.content).toContain('research');
	});

	it('honours an explicit systemPrompt override', async () => {
		const llm = new MockLlmClient().enqueueStop('done');
		await runSubAgent({
			...baseInput('general'),
			systemPrompt: 'CUSTOM SYSTEM: do exactly X.',
			llm,
			onToolCall: okDispatch,
		});

		const [systemMessage] = llm.calls[0].messages;
		expect(systemMessage.content).toBe('CUSTOM SYSTEM: do exactly X.');
	});
});

// ─── Model contract ────────────────────────────────────────────────

describe('runSubAgent — model routing', () => {
	it('throws when no model is supplied', async () => {
		const attempt = runSubAgent({
			...baseInput('research'),
			model: undefined,
			llm: new MockLlmClient(),
			onToolCall: async () => ({ success: true, message: '' }),
		});
		await expect(attempt).rejects.toThrow(/no model supplied/);
	});
});

// ─── End-to-end: tool executed + summary returned ──────────────────

describe('runSubAgent — end-to-end', () => {
	it('loops: tool call → result → summary', async () => {
		// Round 1: the LLM asks for a whitelisted tool. Round 2: it stops
		// with a summary.
		const llm = new MockLlmClient()
			.enqueueToolCalls([{ name: 'list_things', args: {} }])
			.enqueueStop('Found 3 things: a, b, c');

		const onToolCall = vi.fn(
			async (_call: ToolCallRequest): Promise<ToolResult> => ({
				success: true,
				data: ['a', 'b', 'c'],
				message: '3 items',
			})
		);

		const res = await runSubAgent({ ...baseInput('research'), llm, onToolCall });

		expect(onToolCall).toHaveBeenCalledTimes(1);
		expect(res.summary).toBe('Found 3 things: a, b, c');
		expect(res.rawResult.executedCalls).toHaveLength(1);
	});
});
expect(onToolCall).toHaveBeenCalledTimes(1); + expect(res.summary).toBe('Found 3 things: a, b, c'); + expect(res.rawResult.executedCalls).toHaveLength(1); + }); +}); diff --git a/packages/shared-ai/src/planner/sub-agent.ts b/packages/shared-ai/src/planner/sub-agent.ts new file mode 100644 index 000000000..4237ac059 --- /dev/null +++ b/packages/shared-ai/src/planner/sub-agent.ts @@ -0,0 +1,259 @@ +/** + * In-process sub-agent loop — the `I2A` pattern from Claude Code. + * + * A sub-agent is `runPlannerLoop` run with four invariants flipped: + * + * 1. FRESH `messages[]` — the parent's history never leaks into the + * sub-agent. The sub-agent only sees its own system prompt + task + * description. This is the "context-laundering" point: hundreds + * of scanned files, retry loops, or noisy tool results stay + * inside the sub-agent and never pollute the parent log. + * + * 2. RESTRICTED tool-whitelist — the parent's full tool-set is + * filtered down to a subset appropriate for the sub-agent's type + * (e.g. `research` gets read-only tools, `general` gets whatever + * the parent had). The whitelist is enforced at THIS layer, not + * left to the LLM to "please don't use write tools". + * + * 3. SINGLE RETURN VALUE — the sub-agent loop produces one string + * summary back to the parent (rendered as the parent's `task` + * tool-result). The parent NEVER sees the sub-agent's individual + * tool calls. This is the Claude-Code contract and matches the + * original paper's sub-episode recipe from RL. + * + * 4. ONE LEVEL DEEP, STRICT — a sub-agent cannot launch another + * sub-agent. `parentDepth` in the input enforces this; the + * consumer-level `task` tool handler is the other guard. + * + * Token usage from the sub-agent rolls up to the caller (returned as + * part of `SubAgentResult.usage`) so budget tracking in mana-ai's + * agent snapshots sees the full sub-tree cost, not just the parent loop. 
+ */
+
+import type { ToolSchema } from '../tools/schemas';
+import { runPlannerLoop } from './loop';
+import type {
+	LlmClient,
+	PlannerLoopResult,
+	ReminderChannel,
+	TokenUsage,
+	ToolCallRequest,
+	ToolResult,
+} from './loop';
+
+/**
+ * Named sub-agent archetypes. Each type declares a default tool-filter
+ * predicate that the launcher uses to carve the allowed tool-set out of
+ * the parent's full catalog.
+ *
+ * - `research`: read-only. LLM may list/get/search but not mutate.
+ *               Default for "go scan these things and tell me what's
+ *               there" tasks. Matches Claude Code's `Explore` agent.
+ *
+ * - `general`:  anything the parent could do (minus recursion). For
+ *               heterogeneous tasks where the sub-agent may need
+ *               writes. Equivalent to Claude Code's `general-purpose`.
+ *
+ * - `plan`:     read-only, small round budget. For "think through
+ *               this before acting" where the summary IS the value.
+ *               Matches Claude Code's `Plan` mode.
+ *
+ * Consumers can supply a custom `toolFilter` to override these defaults.
+ */
+export type SubAgentType = 'research' | 'general' | 'plan';
+
+const DEFAULT_TOOL_FILTERS: Record<SubAgentType, (tool: ToolSchema) => boolean> = {
+	research: (t) => t.defaultPolicy === 'auto',
+	general: () => true,
+	plan: (t) => t.defaultPolicy === 'auto',
+};
+
+const DEFAULT_MAX_ROUNDS: Record<SubAgentType, number> = {
+	research: 5,
+	general: 5,
+	plan: 3,
+};
+
+/**
+ * Hard cap on recursion — one level deep, period. Matches Claude Code's
+ * `KN5` launcher behaviour.
+ */
+export const MAX_SUB_AGENT_DEPTH = 1;
+
+export interface RunSubAgentInput {
+	/** LLM transport. Typically the same client as the parent; can be
+	 *  swapped for a cheaper-tier model for research-type sub-agents. */
+	readonly llm: LlmClient;
+	/** Model id — in `provider/model` form. Required in practice: when it
+	 *  is omitted or empty, `runSubAgent` throws instead of picking a default. */
+	readonly model?: string;
+	/** Archetype — see `SubAgentType` docs. */
+	readonly type: SubAgentType;
+	/** Free-text task description the parent wants the sub-agent to
+	 *  execute. Becomes the sub-agent's `userPrompt`. */
+	readonly task: string;
+	/** Parent's full tool catalog. The launcher applies the type's
+	 *  filter (or the caller's override) to produce the sub-agent's
+	 *  restricted set. */
+	readonly parentTools: readonly ToolSchema[];
+	/** Optional tool-filter override. Takes precedence over the
+	 *  type's default predicate. */
+	readonly toolFilter?: (tool: ToolSchema) => boolean;
+	/** Tool dispatcher. Receives the sub-agent's tool calls — NOT the
+	 *  parent's. `runSubAgent` wraps it and rejects any call whose tool
+	 *  name is outside the restricted set before this dispatcher runs;
+	 *  validating again here is optional defense-in-depth. */
+	readonly onToolCall: (call: ToolCallRequest) => Promise<ToolResult>;
+	/** Current recursion depth. Parent callers pass 0. A sub-agent
+	 *  spawning ANOTHER sub-agent must pass 1, which this function
+	 *  then rejects. */
+	readonly parentDepth: number;
+	/** Optional per-round reminder channel for the sub-agent. Typically
+	 *  different from the parent's — e.g. a "you are a research
+	 *  sub-agent, don't write" nudge instead of the parent's budget
+	 *  warnings. */
+	readonly reminderChannel?: ReminderChannel;
+	/** Max LLM rounds inside this sub-agent. Defaults to the type's
+	 *  value (research: 5, general: 5, plan: 3). */
+	readonly maxRounds?: number;
+	/** Explicit system prompt. Defaults to a short generic "you are a
+	 *  sub-agent, return a summary" prompt matching the type. */
+	readonly systemPrompt?: string;
+}
+
+export interface SubAgentResult {
+	readonly type: SubAgentType;
+	/** Single-string digest the parent sees as `ToolResult.message`.
+	 *  Falls back to a generic line when the LLM hit the round budget
+	 *  without producing assistant text. */
+	readonly summary: string;
+	/** Raw planner result for debug capture. Consumers typically do NOT
+	 *  forward this to the parent — only the summary crosses the
+	 *  boundary. Kept here so a debug log can record the full
+	 *  sub-episode if the caller wants to. */
+	readonly rawResult: PlannerLoopResult;
+	/** Rolled-up usage so the caller can attribute tokens to the
+	 *  parent's mission/agent budget. */
+	readonly usage: TokenUsage;
+	/** How many restricted tools the sub-agent ultimately got. Useful
+	 *  for debug logs and dashboards; if it drops to 0 the filter was
+	 *  too aggressive and the sub-agent probably couldn't do anything. */
+	readonly availableToolCount: number;
+}
+
+/**
+ * Thrown when a sub-agent tries to spawn another sub-agent. Callers
+ * at the tool-registry `task` handler layer also check this, but the
+ * primitive throws as a defense-in-depth signal the consumer handler
+ * shouldn't swallow silently.
+ */
+export class SubAgentRecursionError extends Error {
+	constructor(depth: number) {
+		super(
+			`Sub-agents are one-level-deep only; caller passed parentDepth=${depth}. ` +
+				`MAX_SUB_AGENT_DEPTH=${MAX_SUB_AGENT_DEPTH}.`
+		);
+		this.name = 'SubAgentRecursionError';
+	}
+}
+
+function defaultSystemPrompt(type: SubAgentType): string {
+	const base =
+		'Du bist ein Sub-Agent. Fuehre genau die Aufgabe aus, die dir der Parent ' +
+		'Agent gibt, und liefere eine knappe Summary am Ende. Keine Seitendiskussion.';
+	if (type === 'research') {
+		return (
+			base +
+			'\n\nArchetyp: research. Du darfst nur Lese-Tools verwenden. Schreibe ' +
+			'nichts. Ergebnis = Summary deiner Funde in maximal 10 Zeilen.'
+		);
+	}
+	if (type === 'plan') {
+		return (
+			base +
+			'\n\nArchetyp: plan. Keine Tool-Calls wenn moeglich. Denke durch die ' +
+			'Aufgabe und formuliere einen strukturierten Plan (3-5 Schritte) als ' +
+			'Summary.'
+		);
+	}
+	return (
+		base +
+		'\n\nArchetyp: general. Nutze Tools wie ein Parent-Agent es tun wuerde, ' +
+		'aber halte die Summary auf das Wesentliche beschraenkt.'
+	);
+}
+
+/**
+ * Launch an in-process sub-agent, enforcing the four invariants from the module docstring.
+ * Throws `SubAgentRecursionError` unless `parentDepth` < `MAX_SUB_AGENT_DEPTH`; `Error` if no model.
+ *
+ * The returned `summary` is the single artifact that should cross back
+ * to the parent (typically as a `task` tool-result message). Everything
+ * else (`rawResult`, individual tool calls) is kept for the caller's
+ * own debug log but NEVER rendered into the parent's messages array.
+ */
+export async function runSubAgent(input: RunSubAgentInput): Promise<SubAgentResult> {
+	if (!(input.parentDepth < MAX_SUB_AGENT_DEPTH)) { // fail closed: NaN/undefined must not pass
+		throw new SubAgentRecursionError(input.parentDepth);
+	}
+
+	const filter = input.toolFilter ?? DEFAULT_TOOL_FILTERS[input.type];
+	const restrictedTools = input.parentTools.filter(filter);
+	const maxRounds = input.maxRounds ?? DEFAULT_MAX_ROUNDS[input.type];
+	const systemPrompt = input.systemPrompt ?? defaultSystemPrompt(input.type);
+	const model = input.model ?? '';
+
+	// The loop requires a model string; surface a clear error rather
+	// than letting the LLM client fail with a cryptic provider error.
+	if (!model) {
+		throw new Error(
+			`runSubAgent: no model supplied. Pass input.model explicitly — sub-agents ` +
+				`default to nothing on purpose so routing to a cheaper tier (Haiku) is ` +
+				`an explicit decision by the caller.`
+		);
+	}
+
+	const rawResult = await runPlannerLoop({
+		llm: input.llm,
+		input: {
+			systemPrompt,
+			userPrompt: input.task,
+			tools: restrictedTools,
+			model,
+			maxRounds,
+			reminderChannel: input.reminderChannel,
+			// No compactor for sub-agents: they are short-lived by
+			// default (type defaults cap at 5 rounds). If the caller needs a
+			// deeper sub-agent, lift that decision up — don't double
+			// the LLM call count inside a disposable context.
+		},
+		onToolCall: async (call: ToolCallRequest): Promise<ToolResult> => {
+			// Belt-and-suspenders: even though `restrictedTools` was
+			// passed to the loop, a buggy LLM response could still
+			// name a tool outside the whitelist. Reject it here so the
+			// caller's dispatcher never runs an unauthorised tool.
+			const isWhitelisted = restrictedTools.some((t) => t.name === call.name);
+			if (!isWhitelisted) {
+				return {
+					success: false,
+					message:
+						`Tool ${call.name} ist fuer diesen Sub-Agent (${input.type}) ` +
+						`nicht freigegeben. Wechsel die Strategie oder brich ab.`,
+				};
+			}
+			return input.onToolCall(call);
+		},
+	});
+
+	const summary =
+		rawResult.summary ??
+		`(Sub-Agent ${input.type} beendet nach ${rawResult.rounds} Runden ohne Summary.)`;
+
+	return {
+		type: input.type,
+		summary,
+		rawResult,
+		usage: rawResult.usage,
+		availableToolCount: restrictedTools.length,
+	};
+}