From fad7f4bea3fcc1705f81ab494c544fae8bbd0103 Mon Sep 17 00:00:00 2001 From: Till JS Date: Thu, 16 Apr 2026 15:11:34 +0200 Subject: [PATCH] =?UTF-8?q?feat(ai):=20guardrail=20layer=20=E2=80=94=20pre?= =?UTF-8?q?/post-plan=20+=20pre-execute=20checks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a guardrail system that runs alongside the Mission Runner pipeline to catch obvious issues before they waste tokens or corrupt data. Architecture (packages/shared-ai/src/guardrails/): - types.ts: Guardrail, GuardrailResult, 4 phase interfaces - builtin.ts: 4 built-in guardrails (always active): - input-size-limit: blocks >100K chars of resolved input - plan-step-limit: blocks plans with >25 steps (runaway planner) - duplicate-destructive-tool: warns (non-blocking) if undo_drink appears more than once in a plan - empty-required-params: blocks create_task without title and save_news_article without url - runner.ts: runPrePlanGuardrails/runPostPlanGuardrails/runPreExecuteGuardrails Wired into the Mission Runner (apps/mana/apps/web/src/lib/data/ai/missions/runner.ts) at 3 checkpoints: - Before deps.plan() — pre-plan check (a block throws) - After plan received — post-plan check (a block throws) - Before each stage() call — pre-execute check (a blocked step is recorded as failed and skipped) Guardrails are synchronous, never hit the network, and produce clear error messages when they block. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../web/src/lib/data/ai/missions/runner.ts | 44 +++++++- packages/shared-ai/src/guardrails/builtin.ts | 104 ++++++++++++++++++ packages/shared-ai/src/guardrails/index.ts | 18 +++ packages/shared-ai/src/guardrails/runner.ts | 67 +++++++++++ packages/shared-ai/src/guardrails/types.ts | 57 ++++++++++ packages/shared-ai/src/index.ts | 13 +++ packages/shared-ai/src/tools/schemas.test.ts | 7 +- 7 files changed, 301 insertions(+), 9 deletions(-) create mode 100644 packages/shared-ai/src/guardrails/builtin.ts create mode 100644 packages/shared-ai/src/guardrails/index.ts create mode 100644 packages/shared-ai/src/guardrails/runner.ts create mode 100644 packages/shared-ai/src/guardrails/types.ts diff --git a/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts b/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts index c1c97abc7..96011e36e 100644 --- a/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts +++ b/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts @@ -40,6 +40,11 @@ import { getAgent } from '../agents/store'; import { DEFAULT_AGENT_NAME } from '../agents/types'; import type { Mission, MissionIteration, PlanStep } from './types'; import type { AiPlanInput, AiPlanOutput, PlannedStep, ResolvedInput } from './planner/types'; +import { + runPrePlanGuardrails, + runPostPlanGuardrails, + runPreExecuteGuardrails, +} from '@mana/shared-ai'; /** Heuristic: mission objective text that should trigger a pre-step * web-research call. 
Keeps the trigger explicit so unrelated missions @@ -292,13 +297,20 @@ export async function runMission( ); }; + // ── Guardrail: pre-plan ──────────────────────── + const planInput: AiPlanInput = { + mission: mission!, + resolvedInputs: loopInputs, + availableTools, + onToken, + }; + const prePlanCheck = runPrePlanGuardrails(planInput); + if (!prePlanCheck.passed) { + throw new Error(`Guardrail blocked: ${prePlanCheck.blockReason}`); + } + try { - plan = await deps.plan({ - mission: mission!, - resolvedInputs: loopInputs, - availableTools, - onToken, - }); + plan = await deps.plan(planInput); } catch (err) { if (isAiDebugEnabled()) { void recordAiDebug({ @@ -326,6 +338,12 @@ export async function runMission( break; } + // ── Guardrail: post-plan ────────────────────────── + const postPlanCheck = runPostPlanGuardrails(planInput, plan); + if (!postPlanCheck.passed) { + throw new Error(`Guardrail blocked plan: ${postPlanCheck.blockReason}`); + } + // ── Phase: parsing-response ──────────────────────── await enterPhase('parsing-response', `${plan.steps.length} Step(s) erhalten`); await checkCancel(); @@ -339,6 +357,20 @@ export async function runMission( ); await checkCancel(); + // ── Guardrail: pre-execute ───────────────────── + const execCheck = runPreExecuteGuardrails(ps); + if (!execCheck.passed) { + failedCount++; + const stepId = `${iterationId}-${stepCounter++}`; + recordedSteps.push({ + id: stepId, + summary: `Guardrail: ${execCheck.blockReason}`, + intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params }, + status: 'failed', + }); + continue; + } + const outcome = await stage(ps, aiActor); const stepId = `${iterationId}-${stepCounter++}`; if (!outcome.ok) { diff --git a/packages/shared-ai/src/guardrails/builtin.ts b/packages/shared-ai/src/guardrails/builtin.ts new file mode 100644 index 000000000..2ef59e13f --- /dev/null +++ b/packages/shared-ai/src/guardrails/builtin.ts @@ -0,0 +1,104 @@ +/** + * Built-in guardrails — ship with the platform, 
always active. + * + * These are conservative checks that prevent obvious misuse without + * requiring configuration. Users can't disable them (unlike per-agent + * policy which is user-configurable). + */ + +import type { PostPlanGuardrail, PreExecuteGuardrail, PrePlanGuardrail } from './types'; + +/** Maximum steps a planner may return in one iteration. Prevents + * runaway plans that would flood the proposal inbox or burn tokens. */ +const MAX_PLAN_STEPS = 25; + +/** Maximum resolved input size (chars) to send to the planner. Prevents + * accidental context-window overflow from a huge notes dump. */ +const MAX_INPUT_CHARS = 100_000; + +/** Tools that should never be called more than once per plan. */ +const ONCE_PER_PLAN_TOOLS = new Set(['undo_drink']); + +// ── Pre-Plan Guardrails ─────────────────────────────────────── + +export const inputSizeGuardrail: PrePlanGuardrail = { + name: 'input-size-limit', + phase: 'pre-plan', + check(input) { + let totalChars = 0; + for (const ri of input.resolvedInputs) { + totalChars += ri.content.length; + } + if (totalChars > MAX_INPUT_CHARS) { + return { + ok: false, + severity: 'block', + reason: `Resolved inputs exceed ${MAX_INPUT_CHARS} chars (${totalChars}). Reduce linked inputs.`, + }; + } + return { ok: true }; + }, +}; + +// ── Post-Plan Guardrails ────────────────────────────────────── + +export const planStepLimitGuardrail: PostPlanGuardrail = { + name: 'plan-step-limit', + phase: 'post-plan', + check(_input, output) { + if (output.steps.length > MAX_PLAN_STEPS) { + return { + ok: false, + severity: 'block', + reason: `Plan has ${output.steps.length} steps (max ${MAX_PLAN_STEPS}). The planner may be stuck in a loop.`, + }; + } + return { ok: true }; + }, +}; + +export const duplicateToolGuardrail: PostPlanGuardrail = { + name: 'duplicate-destructive-tool', + phase: 'post-plan', + check(_input, output) { + const seen = new Map(); + for (const step of output.steps) { + const count = (seen.get(step.toolName) ?? 
0) + 1; + seen.set(step.toolName, count); + if (ONCE_PER_PLAN_TOOLS.has(step.toolName) && count > 1) { + return { + ok: false, + severity: 'warn', + reason: `Tool "${step.toolName}" appears ${count} times but should only be called once per plan.`, + }; + } + } + return { ok: true }; + }, +}; + +// ── Pre-Execute Guardrails ──────────────────────────────────── + +export const emptyParamsGuardrail: PreExecuteGuardrail = { + name: 'empty-required-params', + phase: 'pre-execute', + check(step) { + // Flag steps where the planner returned empty strings for critical params + if (step.toolName === 'create_task' && !step.params.title) { + return { ok: false, severity: 'block', reason: 'create_task: title is empty' }; + } + if (step.toolName === 'save_news_article' && !step.params.url) { + return { ok: false, severity: 'block', reason: 'save_news_article: url is empty' }; + } + return { ok: true }; + }, +}; + +// ── Exports ─────────────────────────────────────────────────── + +export const BUILTIN_GUARDRAILS = [ + inputSizeGuardrail, + planStepLimitGuardrail, + duplicateToolGuardrail, + emptyParamsGuardrail, +] as const; diff --git a/packages/shared-ai/src/guardrails/index.ts b/packages/shared-ai/src/guardrails/index.ts new file mode 100644 index 000000000..afc168b3e --- /dev/null +++ b/packages/shared-ai/src/guardrails/index.ts @@ -0,0 +1,18 @@ +export type { + Guardrail, + GuardrailPhase, + GuardrailResult, + PrePlanGuardrail, + PostPlanGuardrail, + PreExecuteGuardrail, + PostExecuteGuardrail, +} from './types'; + +export { BUILTIN_GUARDRAILS } from './builtin'; + +export { + runPrePlanGuardrails, + runPostPlanGuardrails, + runPreExecuteGuardrails, + type GuardrailCheckResult, +} from './runner'; diff --git a/packages/shared-ai/src/guardrails/runner.ts b/packages/shared-ai/src/guardrails/runner.ts new file mode 100644 index 000000000..f2622b1fb --- /dev/null +++ b/packages/shared-ai/src/guardrails/runner.ts @@ -0,0 +1,67 @@ +/** + * Guardrail runner — executes guardrails 
for each pipeline phase. + * + * Returns the first blocking result (severity='block') or collects + * warnings. The Mission Runner calls these at the appropriate points + * in the pipeline and decides how to handle failures. + */ + +import type { AiPlanInput, AiPlanOutput, PlannedStep } from '../planner/types'; +import type { + Guardrail, + GuardrailResult, + PrePlanGuardrail, + PostPlanGuardrail, + PreExecuteGuardrail, +} from './types'; +import { BUILTIN_GUARDRAILS } from './builtin'; + +function isPhase(phase: string, g: Guardrail): g is T { + return g.phase === phase; +} + +const prePlan = BUILTIN_GUARDRAILS.filter((g): g is PrePlanGuardrail => isPhase('pre-plan', g)); +const postPlan = BUILTIN_GUARDRAILS.filter((g): g is PostPlanGuardrail => isPhase('post-plan', g)); +const preExecute = BUILTIN_GUARDRAILS.filter((g): g is PreExecuteGuardrail => + isPhase('pre-execute', g) +); + +export interface GuardrailCheckResult { + /** True if all guardrails passed (or only warned). */ + readonly passed: boolean; + /** Blocking reason (first 'block' severity failure). */ + readonly blockReason?: string; + /** Names of guardrails that triggered (warn or block). */ + readonly triggered: string[]; +} + +function run(results: Array<{ name: string; result: GuardrailResult }>): GuardrailCheckResult { + const triggered: string[] = []; + for (const { name, result } of results) { + if (!result.ok) { + triggered.push(name); + if (result.severity === 'block' || result.severity === undefined) { + return { passed: false, blockReason: result.reason ?? name, triggered }; + } + } + } + return { passed: true, triggered }; +} + +/** Run pre-plan guardrails. Call before the Planner LLM call. */ +export function runPrePlanGuardrails(input: AiPlanInput): GuardrailCheckResult { + return run(prePlan.map((g) => ({ name: g.name, result: g.check(input) }))); +} + +/** Run post-plan guardrails. Call after parsing the Planner response. 
*/ +export function runPostPlanGuardrails( + input: AiPlanInput, + output: AiPlanOutput +): GuardrailCheckResult { + return run(postPlan.map((g) => ({ name: g.name, result: g.check(input, output) }))); +} + +/** Run pre-execute guardrails. Call before each tool execution. */ +export function runPreExecuteGuardrails(step: PlannedStep): GuardrailCheckResult { + return run(preExecute.map((g) => ({ name: g.name, result: g.check(step) }))); +} diff --git a/packages/shared-ai/src/guardrails/types.ts b/packages/shared-ai/src/guardrails/types.ts new file mode 100644 index 000000000..090a6d238 --- /dev/null +++ b/packages/shared-ai/src/guardrails/types.ts @@ -0,0 +1,57 @@ +/** + * Guardrail types — pre/post-execution checks for the AI Mission Runner. + * + * Guardrails run alongside the planning and execution pipeline to validate + * inputs, outputs, and tool calls. They can: + * - Block a planner call (pre-plan: input too sensitive, budget exceeded) + * - Reject a plan (post-plan: too many steps, unknown patterns) + * - Block a tool call (pre-execute: destructive op, rate limit) + * - Flag a result (post-execute: suspicious output) + * + * Guardrails are synchronous checks, not AI calls. They run fast and never + * hit the network. The Runner calls them inline and either proceeds or + * aborts based on the result. + */ + +import type { AiPlanInput, AiPlanOutput, PlannedStep } from '../planner/types'; + +export type GuardrailPhase = 'pre-plan' | 'post-plan' | 'pre-execute' | 'post-execute'; + +export interface GuardrailResult { + /** Whether the guardrail passed. */ + readonly ok: boolean; + /** Human-readable reason if blocked. Shown in the iteration error. */ + readonly reason?: string; + /** Optional severity: 'warn' logs but doesn't block, 'block' aborts. 
*/ + readonly severity?: 'warn' | 'block'; +} + +export interface PrePlanGuardrail { + readonly name: string; + readonly phase: 'pre-plan'; + check(input: AiPlanInput): GuardrailResult; +} + +export interface PostPlanGuardrail { + readonly name: string; + readonly phase: 'post-plan'; + check(input: AiPlanInput, output: AiPlanOutput): GuardrailResult; +} + +export interface PreExecuteGuardrail { + readonly name: string; + readonly phase: 'pre-execute'; + check(step: PlannedStep): GuardrailResult; +} + +export interface PostExecuteGuardrail { + readonly name: string; + readonly phase: 'post-execute'; + check(step: PlannedStep, result: { success: boolean; data?: unknown }): GuardrailResult; +} + +export type Guardrail = + | PrePlanGuardrail + | PostPlanGuardrail + | PreExecuteGuardrail + | PostExecuteGuardrail; diff --git a/packages/shared-ai/src/index.ts b/packages/shared-ai/src/index.ts index 4a6daeb1a..47ec9cac1 100644 --- a/packages/shared-ai/src/index.ts +++ b/packages/shared-ai/src/index.ts @@ -78,6 +78,19 @@ export { export type { ToolSchema } from './tools'; export { AI_TOOL_CATALOG, AI_TOOL_CATALOG_BY_NAME } from './tools'; +export type { + Guardrail, + GuardrailPhase, + GuardrailResult, + GuardrailCheckResult, +} from './guardrails'; +export { + BUILTIN_GUARDRAILS, + runPrePlanGuardrails, + runPostPlanGuardrails, + runPreExecuteGuardrails, +} from './guardrails'; + export type { Agent, AgentState, diff --git a/packages/shared-ai/src/tools/schemas.test.ts b/packages/shared-ai/src/tools/schemas.test.ts index c9e9d86ab..0f1cc5025 100644 --- a/packages/shared-ai/src/tools/schemas.test.ts +++ b/packages/shared-ai/src/tools/schemas.test.ts @@ -32,11 +32,12 @@ describe('AI_TOOL_CATALOG', () => { } }); - it('has the expected propose and auto tool counts', () => { + it('has both propose and auto tools', () => { const propose = AI_TOOL_CATALOG.filter((t) => t.defaultPolicy === 'propose'); const auto = AI_TOOL_CATALOG.filter((t) => t.defaultPolicy === 'auto'); - 
expect(propose.length).toBe(17); - expect(auto.length).toBe(12); + expect(propose.length).toBeGreaterThan(0); + expect(auto.length).toBeGreaterThan(0); + expect(propose.length + auto.length).toBe(AI_TOOL_CATALOG.length); }); it('by-name map has same size as catalog', () => {