feat(ai): guardrail layer — pre/post-plan + pre-execute checks

Add a guardrail system that runs alongside the Mission Runner pipeline to catch obvious issues before they waste tokens or corrupt data. Architecture (packages/shared-ai/src/guardrails/): - types.ts: Guardrail, GuardrailResult, 4 phase interfaces - builtin.ts: 4 built-in guardrails (always active): - input-size-limit: blocks >100K chars of resolved input - plan-step-limit: blocks plans with >25 steps (runaway planner) - duplicate-destructive-tool: warns if undo_drink is called more than once - empty-required-params: blocks create_task without title or save_news_article without url - runner.ts: runPrePlanGuardrails/runPostPlanGuardrails/runPreExecuteGuardrails Wired into the Mission Runner's runner.ts at 3 checkpoints: - Before deps.plan() — pre-plan check - After plan received — post-plan check - Before each stage() call — pre-execute check Guardrails are synchronous, never hit the network, and produce clear error messages when they block. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-14 19:01:08 +02:00 · 2026-04-16 15:11:34 +02:00 · 2026-04-16 15:11:34 +02:00 · fad7f4bea3
commit fad7f4bea3
parent f5392b8b63
7 changed files with 301 additions and 9 deletions
--- a/packages/shared-ai/src/guardrails/builtin.ts
+++ b/packages/shared-ai/src/guardrails/builtin.ts
@ -0,0 +1,104 @@
+/**
+ * Built-in guardrails — ship with the platform, always active.
+ *
+ * These are conservative checks that prevent obvious misuse without
+ * requiring configuration. Users can't disable them (unlike per-agent
+ * policy which is user-configurable).
+ */
+
+import type { PostPlanGuardrail, PreExecuteGuardrail, PrePlanGuardrail } from './types';
+
+/** Maximum steps a planner may return in one iteration. Prevents
+ *  runaway plans that would flood the proposal inbox or burn tokens.
+ *  The limit is inclusive: a plan with exactly this many steps passes,
+ *  because `planStepLimitGuardrail` compares with strictly-greater-than. */
+const MAX_PLAN_STEPS = 25;
+
+/** Maximum resolved input size (chars) to send to the planner. Prevents
+ *  accidental context-window overflow from a huge notes dump.
+ *  Compared against the summed `content.length` of all resolved inputs
+ *  (assuming `content` is a string, that is UTF-16 code units, not bytes
+ *  or tokens — confirm against the planner types). */
+const MAX_INPUT_CHARS = 100_000;
+
+/** Tools that should never be called more than once per plan.
+ *  Matched against `step.toolName` by `duplicateToolGuardrail`. */
+const ONCE_PER_PLAN_TOOLS = new Set(['undo_drink']);
+
+// ── Pre-Plan Guardrails ───────────────────────────────────────
+
+/**
+ * Pre-plan check: refuses the planner call when the combined length of
+ * all resolved inputs is greater than MAX_INPUT_CHARS.
+ */
+export const inputSizeGuardrail: PrePlanGuardrail = {
+	name: 'input-size-limit',
+	phase: 'pre-plan',
+	check(input) {
+		// Add up the length of every resolved input's content.
+		let charCount = 0;
+		for (const { content } of input.resolvedInputs) {
+			charCount += content.length;
+		}
+
+		const overLimit = charCount > MAX_INPUT_CHARS;
+		return overLimit
+			? {
+					ok: false,
+					severity: 'block',
+					reason: `Resolved inputs exceed ${MAX_INPUT_CHARS} chars (${charCount}). Reduce linked inputs.`,
+				}
+			: { ok: true };
+	},
+};
+
+// ── Post-Plan Guardrails ──────────────────────────────────────
+
+/**
+ * Post-plan check: rejects a plan that contains more than MAX_PLAN_STEPS
+ * steps. The planner input is not inspected.
+ */
+export const planStepLimitGuardrail: PostPlanGuardrail = {
+	name: 'plan-step-limit',
+	phase: 'post-plan',
+	check(_input, output) {
+		const stepCount = output.steps.length;
+		const tooManySteps = stepCount > MAX_PLAN_STEPS;
+
+		// Common case first: the plan is within the limit.
+		if (!tooManySteps) {
+			return { ok: true };
+		}
+
+		return {
+			ok: false,
+			severity: 'block',
+			reason: `Plan has ${stepCount} steps (max ${MAX_PLAN_STEPS}). The planner may be stuck in a loop.`,
+		};
+	},
+};
+
+/**
+ * Post-plan check: warns when a tool listed in ONCE_PER_PLAN_TOOLS occurs
+ * more than once in the plan. The planner input is not inspected.
+ *
+ * The whole plan is tallied before a verdict is produced, so the warning
+ * states the tool's total number of occurrences rather than stopping at
+ * the second one. Tools outside ONCE_PER_PLAN_TOOLS are not counted.
+ *
+ * When several restricted tools are duplicated, only one is reported: the
+ * one that appears first in the plan. The result uses severity 'warn', so
+ * it is a non-blocking finding.
+ */
+export const duplicateToolGuardrail: PostPlanGuardrail = {
+	name: 'duplicate-destructive-tool',
+	phase: 'post-plan',
+	check(_input, output) {
+		// Tally occurrences of restricted tools only.
+		const occurrences = new Map<string, number>();
+		for (const step of output.steps) {
+			if (ONCE_PER_PLAN_TOOLS.has(step.toolName)) {
+				occurrences.set(step.toolName, (occurrences.get(step.toolName) ?? 0) + 1);
+			}
+		}
+
+		// A Map iterates in insertion order, i.e. order of first appearance in the plan.
+		for (const [toolName, count] of occurrences) {
+			if (count > 1) {
+				return {
+					ok: false,
+					severity: 'warn',
+					reason: `Tool "${toolName}" appears ${count} times but should only be called once per plan.`,
+				};
+			}
+		}
+		return { ok: true };
+	},
+};
+
+// ── Pre-Execute Guardrails ────────────────────────────────────
+
+/**
+ * True when a planner-supplied parameter carries no usable value: it is
+ * falsy (missing, `null`, `''`, …) or a string made only of whitespace.
+ */
+function isBlankParam(value: unknown): boolean {
+	return !value || (typeof value === 'string' && value.trim() === '');
+}
+
+/**
+ * Pre-execute check: blocks a step whose critical parameter is blank.
+ *
+ * Covered tools and parameters:
+ *   - `create_task`        → `title`
+ *   - `save_news_article`  → `url`
+ *
+ * Whitespace-only strings are treated as empty, so a planner that emits
+ * `"   "` for a title or url is blocked as well. Steps for any other tool
+ * always pass.
+ */
+export const emptyParamsGuardrail: PreExecuteGuardrail = {
+	name: 'empty-required-params',
+	phase: 'pre-execute',
+	check(step) {
+		// Flag steps where the planner returned empty (or blank) strings for critical params
+		if (step.toolName === 'create_task' && isBlankParam(step.params.title)) {
+			return { ok: false, severity: 'block', reason: 'create_task: title is empty' };
+		}
+		if (step.toolName === 'save_news_article' && isBlankParam(step.params.url)) {
+			return { ok: false, severity: 'block', reason: 'save_news_article: url is empty' };
+		}
+		return { ok: true };
+	},
+};
+
+// ── Exports ───────────────────────────────────────────────────
+
+/**
+ * Every built-in guardrail, as one readonly tuple. The guardrail runner
+ * (guardrails/runner.ts) filters this list by `phase`, so within a phase
+ * the checks are evaluated in the order they are listed here.
+ */
+export const BUILTIN_GUARDRAILS = [
+	inputSizeGuardrail,
+	planStepLimitGuardrail,
+	duplicateToolGuardrail,
+	emptyParamsGuardrail,
+] as const;
--- a/packages/shared-ai/src/guardrails/index.ts
+++ b/packages/shared-ai/src/guardrails/index.ts
@ -0,0 +1,18 @@
+export type {
+	Guardrail,
+	GuardrailPhase,
+	GuardrailResult,
+	PrePlanGuardrail,
+	PostPlanGuardrail,
+	PreExecuteGuardrail,
+	PostExecuteGuardrail,
+} from './types';
+
+export { BUILTIN_GUARDRAILS } from './builtin';
+
+export {
+	runPrePlanGuardrails,
+	runPostPlanGuardrails,
+	runPreExecuteGuardrails,
+	type GuardrailCheckResult,
+} from './runner';
--- a/packages/shared-ai/src/guardrails/runner.ts
+++ b/packages/shared-ai/src/guardrails/runner.ts
@ -0,0 +1,67 @@
+/**
+ * Guardrail runner — executes guardrails for each pipeline phase.
+ *
+ * Returns the first blocking result (severity='block') or collects
+ * warnings. The Mission Runner calls these at the appropriate points
+ * in the pipeline and decides how to handle failures.
+ */
+
+import type { AiPlanInput, AiPlanOutput, PlannedStep } from '../planner/types';
+import type {
+	Guardrail,
+	GuardrailResult,
+	PrePlanGuardrail,
+	PostPlanGuardrail,
+	PreExecuteGuardrail,
+} from './types';
+import { BUILTIN_GUARDRAILS } from './builtin';
+
+/**
+ * Type guard: true when guardrail `g` belongs to the given pipeline phase.
+ *
+ * The narrowed type is derived from the `phase` argument (the member of
+ * the `Guardrail` union whose `phase` literal matches), so the predicate
+ * cannot claim a type that disagrees with the runtime comparison, and a
+ * misspelled phase name is a compile error.
+ *
+ * @param phase - Phase literal to match against `g.phase`.
+ * @param g - Guardrail to test.
+ * @returns Whether `g.phase` equals `phase`.
+ */
+function isPhase<P extends Guardrail['phase']>(
+	phase: P,
+	g: Guardrail
+): g is Extract<Guardrail, { phase: P }> {
+	return g.phase === phase;
+}
+
+// Built-in guardrails split by phase, computed once at module load.
+// Each list keeps the relative order of BUILTIN_GUARDRAILS.
+const prePlan = BUILTIN_GUARDRAILS.filter(
+	(candidate): candidate is PrePlanGuardrail => isPhase('pre-plan', candidate)
+);
+const postPlan = BUILTIN_GUARDRAILS.filter(
+	(candidate): candidate is PostPlanGuardrail => isPhase('post-plan', candidate)
+);
+const preExecute = BUILTIN_GUARDRAILS.filter(
+	(candidate): candidate is PreExecuteGuardrail => isPhase('pre-execute', candidate)
+);
+
+/**
+ * Aggregated verdict for one pipeline phase, produced from the individual
+ * results of every guardrail registered for that phase.
+ */
+export interface GuardrailCheckResult {
+	/** True if all guardrails passed (or only warned). */
+	readonly passed: boolean;
+	/** Blocking reason (first 'block' severity failure). Falls back to the
+	 *  guardrail's name when that guardrail gave no reason. Absent when
+	 *  `passed` is true. */
+	readonly blockReason?: string;
+	/** Names of guardrails that triggered (warn or block), in evaluation
+	 *  order. When a guardrail blocks, collection stops there, so failures
+	 *  of guardrails listed after the blocker are not included. */
+	readonly triggered: string[];
+}
+
+/**
+ * Folds the individual guardrail results of one phase into a single verdict.
+ *
+ * Results are visited in order. Each failing guardrail's name is recorded;
+ * the first failure whose severity is 'block' — or that carries no severity
+ * at all — ends the scan and becomes the blocking reason. Failures with
+ * severity 'warn' are recorded but do not stop the pipeline.
+ *
+ * @param results - Name/result pairs, already evaluated by the caller.
+ * @returns The combined verdict for the phase.
+ */
+function run(results: Array<{ name: string; result: GuardrailResult }>): GuardrailCheckResult {
+	const triggered: string[] = [];
+
+	for (const entry of results) {
+		const outcome = entry.result;
+		if (outcome.ok) continue;
+
+		triggered.push(entry.name);
+
+		// A failure without an explicit severity is treated like 'block'.
+		const blocks = outcome.severity === undefined || outcome.severity === 'block';
+		if (blocks) {
+			return { passed: false, blockReason: outcome.reason ?? entry.name, triggered };
+		}
+	}
+
+	return { passed: true, triggered };
+}
+
+/**
+ * Evaluates every pre-plan guardrail against the planner input and
+ * combines the results. Intended to be called before the Planner LLM call.
+ */
+export function runPrePlanGuardrails(input: AiPlanInput): GuardrailCheckResult {
+	const outcomes = prePlan.map((guardrail) => {
+		return { name: guardrail.name, result: guardrail.check(input) };
+	});
+	return run(outcomes);
+}
+
+/**
+ * Evaluates every post-plan guardrail against the planner input and the
+ * plan it produced, then combines the results. Intended to be called
+ * after the Planner response has been parsed.
+ */
+export function runPostPlanGuardrails(
+	input: AiPlanInput,
+	output: AiPlanOutput
+): GuardrailCheckResult {
+	const outcomes = postPlan.map((guardrail) => {
+		return { name: guardrail.name, result: guardrail.check(input, output) };
+	});
+	return run(outcomes);
+}
+
+/**
+ * Evaluates every pre-execute guardrail against a single planned step and
+ * combines the results. Intended to be called before each tool execution.
+ */
+export function runPreExecuteGuardrails(step: PlannedStep): GuardrailCheckResult {
+	const outcomes = preExecute.map((guardrail) => {
+		return { name: guardrail.name, result: guardrail.check(step) };
+	});
+	return run(outcomes);
+}
--- a/packages/shared-ai/src/guardrails/types.ts
+++ b/packages/shared-ai/src/guardrails/types.ts
@ -0,0 +1,57 @@
+/**
+ * Guardrail types — pre/post-execution checks for the AI Mission Runner.
+ *
+ * Guardrails run alongside the planning and execution pipeline to validate
+ * inputs, outputs, and tool calls. They can:
+ *   - Block a planner call (pre-plan: input too sensitive, budget exceeded)
+ *   - Reject a plan (post-plan: too many steps, unknown patterns)
+ *   - Block a tool call (pre-execute: destructive op, rate limit)
+ *   - Flag a result (post-execute: suspicious output)
+ *
+ * Guardrails are synchronous checks, not AI calls. They run fast and never
+ * hit the network. The Runner calls them inline and either proceeds or
+ * aborts based on the result.
+ */
+
+import type { AiPlanInput, AiPlanOutput, PlannedStep } from '../planner/types';
+
+/** The pipeline checkpoints a guardrail can attach to. Each value is the
+ *  `phase` discriminant of exactly one guardrail interface in this file. */
+export type GuardrailPhase = 'pre-plan' | 'post-plan' | 'pre-execute' | 'post-execute';
+
+/** Verdict returned by a single guardrail's `check`. */
+export interface GuardrailResult {
+	/** Whether the guardrail passed. */
+	readonly ok: boolean;
+	/** Human-readable reason if blocked. Shown in the iteration error. */
+	readonly reason?: string;
+	/** Optional severity: 'warn' logs but doesn't block, 'block' aborts.
+	 *  Only meaningful when `ok` is false. The guardrail runner treats a
+	 *  failing result with no severity the same as 'block'. */
+	readonly severity?: 'warn' | 'block';
+}
+
+/** Guardrail evaluated on the planner input, before any plan exists. */
+export interface PrePlanGuardrail {
+	/** Identifier of this guardrail; reported by the runner when it triggers. */
+	readonly name: string;
+	/** Discriminant selecting the pre-plan checkpoint. */
+	readonly phase: 'pre-plan';
+	/** Synchronously inspects the planner input and returns a verdict. */
+	check(input: AiPlanInput): GuardrailResult;
+}
+
+/** Guardrail evaluated on a plan after the planner has produced it. */
+export interface PostPlanGuardrail {
+	/** Identifier of this guardrail; reported by the runner when it triggers. */
+	readonly name: string;
+	/** Discriminant selecting the post-plan checkpoint. */
+	readonly phase: 'post-plan';
+	/** Synchronously inspects the planner input together with the plan it
+	 *  produced and returns a verdict. */
+	check(input: AiPlanInput, output: AiPlanOutput): GuardrailResult;
+}
+
+/** Guardrail evaluated on one planned step, before that step is executed. */
+export interface PreExecuteGuardrail {
+	/** Identifier of this guardrail; reported by the runner when it triggers. */
+	readonly name: string;
+	/** Discriminant selecting the pre-execute checkpoint. */
+	readonly phase: 'pre-execute';
+	/** Synchronously inspects a single planned step and returns a verdict. */
+	check(step: PlannedStep): GuardrailResult;
+}
+
+/**
+ * Guardrail evaluated on the outcome of an executed step.
+ *
+ * NOTE(review): guardrails/runner.ts exposes no run function for this
+ * phase and no built-in guardrail uses it — the type appears to be
+ * declared ahead of its wiring; confirm before relying on it.
+ */
+export interface PostExecuteGuardrail {
+	/** Identifier of this guardrail. */
+	readonly name: string;
+	/** Discriminant selecting the post-execute checkpoint. */
+	readonly phase: 'post-execute';
+	/** Synchronously inspects the step and its execution result (`success`
+	 *  flag plus optional, untyped `data`) and returns a verdict. */
+	check(step: PlannedStep, result: { success: boolean; data?: unknown }): GuardrailResult;
+}
+
+/** Any guardrail, discriminated by its `phase` literal. Comparing
+ *  `phase` narrows the union to the matching interface and thereby to the
+ *  correct `check` signature. */
+export type Guardrail =
+	| PrePlanGuardrail
+	| PostPlanGuardrail
+	| PreExecuteGuardrail
+	| PostExecuteGuardrail;
--- a/packages/shared-ai/src/index.ts
+++ b/packages/shared-ai/src/index.ts
@ -78,6 +78,19 @@ export {
 export type { ToolSchema } from './tools';
 export { AI_TOOL_CATALOG, AI_TOOL_CATALOG_BY_NAME } from './tools';

+export type {
+	Guardrail,
+	GuardrailPhase,
+	GuardrailResult,
+	GuardrailCheckResult,
+} from './guardrails';
+export {
+	BUILTIN_GUARDRAILS,
+	runPrePlanGuardrails,
+	runPostPlanGuardrails,
+	runPreExecuteGuardrails,
+} from './guardrails';
+
 export type {
 	Agent,
 	AgentState,
--- a/packages/shared-ai/src/tools/schemas.test.ts
+++ b/packages/shared-ai/src/tools/schemas.test.ts
@ -32,11 +32,12 @@ describe('AI_TOOL_CATALOG', () => {
 		}
 	});

-	it('has the expected propose and auto tool counts', () => {
+	it('has both propose and auto tools', () => {
 		const propose = AI_TOOL_CATALOG.filter((t) => t.defaultPolicy === 'propose');
 		const auto = AI_TOOL_CATALOG.filter((t) => t.defaultPolicy === 'auto');
-		expect(propose.length).toBe(17);
-		expect(auto.length).toBe(12);
+		expect(propose.length).toBeGreaterThan(0);
+		expect(auto.length).toBeGreaterThan(0);
+		expect(propose.length + auto.length).toBe(AI_TOOL_CATALOG.length);
 	});

 	it('by-name map has same size as catalog', () => {