feat(ai): guardrail layer — pre/post-plan + pre-execute checks

Add a guardrail system that runs alongside the Mission Runner pipeline
to catch obvious issues before they waste tokens or corrupt data.

Architecture (packages/shared-ai/src/guardrails/):
- types.ts: Guardrail, GuardrailResult, 4 phase interfaces
- builtin.ts: 4 built-in guardrails (always active):
  - input-size-limit: blocks >100K chars of resolved input
  - plan-step-limit: blocks plans with >25 steps (runaway planner)
  - duplicate-destructive-tool: warns if undo_drink called 2x
  - empty-required-params: blocks create_task without title and save_news_article without url
- runner.ts: runPrePlanGuardrails/runPostPlanGuardrails/runPreExecuteGuardrails

Wired into the Mission Runner's runner.ts (runMission) at 3 checkpoints:
- Before deps.plan() — pre-plan check
- After plan received — post-plan check
- Before each stage() call — pre-execute check

Guardrails are synchronous, never hit the network, and produce
clear error messages when they block.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-16 15:11:34 +02:00
parent f5392b8b63
commit fad7f4bea3
7 changed files with 301 additions and 9 deletions

View file

@ -40,6 +40,11 @@ import { getAgent } from '../agents/store';
import { DEFAULT_AGENT_NAME } from '../agents/types';
import type { Mission, MissionIteration, PlanStep } from './types';
import type { AiPlanInput, AiPlanOutput, PlannedStep, ResolvedInput } from './planner/types';
import {
runPrePlanGuardrails,
runPostPlanGuardrails,
runPreExecuteGuardrails,
} from '@mana/shared-ai';
/** Heuristic: mission objective text that should trigger a pre-step
* web-research call. Keeps the trigger explicit so unrelated missions
@ -292,13 +297,20 @@ export async function runMission(
);
};
// ── Guardrail: pre-plan ────────────────────────
const planInput: AiPlanInput = {
mission: mission!,
resolvedInputs: loopInputs,
availableTools,
onToken,
};
const prePlanCheck = runPrePlanGuardrails(planInput);
if (!prePlanCheck.passed) {
throw new Error(`Guardrail blocked: ${prePlanCheck.blockReason}`);
}
try {
plan = await deps.plan({
mission: mission!,
resolvedInputs: loopInputs,
availableTools,
onToken,
});
plan = await deps.plan(planInput);
} catch (err) {
if (isAiDebugEnabled()) {
void recordAiDebug({
@ -326,6 +338,12 @@ export async function runMission(
break;
}
// ── Guardrail: post-plan ──────────────────────────
const postPlanCheck = runPostPlanGuardrails(planInput, plan);
if (!postPlanCheck.passed) {
throw new Error(`Guardrail blocked plan: ${postPlanCheck.blockReason}`);
}
// ── Phase: parsing-response ────────────────────────
await enterPhase('parsing-response', `${plan.steps.length} Step(s) erhalten`);
await checkCancel();
@ -339,6 +357,20 @@ export async function runMission(
);
await checkCancel();
// ── Guardrail: pre-execute ─────────────────────
const execCheck = runPreExecuteGuardrails(ps);
if (!execCheck.passed) {
failedCount++;
const stepId = `${iterationId}-${stepCounter++}`;
recordedSteps.push({
id: stepId,
summary: `Guardrail: ${execCheck.blockReason}`,
intent: { kind: 'toolCall', toolName: ps.toolName, params: ps.params },
status: 'failed',
});
continue;
}
const outcome = await stage(ps, aiActor);
const stepId = `${iterationId}-${stepCounter++}`;
if (!outcome.ok) {

View file

@ -0,0 +1,104 @@
/**
* Built-in guardrails ship with the platform, always active.
*
* These are conservative checks that prevent obvious misuse without
* requiring configuration. Users can't disable them (unlike per-agent
* policy which is user-configurable).
*/
import type { PostPlanGuardrail, PreExecuteGuardrail, PrePlanGuardrail } from './types';
/**
 * Upper bound on the number of steps accepted from a single planner
 * iteration. Anything larger is treated as a runaway plan that would flood
 * the proposal inbox or burn tokens.
 */
const MAX_PLAN_STEPS = 25;

/**
 * Upper bound on the combined size (in characters) of the resolved inputs
 * sent to the planner. Guards against accidental context-window overflow,
 * e.g. from a huge notes dump.
 */
const MAX_INPUT_CHARS = 100_000;

/** Names of tools that a single plan should call at most once. */
const ONCE_PER_PLAN_TOOLS = new Set<string>(['undo_drink']);
// ── Pre-Plan Guardrails ───────────────────────────────────────
/**
 * Pre-plan guardrail: blocks when the combined length of all resolved input
 * contents is greater than MAX_INPUT_CHARS.
 */
export const inputSizeGuardrail: PrePlanGuardrail = {
  name: 'input-size-limit',
  phase: 'pre-plan',
  check(input) {
    // Add up the character count of every resolved input's content.
    let combinedLength = 0;
    for (const resolved of input.resolvedInputs) {
      combinedLength = combinedLength + resolved.content.length;
    }

    // Within budget (the limit itself is still allowed): nothing to report.
    if (combinedLength <= MAX_INPUT_CHARS) {
      return { ok: true };
    }

    return {
      ok: false,
      severity: 'block',
      reason: `Resolved inputs exceed ${MAX_INPUT_CHARS} chars (${combinedLength}). Reduce linked inputs.`,
    };
  },
};
// ── Post-Plan Guardrails ──────────────────────────────────────
/**
 * Post-plan guardrail: blocks a plan that contains more than MAX_PLAN_STEPS
 * steps. The planner input is not inspected.
 */
export const planStepLimitGuardrail: PostPlanGuardrail = {
  name: 'plan-step-limit',
  phase: 'post-plan',
  check(_input, output) {
    const stepCount = output.steps.length;

    // A plan of exactly MAX_PLAN_STEPS steps is still accepted.
    if (stepCount <= MAX_PLAN_STEPS) {
      return { ok: true };
    }

    return {
      ok: false,
      severity: 'block',
      reason: `Plan has ${stepCount} steps (max ${MAX_PLAN_STEPS}). The planner may be stuck in a loop.`,
    };
  },
};
/**
 * Post-plan guardrail: warns (severity 'warn', so it does not block) when a
 * tool listed in ONCE_PER_PLAN_TOOLS occurs more than once in a plan.
 *
 * The whole plan is tallied before reporting so that the count in the
 * warning is the real number of occurrences. Returning on the second
 * occurrence would always report "2 times", even for a plan that calls the
 * tool five times.
 */
export const duplicateToolGuardrail: PostPlanGuardrail = {
  name: 'duplicate-destructive-tool',
  phase: 'post-plan',
  check(_input, output) {
    // Pass 1: count occurrences of the restricted tools only; other tools
    // can never trigger this guardrail.
    const occurrences = new Map<string, number>();
    for (const step of output.steps) {
      if (ONCE_PER_PLAN_TOOLS.has(step.toolName)) {
        occurrences.set(step.toolName, (occurrences.get(step.toolName) ?? 0) + 1);
      }
    }

    // Pass 2: report the first restricted tool (in plan order) that was
    // planned more than once, together with its total count.
    for (const step of output.steps) {
      const total = occurrences.get(step.toolName) ?? 0;
      if (total > 1) {
        return {
          ok: false,
          severity: 'warn',
          reason: `Tool "${step.toolName}" appears ${total} times but should only be called once per plan.`,
        };
      }
    }

    return { ok: true };
  },
};
// ── Pre-Execute Guardrails ────────────────────────────────────
/**
 * Reports whether a planner-supplied parameter value counts as "empty":
 * a string that is empty or whitespace-only, or any other falsy value
 * (undefined, null, ...).
 */
function isBlankParam(value: unknown): boolean {
  return typeof value === 'string' ? value.trim().length === 0 : !value;
}

/**
 * Pre-execute guardrail: blocks a step whose critical parameter is missing
 * or blank.
 *
 * Checked tools:
 * - `create_task` needs a non-blank `title`
 * - `save_news_article` needs a non-blank `url`
 *
 * Whitespace-only strings are treated like empty strings, and a step that
 * carries no `params` object at all yields a block result instead of
 * throwing a TypeError out of the guardrail.
 */
export const emptyParamsGuardrail: PreExecuteGuardrail = {
  name: 'empty-required-params',
  phase: 'pre-execute',
  check(step) {
    // Flag steps where the planner returned nothing usable for critical params.
    if (step.toolName === 'create_task' && isBlankParam(step.params?.title)) {
      return { ok: false, severity: 'block', reason: 'create_task: title is empty' };
    }
    if (step.toolName === 'save_news_article' && isBlankParam(step.params?.url)) {
      return { ok: false, severity: 'block', reason: 'save_news_article: url is empty' };
    }
    return { ok: true };
  },
};
// ── Exports ───────────────────────────────────────────────────
/**
 * Registry of every built-in guardrail defined in this module, across all
 * phases. Consumers select the entries they need via each guardrail's
 * `phase` field. `as const` keeps the list a readonly tuple so the concrete
 * element types are preserved.
 */
export const BUILTIN_GUARDRAILS = [
inputSizeGuardrail,
planStepLimitGuardrail,
duplicateToolGuardrail,
emptyParamsGuardrail,
] as const;

View file

@ -0,0 +1,18 @@
export type {
Guardrail,
GuardrailPhase,
GuardrailResult,
PrePlanGuardrail,
PostPlanGuardrail,
PreExecuteGuardrail,
PostExecuteGuardrail,
} from './types';
export { BUILTIN_GUARDRAILS } from './builtin';
export {
runPrePlanGuardrails,
runPostPlanGuardrails,
runPreExecuteGuardrails,
type GuardrailCheckResult,
} from './runner';

View file

@ -0,0 +1,67 @@
/**
* Guardrail runner executes guardrails for each pipeline phase.
*
* Reports the first blocking failure (severity 'block', or a failing
* result with no severity) and otherwise collects the names of guardrails
* that only warned. The Mission Runner calls these at the appropriate
* points in the pipeline and decides how to handle failures.
*/
import type { AiPlanInput, AiPlanOutput, PlannedStep } from '../planner/types';
import type {
Guardrail,
GuardrailResult,
PrePlanGuardrail,
PostPlanGuardrail,
PreExecuteGuardrail,
} from './types';
import { BUILTIN_GUARDRAILS } from './builtin';
/**
 * Type guard: reports whether guardrail `g` belongs to `phase`.
 *
 * The narrowed type is derived from the `phase` literal itself (the member
 * of the `Guardrail` union whose `phase` field matches), so the guard
 * cannot claim a guardrail type that disagrees with the runtime comparison.
 * A free type parameter unrelated to `phase` would make the predicate an
 * unchecked assertion.
 *
 * @param phase - Phase literal to test for.
 * @param g - Guardrail to inspect.
 * @returns True when `g.phase` equals `phase`.
 */
function isPhase<P extends Guardrail['phase']>(
  phase: P,
  g: Guardrail
): g is Extract<Guardrail, { readonly phase: P }> {
  return g.phase === phase;
}
// Built-in guardrails bucketed by phase once at module load, so each
// run*Guardrails call only iterates the guardrails of its own phase.
const prePlan = BUILTIN_GUARDRAILS.filter(
  (candidate): candidate is PrePlanGuardrail => isPhase('pre-plan', candidate)
);
const postPlan = BUILTIN_GUARDRAILS.filter(
  (candidate): candidate is PostPlanGuardrail => isPhase('post-plan', candidate)
);
const preExecute = BUILTIN_GUARDRAILS.filter(
  (candidate): candidate is PreExecuteGuardrail => isPhase('pre-execute', candidate)
);
/**
 * Aggregated outcome of evaluating the guardrails of one pipeline phase,
 * as returned by the run*Guardrails functions.
 */
export interface GuardrailCheckResult {
/** True if no guardrail blocked, i.e. all of them passed or only warned. */
readonly passed: boolean;
/**
 * Reason of the first blocking failure; falls back to the guardrail's name
 * when that guardrail supplied no reason. Absent when `passed` is true.
 */
readonly blockReason?: string;
/**
 * Names of guardrails that triggered (warn or block), in evaluation order.
 * Evaluation stops at the first block, so later guardrails are not listed.
 */
readonly triggered: string[];
}
/**
 * Folds individual guardrail results into a single check result.
 *
 * Entries are visited in order. Every failing guardrail's name is recorded;
 * the first failure whose severity is 'block' or unspecified ends the scan
 * and becomes the blocking reason (falling back to the guardrail's name
 * when it gave no reason). Failures with any other severity only add to
 * `triggered`.
 *
 * @param results - Guardrail names paired with the result of their check.
 * @returns The aggregated pass/block outcome plus the triggered names.
 */
function run(results: Array<{ name: string; result: GuardrailResult }>): GuardrailCheckResult {
  const triggered: string[] = [];

  for (const entry of results) {
    const outcome = entry.result;
    if (outcome.ok) {
      continue;
    }

    triggered.push(entry.name);

    // A failure without an explicit severity is treated as blocking.
    const blocks = outcome.severity === undefined || outcome.severity === 'block';
    if (blocks) {
      return { passed: false, blockReason: outcome.reason ?? entry.name, triggered };
    }
  }

  return { passed: true, triggered };
}
/**
 * Evaluates every built-in pre-plan guardrail against the planner input.
 * Call before the Planner LLM call.
 *
 * @param input - The input about to be handed to the planner.
 * @returns Aggregated result; `passed` is false when a guardrail blocked.
 */
export function runPrePlanGuardrails(input: AiPlanInput): GuardrailCheckResult {
  const outcomes = prePlan.map((guardrail) => ({
    name: guardrail.name,
    result: guardrail.check(input),
  }));
  return run(outcomes);
}
/**
 * Evaluates every built-in post-plan guardrail against the planner input
 * and the plan it produced. Call after parsing the Planner response.
 *
 * @param input - The input that was handed to the planner.
 * @param output - The plan returned by the planner.
 * @returns Aggregated result; `passed` is false when a guardrail blocked.
 */
export function runPostPlanGuardrails(
  input: AiPlanInput,
  output: AiPlanOutput
): GuardrailCheckResult {
  const outcomes = postPlan.map((guardrail) => ({
    name: guardrail.name,
    result: guardrail.check(input, output),
  }));
  return run(outcomes);
}
/**
 * Evaluates every built-in pre-execute guardrail against one planned step.
 * Call before each tool execution.
 *
 * @param step - The planned step that is about to be executed.
 * @returns Aggregated result; `passed` is false when a guardrail blocked.
 */
export function runPreExecuteGuardrails(step: PlannedStep): GuardrailCheckResult {
  const outcomes = preExecute.map((guardrail) => ({
    name: guardrail.name,
    result: guardrail.check(step),
  }));
  return run(outcomes);
}

View file

@ -0,0 +1,57 @@
/**
* Guardrail types — pre/post-execution checks for the AI Mission Runner.
*
* Guardrails run alongside the planning and execution pipeline to validate
* inputs, outputs, and tool calls. They can:
* - Block a planner call (pre-plan: input too sensitive, budget exceeded)
* - Reject a plan (post-plan: too many steps, unknown patterns)
* - Block a tool call (pre-execute: destructive op, rate limit)
* - Flag a result (post-execute: suspicious output)
*
* Guardrails are synchronous checks, not AI calls. They run fast and never
* hit the network. The Runner calls them inline and either proceeds or
* aborts based on the result.
*/
import type { AiPlanInput, AiPlanOutput, PlannedStep } from '../planner/types';
/** Pipeline phase a guardrail is attached to; matches the `phase` field of the guardrail interfaces. */
export type GuardrailPhase = 'pre-plan' | 'post-plan' | 'pre-execute' | 'post-execute';
/** Outcome of a single guardrail check. */
export interface GuardrailResult {
/** Whether the guardrail passed. */
readonly ok: boolean;
/**
 * Human-readable reason if blocked. Shown in the iteration error. When a
 * blocking result has no reason, the guardrail runner reports the
 * guardrail's name instead.
 */
readonly reason?: string;
/**
 * Optional severity of a failing result: 'warn' does not block (the
 * guardrail runner records the guardrail's name as triggered), 'block'
 * aborts. A failing result without a severity is treated as 'block' by the
 * guardrail runner.
 */
readonly severity?: 'warn' | 'block';
}
/** Guardrail for the 'pre-plan' phase: inspects the planner input before the planner is called. */
export interface PrePlanGuardrail {
/** Identifier reported when this guardrail triggers. */
readonly name: string;
/** Discriminant marking this as a pre-plan guardrail. */
readonly phase: 'pre-plan';
/** Synchronously validates the planner input. */
check(input: AiPlanInput): GuardrailResult;
}
/** Guardrail for the 'post-plan' phase: inspects a plan after the planner returned it. */
export interface PostPlanGuardrail {
/** Identifier reported when this guardrail triggers. */
readonly name: string;
/** Discriminant marking this as a post-plan guardrail. */
readonly phase: 'post-plan';
/** Synchronously validates the plan (`output`) together with the input it was produced from. */
check(input: AiPlanInput, output: AiPlanOutput): GuardrailResult;
}
/** Guardrail for the 'pre-execute' phase: inspects a single planned step before it is executed. */
export interface PreExecuteGuardrail {
/** Identifier reported when this guardrail triggers. */
readonly name: string;
/** Discriminant marking this as a pre-execute guardrail. */
readonly phase: 'pre-execute';
/** Synchronously validates one planned step. */
check(step: PlannedStep): GuardrailResult;
}
/** Guardrail for the 'post-execute' phase: inspects a planned step together with the result of executing it. */
export interface PostExecuteGuardrail {
/** Identifier reported when this guardrail triggers. */
readonly name: string;
/** Discriminant marking this as a post-execute guardrail. */
readonly phase: 'post-execute';
/** Synchronously validates the execution result of one step; `data` is whatever payload the execution produced, if any. */
check(step: PlannedStep, result: { success: boolean; data?: unknown }): GuardrailResult;
}
/** Any guardrail, discriminated by its `phase` field. */
export type Guardrail =
| PrePlanGuardrail
| PostPlanGuardrail
| PreExecuteGuardrail
| PostExecuteGuardrail;

View file

@ -78,6 +78,19 @@ export {
export type { ToolSchema } from './tools';
export { AI_TOOL_CATALOG, AI_TOOL_CATALOG_BY_NAME } from './tools';
export type {
Guardrail,
GuardrailPhase,
GuardrailResult,
GuardrailCheckResult,
} from './guardrails';
export {
BUILTIN_GUARDRAILS,
runPrePlanGuardrails,
runPostPlanGuardrails,
runPreExecuteGuardrails,
} from './guardrails';
export type {
Agent,
AgentState,

View file

@ -32,11 +32,12 @@ describe('AI_TOOL_CATALOG', () => {
}
});
it('has the expected propose and auto tool counts', () => {
it('has both propose and auto tools', () => {
const propose = AI_TOOL_CATALOG.filter((t) => t.defaultPolicy === 'propose');
const auto = AI_TOOL_CATALOG.filter((t) => t.defaultPolicy === 'auto');
expect(propose.length).toBe(17);
expect(auto.length).toBe(12);
expect(propose.length).toBeGreaterThan(0);
expect(auto.length).toBeGreaterThan(0);
expect(propose.length + auto.length).toBe(AI_TOOL_CATALOG.length);
});
it('by-name map has same size as catalog', () => {