mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 19:01:08 +02:00
feat(ai): guardrail layer — pre/post-plan + pre-execute checks
Add a guardrail system that runs alongside the Mission Runner pipeline to catch obvious issues before they waste tokens or corrupt data.

Architecture (packages/shared-ai/src/guardrails/):
- types.ts: Guardrail, GuardrailResult, 4 phase interfaces
- builtin.ts: 4 built-in guardrails (always active):
  - input-size-limit: blocks >100K chars of resolved input
  - plan-step-limit: blocks plans with >25 steps (runaway planner)
  - duplicate-destructive-tool: warns if undo_drink is called more than once in a plan
  - empty-required-params: blocks create_task without title and save_news_article without url
- runner.ts: runPrePlanGuardrails/runPostPlanGuardrails/runPreExecuteGuardrails

Wired into the Mission Runner (its own runner.ts) at 3 checkpoints:
- Before deps.plan() — pre-plan check
- After plan received — post-plan check
- Before each stage() call — pre-execute check

Guardrails are synchronous, never hit the network, and produce clear error messages when they block.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f5392b8b63
commit
fad7f4bea3
7 changed files with 301 additions and 9 deletions
104
packages/shared-ai/src/guardrails/builtin.ts
Normal file
104
packages/shared-ai/src/guardrails/builtin.ts
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
/**
|
||||
* Built-in guardrails — ship with the platform, always active.
|
||||
*
|
||||
* These are conservative checks that prevent obvious misuse without
|
||||
* requiring configuration. Users can't disable them (unlike per-agent
|
||||
* policy which is user-configurable).
|
||||
*/
|
||||
|
||||
import type { PostPlanGuardrail, PreExecuteGuardrail, PrePlanGuardrail } from './types';
|
||||
|
||||
/**
 * Upper bound on the number of steps accepted from a single planner
 * iteration. Larger plans are treated as runaway plans that would flood
 * the proposal inbox or burn tokens.
 */
const MAX_PLAN_STEPS = 25;

/**
 * Upper bound, in characters, on the combined resolved input sent to the
 * planner. Protects against accidental context-window overflow from a
 * huge notes dump.
 */
const MAX_INPUT_CHARS = 100_000;

/** Names of tools that are expected at most once within a single plan. */
const ONCE_PER_PLAN_TOOLS = new Set<string>(['undo_drink']);
|
||||
|
||||
// ── Pre-Plan Guardrails ───────────────────────────────────────
|
||||
|
||||
/**
 * Pre-plan guardrail: blocks the planner call when the summed length of
 * all resolved input contents is greater than MAX_INPUT_CHARS.
 */
export const inputSizeGuardrail: PrePlanGuardrail = {
  name: 'input-size-limit',
  phase: 'pre-plan',
  check(input) {
    // Add up the character count of every resolved input.
    let size = 0;
    for (const { content } of input.resolvedInputs) {
      size += content.length;
    }

    const overLimit = size > MAX_INPUT_CHARS;
    return overLimit
      ? {
          ok: false,
          severity: 'block',
          reason: `Resolved inputs exceed ${MAX_INPUT_CHARS} chars (${size}). Reduce linked inputs.`,
        }
      : { ok: true };
  },
};
|
||||
|
||||
// ── Post-Plan Guardrails ──────────────────────────────────────
|
||||
|
||||
/**
 * Post-plan guardrail: blocks a plan whose step count is greater than
 * MAX_PLAN_STEPS. The planner input is not inspected.
 */
export const planStepLimitGuardrail: PostPlanGuardrail = {
  name: 'plan-step-limit',
  phase: 'post-plan',
  check(_input, output) {
    const stepCount = output.steps.length;
    const tooMany = stepCount > MAX_PLAN_STEPS;
    return tooMany
      ? {
          ok: false,
          severity: 'block',
          reason: `Plan has ${stepCount} steps (max ${MAX_PLAN_STEPS}). The planner may be stuck in a loop.`,
        }
      : { ok: true };
  },
};
|
||||
|
||||
/**
 * Post-plan guardrail: warns (does not block) when a tool listed in
 * ONCE_PER_PLAN_TOOLS occurs more than once in the plan.
 *
 * The tool reported is the first one whose second occurrence is reached
 * while walking the steps in order. The count in the message is the total
 * number of occurrences in the whole plan, not the running count at the
 * moment the duplicate was detected (which would always read "2").
 */
export const duplicateToolGuardrail: PostPlanGuardrail = {
  name: 'duplicate-destructive-tool',
  phase: 'post-plan',
  check(_input, output) {
    const seen = new Set<string>();
    for (const { toolName } of output.steps) {
      // Only once-per-plan tools are of interest; everything else may repeat.
      if (!ONCE_PER_PLAN_TOOLS.has(toolName)) continue;
      if (!seen.has(toolName)) {
        seen.add(toolName);
        continue;
      }

      // Duplicate found: count every occurrence so the warning states the
      // real total rather than the position at which we noticed it.
      let total = 0;
      for (const other of output.steps) {
        if (other.toolName === toolName) total += 1;
      }
      return {
        ok: false,
        severity: 'warn',
        reason: `Tool "${toolName}" appears ${total} times but should only be called once per plan.`,
      };
    }
    return { ok: true };
  },
};
|
||||
|
||||
// ── Pre-Execute Guardrails ────────────────────────────────────
|
||||
|
||||
/**
 * True when a planner-supplied param value counts as empty: any falsy
 * value (missing, `null`, `''`, ...) or a string made only of whitespace.
 */
function isBlankParam(value: unknown): boolean {
  return !value || (typeof value === 'string' && value.trim().length === 0);
}

/**
 * Pre-execute guardrail: blocks a step when the planner left a critical
 * param empty.
 *
 * Checked params:
 * - `create_task` requires `title`
 * - `save_news_article` requires `url`
 *
 * Whitespace-only strings are treated as empty, since they carry no more
 * information than `''` and would otherwise slip past the check.
 */
export const emptyParamsGuardrail: PreExecuteGuardrail = {
  name: 'empty-required-params',
  phase: 'pre-execute',
  check(step) {
    if (step.toolName === 'create_task' && isBlankParam(step.params.title)) {
      return { ok: false, severity: 'block', reason: 'create_task: title is empty' };
    }
    if (step.toolName === 'save_news_article' && isBlankParam(step.params.url)) {
      return { ok: false, severity: 'block', reason: 'save_news_article: url is empty' };
    }
    return { ok: true };
  },
};
|
||||
|
||||
// ── Exports ───────────────────────────────────────────────────
|
||||
|
||||
/**
 * All guardrails that ship with the platform, in the order they are
 * evaluated within their respective phase.
 */
export const BUILTIN_GUARDRAILS = [
  // pre-plan
  inputSizeGuardrail,
  // post-plan
  planStepLimitGuardrail,
  duplicateToolGuardrail,
  // pre-execute
  emptyParamsGuardrail,
] as const;
|
||||
18
packages/shared-ai/src/guardrails/index.ts
Normal file
18
packages/shared-ai/src/guardrails/index.ts
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
export type {
|
||||
Guardrail,
|
||||
GuardrailPhase,
|
||||
GuardrailResult,
|
||||
PrePlanGuardrail,
|
||||
PostPlanGuardrail,
|
||||
PreExecuteGuardrail,
|
||||
PostExecuteGuardrail,
|
||||
} from './types';
|
||||
|
||||
export { BUILTIN_GUARDRAILS } from './builtin';
|
||||
|
||||
export {
|
||||
runPrePlanGuardrails,
|
||||
runPostPlanGuardrails,
|
||||
runPreExecuteGuardrails,
|
||||
type GuardrailCheckResult,
|
||||
} from './runner';
|
||||
67
packages/shared-ai/src/guardrails/runner.ts
Normal file
67
packages/shared-ai/src/guardrails/runner.ts
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
/**
|
||||
* Guardrail runner — executes guardrails for each pipeline phase.
|
||||
*
|
||||
* Returns the first blocking result (severity='block') or collects
|
||||
* warnings. The Mission Runner calls these at the appropriate points
|
||||
* in the pipeline and decides how to handle failures.
|
||||
*/
|
||||
|
||||
import type { AiPlanInput, AiPlanOutput, PlannedStep } from '../planner/types';
|
||||
import type {
|
||||
Guardrail,
|
||||
GuardrailResult,
|
||||
PrePlanGuardrail,
|
||||
PostPlanGuardrail,
|
||||
PreExecuteGuardrail,
|
||||
} from './types';
|
||||
import { BUILTIN_GUARDRAILS } from './builtin';
|
||||
|
||||
/**
 * Type guard: true when guardrail `g` is registered for `phase`.
 *
 * The narrowed type is derived from the `phase` literal itself, so the
 * predicate cannot claim a guardrail type that disagrees with the phase
 * being tested, and a misspelled phase name is rejected at compile time.
 *
 * @param phase - The phase tag to test for.
 * @param g - The guardrail to inspect.
 * @returns Whether `g.phase` equals `phase`.
 */
function isPhase<P extends Guardrail['phase']>(
  phase: P,
  g: Guardrail
): g is Extract<Guardrail, { readonly phase: P }> {
  return g.phase === phase;
}
|
||||
|
||||
// Partition the built-in guardrails by phase once, at module load, so the
// run* entry points only iterate the guardrails relevant to their phase.

/** Built-in guardrails evaluated before the planner is called. */
const prePlan = BUILTIN_GUARDRAILS.filter(
  (g): g is PrePlanGuardrail => isPhase('pre-plan', g)
);

/** Built-in guardrails evaluated against a freshly returned plan. */
const postPlan = BUILTIN_GUARDRAILS.filter(
  (g): g is PostPlanGuardrail => isPhase('post-plan', g)
);

/** Built-in guardrails evaluated before each individual step. */
const preExecute = BUILTIN_GUARDRAILS.filter(
  (g): g is PreExecuteGuardrail => isPhase('pre-execute', g)
);
|
||||
|
||||
/** Aggregate outcome of running every guardrail of one phase. */
export interface GuardrailCheckResult {
  /** `true` when no guardrail blocked; warnings alone do not clear this flag. */
  readonly passed: boolean;

  /** Reason of the first blocking failure; absent when nothing blocked. */
  readonly blockReason?: string;

  /** Names of the guardrails that failed, whether as a warning or a block. */
  readonly triggered: string[];
}
|
||||
|
||||
/**
 * Folds already-computed guardrail results into one check result.
 *
 * Results are walked in order. Each failing guardrail's name is recorded;
 * the walk stops at the first failure whose severity is 'block' or is not
 * set, and that failure's reason (or the guardrail name when no reason was
 * given) becomes the block reason. Failures with severity 'warn' are
 * recorded but do not stop the walk.
 *
 * @param results - Guardrail names paired with the result of their check.
 * @returns The aggregate outcome for the phase.
 */
function run(results: Array<{ name: string; result: GuardrailResult }>): GuardrailCheckResult {
  const triggered: string[] = [];

  for (const entry of results) {
    const outcome = entry.result;
    if (outcome.ok) continue;

    triggered.push(entry.name);

    const blocks = outcome.severity === undefined || outcome.severity === 'block';
    if (blocks) {
      return { passed: false, blockReason: outcome.reason ?? entry.name, triggered };
    }
  }

  return { passed: true, triggered };
}
|
||||
|
||||
/**
 * Evaluates every pre-plan guardrail against the planner input.
 * Intended to be called before the Planner LLM call.
 *
 * @param input - The input about to be handed to the planner.
 * @returns The aggregate outcome of the pre-plan phase.
 */
export function runPrePlanGuardrails(input: AiPlanInput): GuardrailCheckResult {
  const outcomes = prePlan.map((guardrail) => {
    const { name } = guardrail;
    return { name, result: guardrail.check(input) };
  });
  return run(outcomes);
}
|
||||
|
||||
/**
 * Evaluates every post-plan guardrail against a planner input/output pair.
 * Intended to be called after the Planner response has been parsed.
 *
 * @param input - The input that was given to the planner.
 * @param output - The plan the planner returned.
 * @returns The aggregate outcome of the post-plan phase.
 */
export function runPostPlanGuardrails(
  input: AiPlanInput,
  output: AiPlanOutput
): GuardrailCheckResult {
  const outcomes = postPlan.map((guardrail) => {
    const { name } = guardrail;
    return { name, result: guardrail.check(input, output) };
  });
  return run(outcomes);
}
|
||||
|
||||
/**
 * Evaluates every pre-execute guardrail against a single planned step.
 * Intended to be called before each tool execution.
 *
 * @param step - The step that is about to be executed.
 * @returns The aggregate outcome of the pre-execute phase for this step.
 */
export function runPreExecuteGuardrails(step: PlannedStep): GuardrailCheckResult {
  const outcomes = preExecute.map((guardrail) => {
    const { name } = guardrail;
    return { name, result: guardrail.check(step) };
  });
  return run(outcomes);
}
|
||||
57
packages/shared-ai/src/guardrails/types.ts
Normal file
57
packages/shared-ai/src/guardrails/types.ts
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
/**
|
||||
* Guardrail types — pre/post-execution checks for the AI Mission Runner.
|
||||
*
|
||||
* Guardrails run alongside the planning and execution pipeline to validate
|
||||
* inputs, outputs, and tool calls. They can:
|
||||
* - Block a planner call (pre-plan: input too sensitive, budget exceeded)
|
||||
* - Reject a plan (post-plan: too many steps, unknown patterns)
|
||||
* - Block a tool call (pre-execute: destructive op, rate limit)
|
||||
* - Flag a result (post-execute: suspicious output)
|
||||
*
|
||||
* Guardrails are synchronous checks, not AI calls. They run fast and never
|
||||
* hit the network. The Runner calls them inline and either proceeds or
|
||||
* aborts based on the result.
|
||||
*/
|
||||
|
||||
import type { AiPlanInput, AiPlanOutput, PlannedStep } from '../planner/types';
|
||||
|
||||
/** Pipeline checkpoints a guardrail can be attached to. */
export type GuardrailPhase =
  | 'pre-plan'
  | 'post-plan'
  | 'pre-execute'
  | 'post-execute';
|
||||
|
||||
/** Outcome of a single guardrail check. */
export interface GuardrailResult {
  /** `true` when the check passed. */
  readonly ok: boolean;

  /** Human-readable explanation of a failure. Shown in the iteration error. */
  readonly reason?: string;

  /**
   * How a failure is handled: 'warn' is recorded without aborting, 'block'
   * aborts. The guardrail runner treats a failure with no severity as 'block'.
   */
  readonly severity?: 'warn' | 'block';
}
|
||||
|
||||
/** Guardrail that inspects the planner input before the planner is called. */
export interface PrePlanGuardrail {
  /** Identifier reported when this guardrail triggers. */
  readonly name: string;

  /** Discriminant tag for the pre-plan phase. */
  readonly phase: 'pre-plan';

  /** Synchronously validates the planner input. */
  check(input: AiPlanInput): GuardrailResult;
}
|
||||
|
||||
/** Guardrail that inspects a plan once the planner has returned it. */
export interface PostPlanGuardrail {
  /** Identifier reported when this guardrail triggers. */
  readonly name: string;

  /** Discriminant tag for the post-plan phase. */
  readonly phase: 'post-plan';

  /** Synchronously validates the plan, with the originating input for context. */
  check(input: AiPlanInput, output: AiPlanOutput): GuardrailResult;
}
|
||||
|
||||
/** Guardrail that inspects a single planned step before it is executed. */
export interface PreExecuteGuardrail {
  /** Identifier reported when this guardrail triggers. */
  readonly name: string;

  /** Discriminant tag for the pre-execute phase. */
  readonly phase: 'pre-execute';

  /** Synchronously validates the step that is about to run. */
  check(step: PlannedStep): GuardrailResult;
}
|
||||
|
||||
/** Guardrail that inspects the result of a step after it has been executed. */
export interface PostExecuteGuardrail {
  /** Identifier reported when this guardrail triggers. */
  readonly name: string;

  /** Discriminant tag for the post-execute phase. */
  readonly phase: 'post-execute';

  /** Synchronously validates the executed step together with its result. */
  check(step: PlannedStep, result: { success: boolean; data?: unknown }): GuardrailResult;
}
|
||||
|
||||
/** Any guardrail, discriminated by its `phase` tag. */
export type Guardrail =
  | PrePlanGuardrail
  | PostPlanGuardrail
  | PreExecuteGuardrail
  | PostExecuteGuardrail;
|
||||
|
|
@ -78,6 +78,19 @@ export {
|
|||
export type { ToolSchema } from './tools';
|
||||
export { AI_TOOL_CATALOG, AI_TOOL_CATALOG_BY_NAME } from './tools';
|
||||
|
||||
export type {
|
||||
Guardrail,
|
||||
GuardrailPhase,
|
||||
GuardrailResult,
|
||||
GuardrailCheckResult,
|
||||
} from './guardrails';
|
||||
export {
|
||||
BUILTIN_GUARDRAILS,
|
||||
runPrePlanGuardrails,
|
||||
runPostPlanGuardrails,
|
||||
runPreExecuteGuardrails,
|
||||
} from './guardrails';
|
||||
|
||||
export type {
|
||||
Agent,
|
||||
AgentState,
|
||||
|
|
|
|||
|
|
@ -32,11 +32,12 @@ describe('AI_TOOL_CATALOG', () => {
|
|||
}
|
||||
});
|
||||
|
||||
it('has the expected propose and auto tool counts', () => {
|
||||
it('has both propose and auto tools', () => {
|
||||
const propose = AI_TOOL_CATALOG.filter((t) => t.defaultPolicy === 'propose');
|
||||
const auto = AI_TOOL_CATALOG.filter((t) => t.defaultPolicy === 'auto');
|
||||
expect(propose.length).toBe(17);
|
||||
expect(auto.length).toBe(12);
|
||||
expect(propose.length).toBeGreaterThan(0);
|
||||
expect(auto.length).toBeGreaterThan(0);
|
||||
expect(propose.length + auto.length).toBe(AI_TOOL_CATALOG.length);
|
||||
});
|
||||
|
||||
it('by-name map has same size as catalog', () => {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue