fix(personas): exact tool_use_id pairing + CI drift audit

Two loose ends from M3/M4: 1. Tool_use_id-based error attribution in the persona-runner ----------------------------------------------------------- The previous collectActionsFromMessage() flipped the *most recent* ActionRow to 'error' when a tool_result carried is_error:true. That was fine as long as Claude invoked tools strictly in sequence, but when the planner pipelines multiple tools in one turn, a later tool_result carries an earlier tool_use_id — the last-action fallback mis- attributes the error. runMainTurn() now keeps a tool_use_id → action-index Map for the duration of the tick. On tool_use we stash block.id, on tool_result we look up the exact ActionRow via tool_use_id and flip that one. The "flip last" path survives as a pure fallback if a future SDK ever ships a block without an id. 2. New audit:encrypted-tools script ----------------------------------- scripts/audit-encrypted-tools.ts — loads registerAllModules() and apps/mana/…/crypto/registry.ts, diffs every ToolSpec.encryptedFields against the authoritative web-app ENCRYPTION_REGISTRY. Catches three classes of drift: - missing-table : tool declares a table the web-app doesn't encrypt - field-drift : both agree a table is encrypted but the field lists differ (half-encryption in the wire is silent death) - disabled : web-app has enabled:false while the tool still encrypts — advisory warning, not a fail Negative-tested by injecting a deliberate drift on todo.create + todo.list (shortened ENCRYPTED_FIELDS to ['title']); the auditor flagged both tools with full field diffs, restore returned to green. Wired into `pnpm run validate:all` so the contract survives future edits on either side. Fills the M4 audit gap noted in project_mana_mcp_personas.md. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 19:01:08 +02:00 · 2026-04-23 15:34:52 +02:00 · 2026-04-23 15:34:52 +02:00 · eb8fac23ec
commit eb8fac23ec
parent 703ef69ca9
3 changed files with 150 additions and 12 deletions
--- a/package.json
+++ b/package.json
@ -24,9 +24,10 @@
 		"validate:theme-variables": "node scripts/validate-theme-variables.mjs",
 		"validate:theme-utilities": "node scripts/validate-theme-utilities.mjs",
 		"validate:theme-parity": "node scripts/validate-theme-parity.mjs",
-		"validate:all": "pnpm run validate:turbo && pnpm run validate:pg-schema && pnpm run validate:theme-variables && pnpm run validate:theme-utilities && pnpm run validate:theme-parity && pnpm run check:crypto",
+		"validate:all": "pnpm run validate:turbo && pnpm run validate:pg-schema && pnpm run validate:theme-variables && pnpm run validate:theme-utilities && pnpm run validate:theme-parity && pnpm run check:crypto && pnpm run audit:encrypted-tools",
 		"check:crypto": "node scripts/audit-crypto-registry.mjs",
 		"check:crypto:seed": "node scripts/audit-crypto-registry.mjs --seed",
 		"audit:encrypted-tools": "bun run scripts/audit-encrypted-tools.ts",
 		"audit:deps": "node scripts/audit-workspace-deps.mjs",
 		"audit:modules": "node scripts/audit-modules.mjs",
 		"audit:coupling": "node scripts/audit-module-coupling.mjs",
--- a/scripts/audit-encrypted-tools.ts
+++ b/scripts/audit-encrypted-tools.ts
@ -0,0 +1,124 @@
 #!/usr/bin/env bun
 /**
 * Audit the encryption-fields contract between the web-app
 * `ENCRYPTION_REGISTRY` (Dexie hook) and each `ToolSpec.encryptedFields`
 * declaration in `@mana/tool-registry`.
 *
 * Why: when a server-side tool encrypts fewer fields than the web app,
 * records written by the persona-runner look "encrypted enough" on the
 * wire but arrive plaintext in the columns the web app expected to
 * encrypt — silent data leak into sync rows. When the tool encrypts
 * MORE than the web app, the web app can't decrypt its own reads.
 *
 * Invariants:
 *   1. For every ToolSpec with `encryptedFields: {table, fields}`, the
 *      web-app registry has an entry for `table`.
 *   2. The set of fields declared on the spec matches exactly — same
 *      names, same cardinality. Extra / missing fields on either side
 *      fail the audit.
 *   3. (Advisory) If the web-app registry marks a table as
 *      `enabled: false`, encrypted tool handlers are pointless — flag
 *      but don't fail (opt-in rollout may intentionally stage these).
 *
 * Usage:
 *   bun run scripts/audit-encrypted-tools.ts            # exit 1 on drift
 *   pnpm run audit:encrypted-tools                      # same
 *
 * Wired into `pnpm run validate:all`.
 */
 import {
 	registerAllModules,
 	getRegistry,
 	__resetRegistryForTests,
 } from '../packages/mana-tool-registry/src/index.ts';
 import { ENCRYPTION_REGISTRY } from '../apps/mana/apps/web/src/lib/data/crypto/registry.ts';
 interface Violation {
 	tool: string;
 	kind: 'missing-table' | 'field-drift' | 'disabled';
 	detail: string;
 }
 function auditEncryptedTools(): Violation[] {
 	__resetRegistryForTests();
 	registerAllModules();
 	const violations: Violation[] = [];
 	for (const tool of getRegistry()) {
 		if (!tool.encryptedFields) continue;
 		const { table, fields } = tool.encryptedFields;
 		const webAppEntry = ENCRYPTION_REGISTRY[table];
 		if (!webAppEntry) {
 			violations.push({
 				tool: tool.name,
 				kind: 'missing-table',
 				detail: `table "${table}" has no entry in apps/mana/.../crypto/registry.ts`,
 			});
 			continue;
 		}
 		const actual = [...fields].sort().join(',');
 		const expected = [...webAppEntry.fields].sort().join(',');
 		if (actual !== expected) {
 			violations.push({
 				tool: tool.name,
 				kind: 'field-drift',
 				detail:
 					`tool encrypts [${actual}], web-app encrypts [${expected}] — ` +
 					`both sides must agree or the record is half-encrypted`,
 			});
 			continue;
 		}
 		if (webAppEntry.enabled === false) {
 			violations.push({
 				tool: tool.name,
 				kind: 'disabled',
 				detail:
 					`web-app registry has enabled:false for "${table}" — ` +
 					`tool will encrypt but app will write/read plaintext, wire records will mix`,
 			});
 		}
 	}
 	return violations;
 }
 const violations = auditEncryptedTools();
 if (violations.length === 0) {
 	const total = getRegistry().filter((t) => t.encryptedFields).length;
 	console.log(`✓ encrypted-tools audit: ${total} tool(s) match the web-app registry.`);
 	process.exit(0);
 }
 // Split by kind for readability
 const fatals = violations.filter((v) => v.kind !== 'disabled');
 const warnings = violations.filter((v) => v.kind === 'disabled');
 if (warnings.length > 0) {
 	console.warn(`⚠ ${warnings.length} advisory warning(s):`);
 	for (const w of warnings) console.warn(`   · ${w.tool}: ${w.detail}`);
 	console.warn('');
 }
 if (fatals.length === 0) {
 	console.log('✓ no hard violations.');
 	process.exit(0);
 }
 console.error(`✗ encrypted-tools audit found ${fatals.length} violation(s):`);
 for (const v of fatals) {
 	console.error(`   · ${v.tool}  [${v.kind}]`);
 	console.error(`       ${v.detail}`);
 }
 console.error('');
 console.error(
 	'Fix: either update the ToolSpec.encryptedFields or the web-app registry — both must agree.'
 );
 process.exit(1);
--- a/services/mana-persona-runner/src/runner/claude-session.ts
+++ b/services/mana-persona-runner/src/runner/claude-session.ts
@ -96,8 +96,16 @@ export async function runMainTurn(input: SessionInput): Promise<SessionResult> {
 		},
 	});
 	// Per-turn Map so tool_result blocks can flip the *right* ActionRow
 	// (the one whose tool_use_id they reference), not "the most recent".
 	// Anthropic's stream interleaves tool_use and tool_result blocks
 	// deterministically, but when Claude pipelines multiple tools in one
 	// assistant turn, a later tool_result carries an earlier tool_use_id
 	// — the last-action fallback gets that wrong.
 	const toolUseIndex = new Map<string, number>();
 	for await (const msg of q as AsyncIterable<SDKMessage>) {
-		collectActionsFromMessage(msg, input.tickId, actions, modulesUsed);
+		collectActionsFromMessage(msg, input.tickId, actions, modulesUsed, toolUseIndex);
 	}
 	return { actions, feedback: [], modulesUsed };
@ -143,7 +151,8 @@ function collectActionsFromMessage(
 	msg: SDKMessage,
 	tickId: string,
 	actions: ActionRow[],
-	modulesUsed: Set<string>
+	modulesUsed: Set<string>,
 	toolUseIndex: Map<string, number>
 ): void {
 	// SDKMessage is a big union; we only care about assistant messages
 	// that contain tool_use blocks, and user messages that contain
@ -168,19 +177,23 @@ function collectActionsFromMessage(
 				inputHash: hashInput(block.input),
 				result: 'ok', // provisional; rewritten on matching tool_result if it was an error
 			});
 			// Record the Anthropic-assigned id so a later tool_result can
 			// find its row even if other tools finished in between.
 			const toolUseId = typeof block.id === 'string' ? block.id : null;
 			if (toolUseId) toolUseIndex.set(toolUseId, actions.length - 1);
 		} else if (blockType === 'tool_result') {
 			const isError = block.is_error === true;
 			if (!isError) continue;
 			// Flip the most recent action that matches this tool_use_id.
 			const toolUseId = typeof block.tool_use_id === 'string' ? block.tool_use_id : null;
-			if (!toolUseId) continue;
+			// Exact pairing via tool_use_id → fall back to last action only
-			// We didn't store tool_use_id (would require pairing state); cheap
+			// if the id is missing (shouldn't happen with the current SDK,
-			// fallback: mark the last action as error. Good enough for the
+			// but the fallback keeps the pipeline honest if Anthropic ever
-			// audit dashboard; precise attribution lands in a later iteration.
+			// ships a block without an id).
-			const last = actions[actions.length - 1];
+			const idx = toolUseId !== null ? toolUseIndex.get(toolUseId) : undefined;
-			if (last) {
+			const target = idx !== undefined ? actions[idx] : actions[actions.length - 1];
-				last.result = 'error';
+			if (target) {
-				last.errorMessage = stringifyBlock(block);
+				target.result = 'error';
 				target.errorMessage = stringifyBlock(block);
 			}
 		}
 	}