From c94ab01c69b41adfe94ab0b440869e598e359d3d Mon Sep 17 00:00:00 2001 From: Till JS Date: Thu, 23 Apr 2026 14:23:08 +0200 Subject: [PATCH] feat(mana-mcp): Prometheus metrics for policy gate + tool invocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the stub /metrics endpoint with a real prom-client registry (mana_mcp_ prefix, {service="mana-mcp"} default label). Default process metrics come along for free. Policy-gate telemetry is the whole point — without it we can't soak POLICY_MODE=log-only safely or decide when to flip to enforce. New counter mana_mcp_policy_decisions_total{decision, reason, mode} buckets every evaluatePolicy() call: decision ∈ {allow, deny, flagged} reason ∈ {admin-scope-not-invokable, destructive-not-allowed, rate-limit-exceeded, injection-marker, clean, unknown} mode ∈ {log-only, enforce} So the rate of "would have been denied" during soak is visible directly as policy_decisions_total{decision="deny", mode="log-only"}. Also: - mana_mcp_tool_invocations_total{tool, outcome} — success | handler-error | input-invalid. Policy denies are NOT counted here (they're in policy_decisions_total above); this counter only counts calls that actually reached the handler or tripped zod validation. - mana_mcp_tool_duration_seconds histogram per tool/outcome. Dep: prom-client ^15.1.3 (same version mana-ai pins). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- services/mana-mcp/package.json | 1 + services/mana-mcp/src/index.ts | 8 ++-- services/mana-mcp/src/mcp-adapter.ts | 23 +++++++++++ services/mana-mcp/src/metrics.ts | 60 ++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 services/mana-mcp/src/metrics.ts diff --git a/services/mana-mcp/package.json b/services/mana-mcp/package.json index bc7ba2d10..1c825da9d 100644 --- a/services/mana-mcp/package.json +++ b/services/mana-mcp/package.json @@ -15,6 +15,7 @@ "@modelcontextprotocol/sdk": "^1.29.0", "hono": "^4.7.0", "jose": "^6.1.2", + "prom-client": "^15.1.3", "zod": "^3.25.76" }, "devDependencies": { diff --git a/services/mana-mcp/src/index.ts b/services/mana-mcp/src/index.ts index dbba774cb..0183daa01 100644 --- a/services/mana-mcp/src/index.ts +++ b/services/mana-mcp/src/index.ts @@ -14,6 +14,7 @@ import { registerAllModules } from '@mana/tool-registry'; import { loadConfig } from './config.ts'; import { authenticateRequest, UnauthorizedError } from './auth.ts'; import { handleMcpRequest } from './transport.ts'; +import { register as metricsRegistry } from './metrics.ts'; // ─── Bootstrap ──────────────────────────────────────────────────── @@ -42,9 +43,10 @@ app.get('/health', (c) => }) ); -app.get('/metrics', (c) => - c.text('# mana-mcp metrics stub — populated alongside Persona-Runner observability\n') -); +app.get('/metrics', async (c) => { + const body = await metricsRegistry.metrics(); + return c.text(body, 200, { 'content-type': metricsRegistry.contentType }); +}); // ─── MCP endpoint ───────────────────────────────────────────────── diff --git a/services/mana-mcp/src/mcp-adapter.ts b/services/mana-mcp/src/mcp-adapter.ts index e8a1b7be5..204b867ce 100644 --- a/services/mana-mcp/src/mcp-adapter.ts +++ b/services/mana-mcp/src/mcp-adapter.ts @@ -25,6 +25,7 @@ import { import type { VerifiedUser } from './auth.ts'; import type { Config } from './config.ts'; import { 
appendInvocation, getRecentInvocations } from './invocation-log.ts'; +import { policyDecisionsTotal, toolDuration, toolInvocationsTotal } from './metrics.ts'; /** * Shared across all sessions — the client caches MKs per userId with a @@ -109,6 +110,7 @@ export function createMcpServerForUser(user: VerifiedUser, config: Config): McpS err instanceof z.ZodError ? err.issues.map((i) => `${i.path.join('.')}: ${i.message}`).join('; ') : String(err); + toolInvocationsTotal.inc({ tool: spec.name, outcome: 'input-invalid' }); return { isError: true, content: [{ type: 'text' as const, text: `Invalid input for ${spec.name}: ${msg}` }], @@ -129,6 +131,11 @@ export function createMcpServerForUser(user: VerifiedUser, config: Config): McpS }); if (!decision.allow) { + policyDecisionsTotal.inc({ + decision: 'deny', + reason: decision.reason ?? 'unknown', + mode: config.policyMode, + }); const label = config.policyMode === 'enforce' ? 'DENY' : 'WOULD-DENY'; console.warn( `[mana-mcp policy] ${label} tool=${spec.name} user=${user.userId.slice(0, 8)} reason=${decision.reason}` @@ -143,7 +150,18 @@ export function createMcpServerForUser(user: VerifiedUser, config: Config): McpS }; } } else if (decision.reminder) { + policyDecisionsTotal.inc({ + decision: 'flagged', + reason: 'injection-marker', + mode: config.policyMode, + }); console.info(`[mana-mcp policy] FLAG tool=${spec.name} user=${user.userId.slice(0, 8)}`); + } else { + policyDecisionsTotal.inc({ + decision: 'allow', + reason: 'clean', + mode: config.policyMode, + }); } } @@ -151,13 +169,18 @@ export function createMcpServerForUser(user: VerifiedUser, config: Config): McpS // handler's duration doesn't open a rate-limit gap. 
appendInvocation(user.userId, spec.name); + const endTimer = toolDuration.startTimer({ tool: spec.name, outcome: 'success' }); try { const result = await spec.handler(parsed, ctxFor(spec.name)); + toolInvocationsTotal.inc({ tool: spec.name, outcome: 'success' }); + endTimer({ tool: spec.name, outcome: 'success' }); return { content: [{ type: 'text' as const, text: JSON.stringify(result, null, 2) }], }; } catch (err) { const msg = err instanceof Error ? err.message : String(err); + toolInvocationsTotal.inc({ tool: spec.name, outcome: 'handler-error' }); + endTimer({ tool: spec.name, outcome: 'handler-error' }); return { isError: true, content: [{ type: 'text' as const, text: `Tool ${spec.name} failed: ${msg}` }], diff --git a/services/mana-mcp/src/metrics.ts b/services/mana-mcp/src/metrics.ts new file mode 100644 index 000000000..02101560f --- /dev/null +++ b/services/mana-mcp/src/metrics.ts @@ -0,0 +1,60 @@ +/** + * Prometheus metrics — exported on GET /metrics. + * + * Mirrors the shape of `services/mana-ai/src/metrics.ts` so Grafana and + * status.mana.how recognise this service without special-casing. Metric + * names use the `mana_mcp_*` prefix; labels stay low-cardinality on + * purpose (tool name is high-cardinality but still a fixed registry, so + * it's acceptable — we have ~20 tools today). + */ + +import { Counter, Histogram, Registry, collectDefaultMetrics } from 'prom-client'; + +export const register = new Registry(); +register.setDefaultLabels({ service: 'mana-mcp' }); +collectDefaultMetrics({ register, prefix: 'mana_mcp_' }); + +// ── Policy gate ────────────────────────────────────────────── + +/** + * One sample per `evaluatePolicy()` call. + * + * Labels: + * - `decision`: `allow` | `deny` | `flagged` (flagged = allow with a + * reminder, e.g. 
freetext injection marker hit) + * - `reason`: `admin-scope-not-invokable` | `destructive-not-allowed` + * | `rate-limit-exceeded` | `injection-marker` + * | `clean` (no reason applied; for dashboards) | `unknown` (deny without a reason) + * - `mode`: `log-only` | `enforce` — lets us diff how many + * decisions WOULD block vs. actually blocked during soak + */ +export const policyDecisionsTotal = new Counter({ + name: 'mana_mcp_policy_decisions_total', + help: 'Tool-policy gate decisions, bucketed by outcome and reason.', + labelNames: ['decision', 'reason', 'mode'] as const, + registers: [register], +}); + +// ── Tool invocations ───────────────────────────────────────── + +/** + * Every tool call that reaches its handler or fails input validation lands here. `outcome` + * is `success` | `handler-error` | `input-invalid` so dashboards can + * differentiate "tool ran but failed" from "LLM sent malformed args". + * Calls blocked by the policy gate are NOT counted here — they never reach the + * handler — and are visible under `policyDecisionsTotal{decision='deny'}`. + */ +export const toolInvocationsTotal = new Counter({ + name: 'mana_mcp_tool_invocations_total', + help: 'Tool handler invocations (after policy gate).', + labelNames: ['tool', 'outcome'] as const, + registers: [register], +}); + +export const toolDuration = new Histogram({ + name: 'mana_mcp_tool_duration_seconds', + help: 'Handler wall-clock latency per tool.', + labelNames: ['tool', 'outcome'] as const, + buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10], + registers: [register], +});