mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 22:41:09 +02:00
feat(mana-mcp): Prometheus metrics for policy gate + tool invocations
Replaces the stub /metrics endpoint with a real prom-client registry
(mana_mcp_ prefix, {service="mana-mcp"} default label). Default
process metrics come along for free.
Policy-gate telemetry is the whole point — without it we can't soak
POLICY_MODE=log-only safely or decide when to flip to enforce. New
counter mana_mcp_policy_decisions_total{decision, reason, mode} buckets
every evaluatePolicy() call:
decision ∈ {allow, deny, flagged}
reason ∈ {admin-scope-not-invokable, destructive-not-allowed,
rate-limit-exceeded, injection-marker, clean, unknown}
mode ∈ {log-only, enforce}
So the rate of "would have been denied" during soak is visible directly
as policy_decisions_total{decision="deny", mode="log-only"}.
Also:
- mana_mcp_tool_invocations_total{tool, outcome} — success |
handler-error | input-invalid. Policy denies are NOT counted here
(they're in policy_decisions_total above); this counter only counts
calls that actually reached the handler or tripped zod validation.
- mana_mcp_tool_duration_seconds histogram per tool/outcome.
Dep: prom-client ^15.1.3 (same version mana-ai pins).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f07eae3c01
commit
c94ab01c69
4 changed files with 89 additions and 3 deletions
|
|
@ -15,6 +15,7 @@
|
||||||
"@modelcontextprotocol/sdk": "^1.29.0",
|
"@modelcontextprotocol/sdk": "^1.29.0",
|
||||||
"hono": "^4.7.0",
|
"hono": "^4.7.0",
|
||||||
"jose": "^6.1.2",
|
"jose": "^6.1.2",
|
||||||
|
"prom-client": "^15.1.3",
|
||||||
"zod": "^3.25.76"
|
"zod": "^3.25.76"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@ import { registerAllModules } from '@mana/tool-registry';
|
||||||
import { loadConfig } from './config.ts';
|
import { loadConfig } from './config.ts';
|
||||||
import { authenticateRequest, UnauthorizedError } from './auth.ts';
|
import { authenticateRequest, UnauthorizedError } from './auth.ts';
|
||||||
import { handleMcpRequest } from './transport.ts';
|
import { handleMcpRequest } from './transport.ts';
|
||||||
|
import { register as metricsRegistry } from './metrics.ts';
|
||||||
|
|
||||||
// ─── Bootstrap ────────────────────────────────────────────────────
|
// ─── Bootstrap ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
@ -42,9 +43,10 @@ app.get('/health', (c) =>
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
||||||
app.get('/metrics', (c) =>
|
app.get('/metrics', async (c) => {
|
||||||
c.text('# mana-mcp metrics stub — populated alongside Persona-Runner observability\n')
|
const body = await metricsRegistry.metrics();
|
||||||
);
|
return c.text(body, 200, { 'content-type': metricsRegistry.contentType });
|
||||||
|
});
|
||||||
|
|
||||||
// ─── MCP endpoint ─────────────────────────────────────────────────
|
// ─── MCP endpoint ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@ import {
|
||||||
import type { VerifiedUser } from './auth.ts';
|
import type { VerifiedUser } from './auth.ts';
|
||||||
import type { Config } from './config.ts';
|
import type { Config } from './config.ts';
|
||||||
import { appendInvocation, getRecentInvocations } from './invocation-log.ts';
|
import { appendInvocation, getRecentInvocations } from './invocation-log.ts';
|
||||||
|
import { policyDecisionsTotal, toolDuration, toolInvocationsTotal } from './metrics.ts';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Shared across all sessions — the client caches MKs per userId with a
|
* Shared across all sessions — the client caches MKs per userId with a
|
||||||
|
|
@ -109,6 +110,7 @@ export function createMcpServerForUser(user: VerifiedUser, config: Config): McpS
|
||||||
err instanceof z.ZodError
|
err instanceof z.ZodError
|
||||||
? err.issues.map((i) => `${i.path.join('.')}: ${i.message}`).join('; ')
|
? err.issues.map((i) => `${i.path.join('.')}: ${i.message}`).join('; ')
|
||||||
: String(err);
|
: String(err);
|
||||||
|
toolInvocationsTotal.inc({ tool: spec.name, outcome: 'input-invalid' });
|
||||||
return {
|
return {
|
||||||
isError: true,
|
isError: true,
|
||||||
content: [{ type: 'text' as const, text: `Invalid input for ${spec.name}: ${msg}` }],
|
content: [{ type: 'text' as const, text: `Invalid input for ${spec.name}: ${msg}` }],
|
||||||
|
|
@ -129,6 +131,11 @@ export function createMcpServerForUser(user: VerifiedUser, config: Config): McpS
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!decision.allow) {
|
if (!decision.allow) {
|
||||||
|
policyDecisionsTotal.inc({
|
||||||
|
decision: 'deny',
|
||||||
|
reason: decision.reason ?? 'unknown',
|
||||||
|
mode: config.policyMode,
|
||||||
|
});
|
||||||
const label = config.policyMode === 'enforce' ? 'DENY' : 'WOULD-DENY';
|
const label = config.policyMode === 'enforce' ? 'DENY' : 'WOULD-DENY';
|
||||||
console.warn(
|
console.warn(
|
||||||
`[mana-mcp policy] ${label} tool=${spec.name} user=${user.userId.slice(0, 8)} reason=${decision.reason}`
|
`[mana-mcp policy] ${label} tool=${spec.name} user=${user.userId.slice(0, 8)} reason=${decision.reason}`
|
||||||
|
|
@ -143,7 +150,18 @@ export function createMcpServerForUser(user: VerifiedUser, config: Config): McpS
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
} else if (decision.reminder) {
|
} else if (decision.reminder) {
|
||||||
|
policyDecisionsTotal.inc({
|
||||||
|
decision: 'flagged',
|
||||||
|
reason: 'injection-marker',
|
||||||
|
mode: config.policyMode,
|
||||||
|
});
|
||||||
console.info(`[mana-mcp policy] FLAG tool=${spec.name} user=${user.userId.slice(0, 8)}`);
|
console.info(`[mana-mcp policy] FLAG tool=${spec.name} user=${user.userId.slice(0, 8)}`);
|
||||||
|
} else {
|
||||||
|
policyDecisionsTotal.inc({
|
||||||
|
decision: 'allow',
|
||||||
|
reason: 'clean',
|
||||||
|
mode: config.policyMode,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -151,13 +169,18 @@ export function createMcpServerForUser(user: VerifiedUser, config: Config): McpS
|
||||||
// handler's duration doesn't open a rate-limit gap.
|
// handler's duration doesn't open a rate-limit gap.
|
||||||
appendInvocation(user.userId, spec.name);
|
appendInvocation(user.userId, spec.name);
|
||||||
|
|
||||||
|
const endTimer = toolDuration.startTimer({ tool: spec.name, outcome: 'success' });
|
||||||
try {
|
try {
|
||||||
const result = await spec.handler(parsed, ctxFor(spec.name));
|
const result = await spec.handler(parsed, ctxFor(spec.name));
|
||||||
|
toolInvocationsTotal.inc({ tool: spec.name, outcome: 'success' });
|
||||||
|
endTimer({ tool: spec.name, outcome: 'success' });
|
||||||
return {
|
return {
|
||||||
content: [{ type: 'text' as const, text: JSON.stringify(result, null, 2) }],
|
content: [{ type: 'text' as const, text: JSON.stringify(result, null, 2) }],
|
||||||
};
|
};
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
const msg = err instanceof Error ? err.message : String(err);
|
const msg = err instanceof Error ? err.message : String(err);
|
||||||
|
toolInvocationsTotal.inc({ tool: spec.name, outcome: 'handler-error' });
|
||||||
|
endTimer({ tool: spec.name, outcome: 'handler-error' });
|
||||||
return {
|
return {
|
||||||
isError: true,
|
isError: true,
|
||||||
content: [{ type: 'text' as const, text: `Tool ${spec.name} failed: ${msg}` }],
|
content: [{ type: 'text' as const, text: `Tool ${spec.name} failed: ${msg}` }],
|
||||||
|
|
|
||||||
60
services/mana-mcp/src/metrics.ts
Normal file
60
services/mana-mcp/src/metrics.ts
Normal file
|
|
@ -0,0 +1,60 @@
|
||||||
|
/**
|
||||||
|
* Prometheus metrics — exported on GET /metrics.
|
||||||
|
*
|
||||||
|
* Mirrors the shape of `services/mana-ai/src/metrics.ts` so Grafana and
|
||||||
|
* status.mana.how recognise this service without special-casing. Metric
|
||||||
|
* names use the `mana_mcp_*` prefix; labels stay low-cardinality on
|
||||||
|
* purpose (tool name has the most distinct values but comes from a fixed registry, so
|
||||||
|
* it's acceptable — we have ~20 tools today).
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { Counter, Histogram, Registry, collectDefaultMetrics } from 'prom-client';
|
||||||
|
|
||||||
|
export const register = new Registry();
|
||||||
|
register.setDefaultLabels({ service: 'mana-mcp' });
|
||||||
|
collectDefaultMetrics({ register, prefix: 'mana_mcp_' });
|
||||||
|
|
||||||
|
// ── Policy gate ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
|
||||||
|
* One sample per `evaluatePolicy()` call.
|
||||||
|
*
|
||||||
|
* Labels:
|
||||||
|
* - `decision`: `allow` | `deny` | `flagged` (flagged = allow with a
|
||||||
|
* reminder, e.g. freetext injection marker hit)
|
||||||
|
* - `reason`: `admin-scope-not-invokable` | `destructive-not-allowed`
|
||||||
|
* | `rate-limit-exceeded` | `injection-marker`
|
||||||
|
* | `clean` (no reason applied; for dashboards) | `unknown` (deny with no reason set)
|
||||||
|
* - `mode`: `log-only` | `enforce` — lets us diff how many
|
||||||
|
* decisions WOULD block vs. actually blocked during soak
|
||||||
|
*/
|
||||||
|
export const policyDecisionsTotal = new Counter({
|
||||||
|
name: 'mana_mcp_policy_decisions_total',
|
||||||
|
help: 'Tool-policy gate decisions, bucketed by outcome and reason.',
|
||||||
|
labelNames: ['decision', 'reason', 'mode'] as const,
|
||||||
|
registers: [register],
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── Tool invocations ─────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Every call that reaches the handler or fails input validation lands here. `outcome`
|
||||||
|
* is `success` | `handler-error` | `input-invalid` so dashboards can
|
||||||
|
* differentiate "tool ran but failed" from "LLM sent malformed args".
|
||||||
|
* Policy-denied calls are NOT counted here — they never reach the
|
||||||
|
* handler — and are visible under `policyDecisionsTotal{decision='deny'}`.
|
||||||
|
*/
|
||||||
|
export const toolInvocationsTotal = new Counter({
|
||||||
|
name: 'mana_mcp_tool_invocations_total',
|
||||||
|
help: 'Tool handler invocations (after policy gate).',
|
||||||
|
labelNames: ['tool', 'outcome'] as const,
|
||||||
|
registers: [register],
|
||||||
|
});
|
||||||
|
|
||||||
|
export const toolDuration = new Histogram({
|
||||||
|
name: 'mana_mcp_tool_duration_seconds',
|
||||||
|
help: 'Handler wall-clock latency per tool.',
|
||||||
|
labelNames: ['tool', 'outcome'] as const,
|
||||||
|
buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
|
||||||
|
registers: [register],
|
||||||
|
});
|
||||||
Loading…
Add table
Add a link
Reference in a new issue