feat(mana-ai): Prometheus metrics for tool-calls, loop rounds, provider errors

Three new counters + one histogram fill the observability gap from
the function-calling migration:

- mana_ai_tool_calls_total{tool, policy, outcome} — one tick per
  tool_call the planner produced. `outcome` is `deferred` on the
  server (the stub onToolCall records calls for later client
  execution); the webapp runner will emit success/failure once it
  grows its own Prom surface (see the sketch after this list).
- mana_ai_planner_rounds (histogram, buckets 1..5) — distribution of
  rounds consumed per iteration. Runs close to the cap signal a
  planner struggling with the mission objective.
- mana_ai_provider_errors_total{provider, kind} — structured errors
  surfaced from mana-llm. Kind mirrors the ProviderError hierarchy
  added in commit 1 of the migration (blocked/truncated/auth/
  rate_limit/capability/unknown).
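
A sketch of how the future webapp runner could report real outcomes
against the same counter name and label schema; the counter instance
and helper below are assumptions, not part of this commit:

    import { Counter } from 'prom-client';

    // Hypothetical web-runner counter: same name and labels as the
    // server's. (Uses prom-client's default registry here; the real
    // runner would wire up its own.)
    const webToolCallsTotal = new Counter({
      name: 'mana_ai_tool_calls_total',
      help: 'Tool calls executed by the webapp runner.',
      labelNames: ['tool', 'policy', 'outcome'] as const,
    });

    // Called once a deferred tool_call has actually run client-side.
    export function recordToolRun(
      tool: string,
      policy: 'auto' | 'propose',
      success: boolean
    ): void {
      webToolCallsTotal.inc({
        tool,
        policy,
        outcome: success ? 'success' : 'failure',
      });
    }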

Plumbing:
- llm-client.ts parses mana-llm's `{detail: {kind, message}}` 4xx/5xx
  body shape (example shapes below) and re-throws it as a
  ProviderCallError carrying the kind.
- tick.ts observes metrics at the natural emission points — rounds
  + per-call counter after runPlannerLoop returns, provider_errors
  in the catch block.
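
For reference, the two body shapes the client distinguishes, with
invented example values (only the `{ detail: { kind, message } }`
contract itself comes from mana-llm):

    // Structured 4xx/5xx body, e.g. a rate-limited provider call
    // (field values invented for illustration):
    const structured = {
      detail: { kind: 'rate_limit', message: 'quota exhausted, retry later' },
    };

    // Unstructured fallback: plain-string detail, mapped to kind 'unknown'.
    const plain = { detail: 'internal error' };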

Grafana dashboards + status.mana.how already pick up the
collectDefaultMetrics prefix, so these metrics land in the existing
mana-ai panel without scraper changes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Till JS 2026-04-20 20:48:29 +02:00
parent 3ac32d9f3e
commit 1d3794f96c
3 changed files with 125 additions and 3 deletions

tick.ts

@@ -28,7 +28,7 @@ import { listDueMissions, type ServerMission } from '../db/missions-projection';
import { loadActiveAgents, refreshAgentSnapshots, type ServerAgent } from '../db/agents-projection';
import { appendServerIteration, planToIteration } from '../db/iteration-writer';
import { refreshSnapshots } from '../db/snapshot-refresh';
import { createServerLlmClient } from '../planner/llm-client';
import { createServerLlmClient, ProviderCallError } from '../planner/llm-client';
import { SERVER_TOOLS } from '../planner/tools';
import {
ticksTotal,
@@ -43,6 +43,9 @@ import {
grantSkipsTotal,
agentDecisionsTotal,
tokensUsedTotal,
toolCallsTotal,
plannerRoundsHistogram,
providerErrorsTotal,
} from '../metrics';
import { unwrapMissionGrant } from '../crypto/unwrap-grant';
import { NewsResearchClient } from '../planner/news-research-client';
@@ -333,6 +336,21 @@ async function planOneMission(
}),
});
// Observability: one counter tick per tool_call + one histogram
// sample for round consumption. `policy` is pulled off the
// catalog entry, so a later change that flips the Gemini default
// from auto to propose would show up in the labels without code changes.
plannerRoundsHistogram.observe(loopResult.rounds);
for (const ec of loopResult.executedCalls) {
const catalogEntry = SERVER_TOOLS.find((t) => t.name === ec.call.name);
const policy = catalogEntry?.defaultPolicy ?? 'propose';
// Server-side execution is always deferred to the client —
// the onToolCall stub returns success without running
// anything. Real execution metrics will come from the
// webapp runner once it emits its own Prom surface.
toolCallsTotal.inc({ tool: ec.call.name, policy, outcome: 'deferred' });
}
return {
plan: {
summary: loopResult.summary ?? '',
@@ -347,11 +365,23 @@ async function planOneMission(
};
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
if (err instanceof ProviderCallError) {
const provider = inferProviderFromModel('google/gemini-2.5-flash');
providerErrorsTotal.inc({ provider, kind: err.kind });
}
console.warn(`[mana-ai tick] mission=${m.id} planner loop failed: ${msg}`);
return null;
}
}
/** Parse the provider name off a `provider/model` string, e.g.
* `google/gemini-2.5-flash` → `google`. Used purely for metric
* labelling; falls back to `'unknown'` so a misconfigured model id
* doesn't end up as an empty label value. */
function inferProviderFromModel(model: string): string {
const [provider] = model.split('/', 1);
return provider || 'unknown';
}
/**
* Drop tools the agent's policy denies so the Planner never sees a tool
* it can't use. `propose` and `auto` stay (but the server only hands the

metrics.ts

@@ -154,3 +154,55 @@ export const tokensUsedTotal = new Counter({
labelNames: ['agent_id'] as const,
registers: [register],
});
// ── Function-Calling Planner (post-migration) ────────────
/**
* Per-tool outcome counter.
*
* `policy` is the catalog default (auto / propose). The server-side
* surface offers only propose-tools, so in practice this is always
* `propose`, but the label stays for forward-compatibility with
* a future web-runner integration.
*
* `outcome` values:
* - `success`: the onToolCall callback returned `success: true`
*   (used in environments that actually execute)
* - `failure`: onToolCall returned `success: false`
* - `deferred`: the server-side stub; the tool_call is recorded
*   for client-side application on sync (the ONLY value the
*   mana-ai tick emits today)
*/
export const toolCallsTotal = new Counter({
name: 'mana_ai_tool_calls_total',
help: 'Total tool_calls produced by the planner and handled.',
labelNames: ['tool', 'policy', 'outcome'] as const,
registers: [register],
});
/**
* Distribution of how many planner rounds a single iteration consumed.
* 1 = LLM went straight to a terminal answer; runs close to the hard
* cap (5) mean the planner is struggling. Buckets line up with the
* fixed 5-round ceiling so Grafana's heatmap is trivially readable.
*/
export const plannerRoundsHistogram = new Histogram({
name: 'mana_ai_planner_rounds',
help: 'Number of reasoning rounds consumed per iteration.',
buckets: [1, 2, 3, 4, 5],
registers: [register],
});
/**
* Structured provider errors returned from mana-llm. `kind` mirrors
* the ProviderError hierarchy in services/mana-llm/src/providers/errors.py
* (blocked / truncated / auth / rate_limit / capability / unknown).
* `provider` is inferred from the model id (google / openrouter /
* ollama / …).
*/
export const providerErrorsTotal = new Counter({
name: 'mana_ai_provider_errors_total',
help: 'Structured provider errors surfaced from mana-llm.',
labelNames: ['provider', 'kind'] as const,
registers: [register],
});

llm-client.ts

@@ -16,6 +16,20 @@ import type {
ToolCallRequest,
} from '@mana/shared-ai';
/** Thrown when mana-llm returns a non-2xx status. `kind` mirrors the
* structured ProviderError vocabulary (blocked / truncated / auth /
* rate_limit / capability / unknown) so downstream metrics can label
* without re-parsing the message. */
export class ProviderCallError extends Error {
constructor(
message: string,
public readonly kind: string
) {
super(message);
this.name = 'ProviderCallError';
}
}
export interface ServerLlmClientOptions {
readonly baseUrl: string;
readonly serviceKey: string;
@@ -65,8 +79,34 @@ export function createServerLlmClient(opts: ServerLlmClientOptions): LlmClient {
clearTimeout(timeout);
if (!res.ok) {
const detail = await res.text().catch(() => '');
throw new Error(`mana-llm ${res.status}: ${detail.slice(0, 500)}`);
// mana-llm surfaces structured errors from the provider
// layer (see services/mana-llm/src/providers/errors.py):
// `{ detail: { kind, message } }` for 422 / 429 / 502 /
// 400, plain string detail for everything else. Preserve
// `kind` on the thrown error so callers (tick metrics)
// can label provider_errors_total without re-parsing.
let kind = 'unknown';
let message = `mana-llm ${res.status}`;
try {
const body = (await res.json()) as {
detail?: string | { kind?: string; message?: string };
};
if (typeof body.detail === 'string') {
message = `${message}: ${body.detail.slice(0, 500)}`;
} else if (body.detail && typeof body.detail === 'object') {
kind = body.detail.kind ?? 'unknown';
message = `${message} (${kind}): ${body.detail.message ?? ''}`;
}
} catch {
// body wasn't JSON — fall back to plain text
try {
const text = await res.text();
if (text) message = `${message}: ${text.slice(0, 500)}`;
} catch {
/* already exhausted body stream */
}
}
throw new ProviderCallError(message, kind);
}
const data = (await res.json()) as ChatCompletionResponseShape;