feat(mana-ai): Prometheus metrics for tool-calls, loop rounds, provider errors

Three new counters + one histogram fill the observability gap from
the function-calling migration:

- mana_ai_tool_calls_total{tool, policy, outcome} — one tick per
  tool_call the planner produced. `outcome` is `deferred` on the
  server (the stub onToolCall records calls for later client
  execution); the webapp runner will emit success/failure once it
  grows its own Prom surface (see the sketch after this list).
- mana_ai_planner_rounds (histogram, buckets 1..5) — distribution of
  rounds consumed per iteration. Runs close to the cap signal a
  planner struggling with the mission objective.
- mana_ai_provider_errors_total{provider, kind} — structured errors
  surfaced from mana-llm. Kind mirrors the ProviderError hierarchy
  added in commit 1 of the migration (blocked/truncated/auth/
  rate_limit/capability/unknown).
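
A sketch of how the future webapp runner could report real outcomes
against the same counter name and label schema; the counter instance
and helper below are assumptions, not part of this commit:

    import { Counter } from 'prom-client';

    // Hypothetical web-runner counter: same name and labels as the
    // server's. (Uses prom-client's default registry here; the real
    // runner would wire up its own.)
    const webToolCallsTotal = new Counter({
      name: 'mana_ai_tool_calls_total',
      help: 'Tool calls executed by the webapp runner.',
      labelNames: ['tool', 'policy', 'outcome'] as const,
    });

    // Called once a deferred tool_call has actually run client-side.
    export function recordToolRun(
      tool: string,
      policy: 'auto' | 'propose',
      success: boolean
    ): void {
      webToolCallsTotal.inc({
        tool,
        policy,
        outcome: success ? 'success' : 'failure',
      });
    }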

Plumbing:
- llm-client.ts parses mana-llm's `{detail: {kind, message}}` 4xx/5xx
  body shape (example shapes below) and re-throws it as a
  ProviderCallError carrying the kind.
- tick.ts observes metrics at the natural emission points — rounds
  + per-call counter after runPlannerLoop returns, provider_errors
  in the catch block.
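
For reference, the two body shapes the client distinguishes, with
invented example values (only the `{ detail: { kind, message } }`
contract itself comes from mana-llm):

    // Structured 4xx/5xx body, e.g. a rate-limited provider call
    // (field values invented for illustration):
    const structured = {
      detail: { kind: 'rate_limit', message: 'quota exhausted, retry later' },
    };

    // Unstructured fallback: plain-string detail, mapped to kind 'unknown'.
    const plain = { detail: 'internal error' };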

Grafana dashboards + status.mana.how already pick up the
collectDefaultMetrics prefix, so these metrics land in the existing
mana-ai panel without scraper changes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Till JS 2026-04-20 20:48:29 +02:00
parent 3ac32d9f3e
commit 1d3794f96c
3 changed files with 125 additions and 3 deletions

tick.ts

@@ -28,7 +28,7 @@ import { listDueMissions, type ServerMission } from '../db/missions-projection';
import { loadActiveAgents, refreshAgentSnapshots, type ServerAgent } from '../db/agents-projection';
import { appendServerIteration, planToIteration } from '../db/iteration-writer';
import { refreshSnapshots } from '../db/snapshot-refresh';
import { createServerLlmClient } from '../planner/llm-client';
import { createServerLlmClient, ProviderCallError } from '../planner/llm-client';
import { SERVER_TOOLS } from '../planner/tools';
import {
ticksTotal,
@@ -43,6 +43,9 @@ import {
grantSkipsTotal,
agentDecisionsTotal,
tokensUsedTotal,
toolCallsTotal,
plannerRoundsHistogram,
providerErrorsTotal,
} from '../metrics';
import { unwrapMissionGrant } from '../crypto/unwrap-grant';
import { NewsResearchClient } from '../planner/news-research-client';
@@ -333,6 +336,21 @@ async function planOneMission(
}),
});
// Observability: one counter tick per tool_call + one histogram
// sample for round consumption. `policy` is pulled off the
// catalog entry, so a later change that flips the Gemini default
// from auto to propose would show up in the labels without code changes.
plannerRoundsHistogram.observe(loopResult.rounds);
for (const ec of loopResult.executedCalls) {
const catalogEntry = SERVER_TOOLS.find((t) => t.name === ec.call.name);
const policy = catalogEntry?.defaultPolicy ?? 'propose';
// Server-side execution is always deferred to the client —
// the onToolCall stub returns success without running
// anything. Real execution metrics will come from the
// webapp runner once it emits its own Prom surface.
toolCallsTotal.inc({ tool: ec.call.name, policy, outcome: 'deferred' });
}
return {
plan: {
summary: loopResult.summary ?? '',
@@ -347,11 +365,23 @@ async function planOneMission(
};
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
if (err instanceof ProviderCallError) {
const provider = inferProviderFromModel('google/gemini-2.5-flash');
providerErrorsTotal.inc({ provider, kind: err.kind });
}
console.warn(`[mana-ai tick] mission=${m.id} planner loop failed: ${msg}`);
return null;
}
}
/** Parse the provider name off a `provider/model` string, e.g.
* `google/gemini-2.5-flash` → `google`. Used purely for metric
* labelling; falls back to `'unknown'` so a misconfigured model id
* doesn't end up as an empty label value. */
function inferProviderFromModel(model: string): string {
const [provider] = model.split('/', 1);
return provider || 'unknown';
}
/**
* Drop tools the agent's policy denies so the Planner never sees a tool
* it can't use. `propose` and `auto` stay (but the server only hands the

metrics.ts

@@ -154,3 +154,55 @@ export const tokensUsedTotal = new Counter({
labelNames: ['agent_id'] as const,
registers: [register],
});
// ── Function-Calling Planner (post-migration) ────────────
/**
* Per-tool outcome counter.
*
* `policy` is the catalog default (auto / propose). The server-side
* surface offers only propose-tools, so in practice this is always
* `propose`, but the label stays for forward-compatibility with
* a future web-runner integration.
*
* `outcome` values:
* - `success`: the onToolCall callback returned `success: true`
*   (used in environments that actually execute)
* - `failure`: onToolCall returned `success: false`
* - `deferred`: the server-side stub; the tool_call is recorded
*   for client-side application on sync (the ONLY value the
*   mana-ai tick emits today)
*/
export const toolCallsTotal = new Counter({
name: 'mana_ai_tool_calls_total',
help: 'Total tool_calls produced by the planner and handled.',
labelNames: ['tool', 'policy', 'outcome'] as const,
registers: [register],
});
/**
* Distribution of how many planner rounds a single iteration consumed.
* 1 = LLM went straight to a terminal answer; runs close to the hard
* cap (5) mean the planner is struggling. Buckets line up with the
* fixed 5-round ceiling so Grafana's heatmap is trivially readable.
*/
export const plannerRoundsHistogram = new Histogram({
name: 'mana_ai_planner_rounds',
help: 'Number of reasoning rounds consumed per iteration.',
buckets: [1, 2, 3, 4, 5],
registers: [register],
});
/**
* Structured provider errors returned from mana-llm. `kind` mirrors
* the ProviderError hierarchy in services/mana-llm/src/providers/errors.py
* (blocked / truncated / auth / rate_limit / capability / unknown).
* `provider` is inferred from the model id (google / openrouter /
* ollama / …).
*/
export const providerErrorsTotal = new Counter({
name: 'mana_ai_provider_errors_total',
help: 'Structured provider errors surfaced from mana-llm.',
labelNames: ['provider', 'kind'] as const,
registers: [register],
});

llm-client.ts

@@ -16,6 +16,20 @@ import type {
ToolCallRequest,
} from '@mana/shared-ai';
/** Thrown when mana-llm returns a non-2xx status. `kind` mirrors the
* structured ProviderError vocabulary (blocked / truncated / auth /
* rate_limit / capability / unknown) so downstream metrics can label
* without re-parsing the message. */
export class ProviderCallError extends Error {
constructor(
message: string,
public readonly kind: string
) {
super(message);
this.name = 'ProviderCallError';
}
}
export interface ServerLlmClientOptions {
readonly baseUrl: string;
readonly serviceKey: string;
@@ -65,8 +79,34 @@ export function createServerLlmClient(opts: ServerLlmClientOptions): LlmClient {
clearTimeout(timeout);
if (!res.ok) {
const detail = await res.text().catch(() => '');
throw new Error(`mana-llm ${res.status}: ${detail.slice(0, 500)}`);
// mana-llm surfaces structured errors from the provider
// layer (see services/mana-llm/src/providers/errors.py):
// `{ detail: { kind, message } }` for 422 / 429 / 502 /
// 400, plain string detail for everything else. Preserve
// `kind` on the thrown error so callers (tick metrics)
// can label provider_errors_total without re-parsing.
let kind = 'unknown';
let message = `mana-llm ${res.status}`;
try {
const body = (await res.json()) as {
detail?: string | { kind?: string; message?: string };
};
if (typeof body.detail === 'string') {
message = `${message}: ${body.detail.slice(0, 500)}`;
} else if (body.detail && typeof body.detail === 'object') {
kind = body.detail.kind ?? 'unknown';
message = `${message} (${kind}): ${body.detail.message ?? ''}`;
}
} catch {
// body wasn't JSON — fall back to plain text
try {
const text = await res.text();
if (text) message = `${message}: ${text.slice(0, 500)}`;
} catch {
/* already exhausted body stream */
}
}
throw new ProviderCallError(message, kind);
}
const data = (await res.json()) as ChatCompletionResponseShape;