Mirror of https://github.com/Memo-2023/mana-monorepo.git, synced 2026-05-14 21:21:10 +02:00
feat(mana-ai): Prometheus metrics for tool-calls, loop rounds, provider errors
Two new counters and one histogram fill the observability gap left by
the function-calling migration (an illustrative sketch of the resulting
series follows the list):
- mana_ai_tool_calls_total{tool, policy, outcome} — one tick per
tool_call the planner produced. `outcome` is `deferred` on the
server (the stub onToolCall only records the call for later client
execution); the webapp runner will emit success/failure once it
grows its own Prom surface.
- mana_ai_planner_rounds (histogram, buckets 1..5) — distribution of
rounds consumed per iteration. Iterations that run close to the cap
signal a planner struggling with the mission objective.
- mana_ai_provider_errors_total{provider, kind} — structured errors
surfaced from mana-llm. Kind mirrors the ProviderError hierarchy
added in commit 1 of the migration (blocked/truncated/auth/
rate_limit/capability/unknown).
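Once scraped, the new series should look roughly like the lines below (label values are invented
for illustration; the histogram also exposes the remaining le buckets plus _sum/_count):

  mana_ai_tool_calls_total{tool="web_search",policy="propose",outcome="deferred"} 4
  mana_ai_planner_rounds_bucket{le="2"} 7
  mana_ai_provider_errors_total{provider="google",kind="rate_limit"} 1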
Plumbing:
- llm-client.ts parses mana-llm's `{detail: {kind, message}}` 4xx/5xx
body shape and re-throws it as a ProviderCallError carrying the kind
(a worked example of the mapping follows these bullets).
- tick.ts observes metrics at the natural emission points — rounds
+ per-call counter after runPlannerLoop returns, provider_errors
in the catch block.
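For concreteness, a sketch of the path those two bullets describe, with made-up status and
message values (the message format matches the llm-client.ts hunk further down):

  // mana-llm replies 429 with body   { detail: { kind: 'rate_limit', message: 'quota exhausted' } }
  // llm-client.ts throws             new ProviderCallError('mana-llm 429 (rate_limit): quota exhausted', 'rate_limit')
  // the tick.ts catch block records  providerErrorsTotal.inc({ provider: 'google', kind: 'rate_limit' })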
Grafana dashboards + status.mana.how already pick up the
collectDefaultMetrics prefix, so these metrics land in the existing
mana-ai panel without scraper changes.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent 3ac32d9f3e
commit 1d3794f96c

3 changed files with 125 additions and 3 deletions
tick.ts

@@ -28,7 +28,7 @@ import { listDueMissions, type ServerMission } from '../db/missions-projection';
 import { loadActiveAgents, refreshAgentSnapshots, type ServerAgent } from '../db/agents-projection';
 import { appendServerIteration, planToIteration } from '../db/iteration-writer';
 import { refreshSnapshots } from '../db/snapshot-refresh';
-import { createServerLlmClient } from '../planner/llm-client';
+import { createServerLlmClient, ProviderCallError } from '../planner/llm-client';
 import { SERVER_TOOLS } from '../planner/tools';
 import {
   ticksTotal,
@@ -43,6 +43,9 @@ import {
   grantSkipsTotal,
   agentDecisionsTotal,
   tokensUsedTotal,
+  toolCallsTotal,
+  plannerRoundsHistogram,
+  providerErrorsTotal,
 } from '../metrics';
 import { unwrapMissionGrant } from '../crypto/unwrap-grant';
 import { NewsResearchClient } from '../planner/news-research-client';
@@ -333,6 +336,21 @@ async function planOneMission(
       }),
     });

+    // Observability: one counter tick per tool_call + one histogram
+    // sample for round consumption. `policy` is pulled off the
+    // catalog entry so a later change to Gemini-default flipping
+    // auto→propose would show up in the labels without code changes.
+    plannerRoundsHistogram.observe(loopResult.rounds);
+    for (const ec of loopResult.executedCalls) {
+      const catalogEntry = SERVER_TOOLS.find((t) => t.name === ec.call.name);
+      const policy = catalogEntry?.defaultPolicy ?? 'propose';
+      // Server-side execution is always deferred to the client —
+      // the onToolCall stub returns success without running
+      // anything. Real execution metrics will come from the
+      // webapp runner once it emits its own Prom surface.
+      toolCallsTotal.inc({ tool: ec.call.name, policy, outcome: 'deferred' });
+    }
+
     return {
       plan: {
         summary: loopResult.summary ?? '',
@@ -347,11 +365,23 @@ async function planOneMission(
     };
   } catch (err) {
     const msg = err instanceof Error ? err.message : String(err);
+    if (err instanceof ProviderCallError) {
+      const provider = inferProviderFromModel('google/gemini-2.5-flash');
+      providerErrorsTotal.inc({ provider, kind: err.kind });
+    }
     console.warn(`[mana-ai tick] mission=${m.id} planner loop failed: ${msg}`);
     return null;
   }
 }

+/** Parse provider name off a `provider/model` string. Used purely for
+ * metric labelling — falls back to `'unknown'` so a misconfigured
+ * model id doesn't crash the counter. */
+function inferProviderFromModel(model: string): string {
+  const [provider] = model.split('/', 1);
+  return provider || 'unknown';
+}
+
 /**
  * Drop tools the agent's policy denies so the Planner never sees a tool
  * it can't use. `propose` and `auto` stay (but the server only hands the
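For reference, the provider parsing introduced in the hunk above behaves like this on a few
inputs (a sketch derived from the function body, not part of the diff):

  inferProviderFromModel('google/gemini-2.5-flash'); // 'google'
  inferProviderFromModel('gemini-2.5-flash');        // no '/', so the whole id comes back
  inferProviderFromModel('');                        // empty segment, falls back to 'unknown'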
metrics.ts

@@ -154,3 +154,55 @@ export const tokensUsedTotal = new Counter({
   labelNames: ['agent_id'] as const,
   registers: [register],
 });
+
+// ── Function-Calling Planner (post-migration) ────────────
+
+/**
+ * Per-tool outcome counter.
+ *
+ * `policy` is the catalog default (auto / propose) — the server-side
+ * surface offers only propose-tools, so in practice this is always
+ * `propose`, but the label stays for forward-compatibility with
+ * a future web-runner integration.
+ *
+ * `outcome` values:
+ *   - `success`  — the onToolCall callback returned `success: true`
+ *                  (used in environments that actually execute)
+ *   - `failure`  — onToolCall returned `success: false`
+ *   - `deferred` — the server-side stub; the tool_call is recorded
+ *                  for client-side application on sync (the ONLY
+ *                  value the mana-ai tick emits today)
+ */
+export const toolCallsTotal = new Counter({
+  name: 'mana_ai_tool_calls_total',
+  help: 'Total tool_calls produced by the planner and handled.',
+  labelNames: ['tool', 'policy', 'outcome'] as const,
+  registers: [register],
+});
+
+/**
+ * Distribution of how many planner rounds a single iteration consumed.
+ * 1 = LLM went straight to a terminal answer; runs close to the hard
+ * cap (5) mean the planner is struggling. Buckets line up with the
+ * fixed 5-round ceiling so Grafana's heatmap is trivially readable.
+ */
+export const plannerRoundsHistogram = new Histogram({
+  name: 'mana_ai_planner_rounds',
+  help: 'Number of reasoning rounds consumed per iteration.',
+  buckets: [1, 2, 3, 4, 5],
+  registers: [register],
+});
+
+/**
+ * Structured provider errors returned from mana-llm. `kind` mirrors
+ * the ProviderError hierarchy in services/mana-llm/src/providers/errors.py
+ * (blocked / truncated / auth / rate_limit / capability / unknown).
+ * `provider` is inferred from the model id (google / openrouter /
+ * ollama / …).
+ */
+export const providerErrorsTotal = new Counter({
+  name: 'mana_ai_provider_errors_total',
+  help: 'Structured provider errors surfaced from mana-llm.',
+  labelNames: ['provider', 'kind'] as const,
+  registers: [register],
+});
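A minimal local smoke test for the three new metrics. This is a sketch, assuming the metrics
module also exports its `register` (suggested by the `registers: [register]` option above) and
prom-client >= 13, where `register.metrics()` resolves to the exposition text; label values are
placeholders:

  import { register, toolCallsTotal, plannerRoundsHistogram, providerErrorsTotal } from '../metrics';

  async function smokeTestMetrics(): Promise<void> {
    // Exercise each new metric once with representative labels.
    toolCallsTotal.inc({ tool: 'web_search', policy: 'propose', outcome: 'deferred' });
    plannerRoundsHistogram.observe(2);
    providerErrorsTotal.inc({ provider: 'google', kind: 'rate_limit' });

    // prom-client serialises labels in labelNames order, so the output should contain a line like:
    //   mana_ai_tool_calls_total{tool="web_search",policy="propose",outcome="deferred"} 1
    const exposition = await register.metrics();
    console.log(exposition.split('\n').filter((line) => line.startsWith('mana_ai_')).join('\n'));
  }

  smokeTestMetrics().catch(console.error);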
llm-client.ts

@@ -16,6 +16,20 @@ import type {
   ToolCallRequest,
 } from '@mana/shared-ai';

+/** Thrown when mana-llm returns a non-2xx status. `kind` mirrors the
+ * structured ProviderError vocabulary (blocked / truncated / auth /
+ * rate_limit / capability / unknown) so downstream metrics can label
+ * without re-parsing the message. */
+export class ProviderCallError extends Error {
+  constructor(
+    message: string,
+    public readonly kind: string
+  ) {
+    super(message);
+    this.name = 'ProviderCallError';
+  }
+}
+
 export interface ServerLlmClientOptions {
   readonly baseUrl: string;
   readonly serviceKey: string;
@@ -65,8 +79,34 @@ export function createServerLlmClient(opts: ServerLlmClientOptions): LlmClient {
     clearTimeout(timeout);

     if (!res.ok) {
-      const detail = await res.text().catch(() => '');
-      throw new Error(`mana-llm ${res.status}: ${detail.slice(0, 500)}`);
+      // mana-llm surfaces structured errors from the provider
+      // layer (see services/mana-llm/src/providers/errors.py):
+      // `{ detail: { kind, message } }` for 422 / 429 / 502 /
+      // 400, plain string detail for everything else. Preserve
+      // `kind` on the thrown error so callers (tick metrics)
+      // can label provider_errors_total without re-parsing.
+      let kind = 'unknown';
+      let message = `mana-llm ${res.status}`;
+      try {
+        const body = (await res.json()) as {
+          detail?: string | { kind?: string; message?: string };
+        };
+        if (typeof body.detail === 'string') {
+          message = `${message}: ${body.detail.slice(0, 500)}`;
+        } else if (body.detail && typeof body.detail === 'object') {
+          kind = body.detail.kind ?? 'unknown';
+          message = `${message} (${kind}): ${body.detail.message ?? ''}`;
+        }
+      } catch {
+        // body wasn't JSON — fall back to plain text
+        try {
+          const text = await res.text();
+          if (text) message = `${message}: ${text.slice(0, 500)}`;
+        } catch {
+          /* already exhausted body stream */
+        }
+      }
+      throw new ProviderCallError(message, kind);
     }

     const data = (await res.json()) as ChatCompletionResponseShape;
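Beyond metric labelling, keeping `kind` on the thrown error lets callers branch on the failure
class without parsing messages. One possible consumer, sketched here as a hypothetical helper
that is not part of this commit (the import path mirrors the one tick.ts uses):

  import { ProviderCallError } from '../planner/llm-client';

  // Hypothetical: treat only plausibly transient kinds as worth retrying.
  function isRetryableProviderError(err: unknown): boolean {
    return err instanceof ProviderCallError && (err.kind === 'rate_limit' || err.kind === 'unknown');
  }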