mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-21 03:06:42 +02:00
Third phase of the Multi-Agent Workbench. The background mission
runner now respects the owning Agent: agent state gates whether
a mission runs, concurrency is capped per-agent, and server-produced
iterations carry the agent's identity as their Actor.
Data layer:
- db/migrate.ts: new mana_ai.agent_snapshots table (mirrors
mission_snapshots) with indexes on (user_id, last_applied_at) and
a partial index on active agents.
- db/agents-projection.ts: refreshAgentSnapshots (incremental LWW
replay over sync_changes appId='ai' table='agents') +
loadActiveAgents / loadAgent helpers. mergeRaw exported for tests.
- db/missions-projection.ts: ServerMission.agentId + projection
reads the JSONB field (undefined for legacy missions).
Tick integration (cron/tick.ts):
- Refreshes both snapshot tables on every pass (parallel).
- Per-user in-tick agent cache (Map<userId, Map<agentId, Agent>>)
so N missions for one user hit the DB once.
- Gate order: agent archived → skip silently; agent paused → skip;
per-agent maxConcurrentMissions exhausted this tick → defer to next.
All skip paths bump mana_ai_agent_decisions_total{decision}.
- Prompt injection: withAgentContext prepends an <agent_context>
block to the system prompt with the agent's name + role, and
plaintext systemPrompt + memory when available. Ciphertext values
(enc:1:… blobs) are skipped — the server has no key by design. Mirrors
the Mission Grant privacy stance: encrypted context belongs to the
foreground runner.
Iteration writer (db/iteration-writer.ts):
- New optional `agent` + `iterationId` + `rationale` inputs.
- When agent is present, the sync_changes row is stamped with a
makeAgentActor actor (principalId=agentId, displayName=agent.name)
so the webapp timeline groups the write under the right agent.
- Falls back to an AI actor with LEGACY_AI_PRINCIPAL + 'Mana' when
the mission has no owning agent; ultimate fallback to the
mission-runner system actor when iterationId is also missing.
Metrics:
- mana_ai_agent_decisions_total{decision=ran|skipped-paused|
skipped-archived|skipped-concurrency}. Missions without an agent
don't produce this metric — plansWrittenBackTotal is the universal
"did we run" counter.
Tests: 41/41 (was 35) including 6 new cases for the agent LWW merge.
mana-ai type-check clean. Webapp svelte-check: 0 errors (4 unrelated
warnings in a different module).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
147 lines
5.3 KiB
TypeScript
147 lines
5.3 KiB
TypeScript
/**
 * Prometheus metrics — exported on GET /metrics.
 *
 * Follows the same shape as mana-media (default metrics with a service
 * prefix, plus a handful of service-specific counters + histograms) so
 * the existing Grafana dashboards and the status.mana.how generator
 * recognise this service without special-casing.
 *
 * Metric naming: `mana_ai_*`. The underscore separator keeps every name
 * within Prometheus's metric-name regex `[a-zA-Z_:][a-zA-Z0-9_:]*`.
 */
|
|
|
|
import { Counter, Histogram, Registry, collectDefaultMetrics } from 'prom-client';
|
|
|
|
/**
 * Dedicated registry for this service. Every metric declared in this
 * module is attached to it through its `registers` option.
 */
export const register = new Registry();

// Stamp every sample emitted from this registry with the service name.
register.setDefaultLabels({
	service: 'mana-ai',
});

// prom-client's default metrics, collected into the same registry and
// carrying the same `mana_ai_` prefix as the custom metrics in this file.
collectDefaultMetrics({
	prefix: 'mana_ai_',
	register,
});
|
|
|
|
// ── HTTP surface ──────────────────────────────────────────
|
|
|
|
/** Counter of received HTTP requests, labelled by method, path and status. */
export const httpRequestsTotal = new Counter({
	registers: [register],
	name: 'mana_ai_http_requests_total',
	help: 'Total HTTP requests received.',
	labelNames: ['method', 'path', 'status'] as const,
});
|
|
|
|
/**
 * Histogram of per-request latency, labelled by method, path and status.
 * Bucket bounds run from 0.005 up to 10 (seconds, per the metric name).
 */
export const httpRequestDuration = new Histogram({
	registers: [register],
	name: 'mana_ai_http_request_duration_seconds',
	help: 'Latency per HTTP request.',
	labelNames: ['method', 'path', 'status'] as const,
	buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
});
|
|
|
|
// ── Mission runner — service-specific ─────────────────────
|
|
|
|
/**
 * Counter of tick loop runs. Per its help text, every completed run is
 * counted, including runs that turned out to be empty.
 */
export const ticksTotal = new Counter({
	registers: [register],
	name: 'mana_ai_ticks_total',
	help: 'Total tick loop runs (all completions, including empty ones).',
});
|
|
|
|
/**
 * Histogram of wall-clock time spent inside a single tick pass.
 * Bucket bounds run from 0.1 up to 120 (seconds, per the metric name).
 */
export const tickDuration = new Histogram({
	registers: [register],
	name: 'mana_ai_tick_duration_seconds',
	help: 'Wall-clock time spent inside one tick pass.',
	buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60, 120],
});
|
|
|
|
/** Counter of plans for which the Planner returned parseable output. */
export const plansProducedTotal = new Counter({
	registers: [register],
	name: 'mana_ai_plans_produced_total',
	help: 'Total plans the Planner returned parseable output for.',
});
|
|
|
|
/** Counter of plans persisted as server iterations on aiMissions. */
export const plansWrittenBackTotal = new Counter({
	registers: [register],
	name: 'mana_ai_plans_written_back_total',
	help: 'Total plans persisted as server iterations on aiMissions.',
});
|
|
|
|
/** Counter of Planner responses rejected by JSON / shape validation. */
export const parseFailuresTotal = new Counter({
	registers: [register],
	name: 'mana_ai_parse_failures_total',
	help: 'Planner responses that failed JSON / shape validation.',
});
|
|
|
|
/** Counter of errors thrown while a tick was processing a single mission. */
export const missionErrorsTotal = new Counter({
	registers: [register],
	name: 'mana_ai_mission_errors_total',
	help: 'Errors thrown while processing a single mission within a tick.',
});
|
|
|
|
/**
 * Histogram of call latency against the mana-llm backend.
 * Bucket bounds run from 0.25 up to 60 (seconds, per the metric name).
 */
export const plannerLatency = new Histogram({
	registers: [register],
	name: 'mana_ai_planner_request_duration_seconds',
	help: 'Latency of calls to the mana-llm backend.',
	buckets: [0.25, 0.5, 1, 2, 5, 10, 30, 60],
});
|
|
|
|
// ── Snapshot refresh ──────────────────────────────────────
|
|
|
|
/** Counter of mission-snapshot rows created the first time they are seen. */
export const snapshotsNewTotal = new Counter({
	registers: [register],
	name: 'mana_ai_snapshots_new_total',
	help: 'Mission-snapshot rows created on first sighting.',
});
|
|
|
|
/** Counter of existing mission-snapshot rows that received a delta. */
export const snapshotsUpdatedTotal = new Counter({
	registers: [register],
	name: 'mana_ai_snapshots_updated_total',
	help: 'Mission-snapshot rows updated with a delta.',
});
|
|
|
|
/** Counter of sync-changes rows folded into the snapshot cache. */
export const snapshotRowsAppliedTotal = new Counter({
	registers: [register],
	name: 'mana_ai_snapshot_rows_applied_total',
	help: 'Sync-changes rows folded into the snapshot cache.',
});
|
|
|
|
// ── Mission Key-Grant (Phase 2+) ──────────────────────────
|
|
|
|
/**
 * Counter of server-side field decrypts performed under a Mission grant,
 * labelled by `table`.
 */
export const decryptsTotal = new Counter({
	registers: [register],
	name: 'mana_ai_decrypts_total',
	help: 'Server-side field decrypts performed under a Mission grant.',
	labelNames: ['table'] as const,
});
|
|
|
|
/**
 * Counter of decrypt attempts rejected because the record was not on the
 * grant's allowlist, labelled by `table`.
 *
 * Expected to stay at 0 in steady state: any increment means a record was
 * requested outside the allowlist. Alert on > 0.
 */
export const grantScopeViolationsTotal = new Counter({
	registers: [register],
	name: 'mana_ai_grant_scope_violations_total',
	help: 'Decrypt attempts rejected because the record was not on the grant allowlist.',
	labelNames: ['table'] as const,
});
|
|
|
|
/**
 * Counter of missions skipped because their grant was missing, expired,
 * or unwrappable; the `reason` label distinguishes the cause.
 */
export const grantSkipsTotal = new Counter({
	registers: [register],
	name: 'mana_ai_grant_skips_total',
	help: 'Missions skipped because their grant was missing, expired, or unwrappable.',
	labelNames: ['reason'] as const,
});
|
|
|
|
// ── Multi-Agent Workbench (Phase 3) ───────────────────────
|
|
|
|
/**
 * Counter of the decision a tick took for each mission with respect to
 * the agent that owns it. The `decision` label takes one of:
 *
 *   - `ran`                 — mission processed normally under the agent
 *   - `skipped-paused`      — agent.state === 'paused'
 *   - `skipped-archived`    — agent.state === 'archived'
 *   - `skipped-concurrency` — the agent's maxConcurrentMissions was
 *                             already reached this tick; retried next tick
 *
 * Missions with no owning agent (legacy, pre-Phase-2) never produce this
 * metric, which is why `mana_ai_plans_written_back_total` remains the
 * ground-truth "did we run" counter.
 */
export const agentDecisionsTotal = new Counter({
	registers: [register],
	name: 'mana_ai_agent_decisions_total',
	help: 'Per-mission decision the tick made against the owning Agent.',
	labelNames: ['decision'] as const,
});
|