feat(mana-ai): Prometheus /metrics endpoint + status.mana.how integration

Wires mana-ai into the existing observability stack so tick throughput,
plan-failure rates, planner latencies, and snapshot refresh health are
visible in Grafana + Prometheus, and the service's uptime surfaces on
status.mana.how under the "Internal" section.

- `src/metrics.ts` — prom-client Registry with `mana_ai_` prefix.
  Counters: ticks_total, plans_produced_total, plans_written_back_total,
  parse_failures_total, mission_errors_total, snapshots_new/updated,
  snapshot_rows_applied_total, http_requests_total.
  Histograms: tick_duration_seconds (0.1–120s), planner_request_
  duration_seconds (0.25–60s), http_request_duration_seconds (0.005–10s).
- `src/index.ts` — HTTP middleware labels every request by
  method/path/status; `/metrics` serves the Prometheus text format.
- `src/cron/tick.ts` — increments counters + wraps the tick with
  `tickDuration.startTimer()`. Snapshot stats fold through.
- `src/planner/client.ts` — wraps `complete()` in a latency histogram
  timer so planner tail latency shows up separately from tick duration.
- `docker/prometheus/prometheus.yml` —
  1. New `mana-ai` scrape job against `mana-ai:3066/metrics` (30s).
  2. `/health` added to the `blackbox-internal` job so uptime shows on
     status.mana.how alongside mana-geocoding.
- `scripts/generate-status-page.sh` — friendly label for the new probe:
  `mana-ai:3066/health` → "Mana AI Runner" (generator already iterates
  `blackbox-internal`, no other changes needed).
- `package.json` — prom-client ^15.1.3

All 17 Bun tests still pass; tsc clean.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-15 01:41:40 +02:00
parent 767b64cdd4
commit 0bf01f434e
9 changed files with 184 additions and 3 deletions

View file

@ -123,6 +123,15 @@ scrape_configs:
metrics_path: '/metrics'
scrape_interval: 30s
# Mana AI Service (Bun) — background Mission Runner for the AI Workbench.
# Exposes tick stats, planner-request latencies, snapshot refresh
# counters, and standard HTTP metrics at /metrics.
- job_name: 'mana-ai'
static_configs:
- targets: ['mana-ai:3066']
metrics_path: '/metrics'
scrape_interval: 30s
# ============================================
# GPU Server (Windows PC, LAN: 192.168.178.11)
# ============================================
@ -297,6 +306,8 @@ scrape_configs:
# Upstream Pelias health, proxied through the wrapper so the
# blackbox-exporter doesn't need host.docker.internal access.
- http://mana-geocoding:3018/health/pelias
# mana-ai (Mission Runner) — internal-only, no CF tunnel.
- http://mana-ai:3066/health
relabel_configs:
- source_labels: [__address__]
target_label: __param_target

3
pnpm-lock.yaml generated
View file

@ -3297,6 +3297,9 @@ importers:
postgres:
specifier: ^3.4.5
version: 3.4.9
prom-client:
specifier: ^15.1.3
version: 15.1.3
devDependencies:
'@types/bun':
specifier: latest

View file

@ -75,6 +75,9 @@ friendly_name() {
mana-geocoding:*)
name="Mana Geocoding"
;;
mana-ai:*)
name="Mana AI Runner"
;;
mana.how/*)
name="${name#mana.how/}"
;;

View file

@ -19,11 +19,20 @@ What works end-to-end:
- [x] Webapp staging effect (`server-iteration-staging.ts`) picks up the synced iteration and translates each PlanStep into a local Proposal with full AI-actor attribution (missionId + iterationId + rationale). Idempotent via durable `proposalId` markers.
- [x] **Server-side input resolvers** for plaintext tables — `db/resolvers/` with a pluggable registry + single-record LWW replay (`record-replay.ts`). `goals` resolver ships by default. Encrypted tables (notes, kontext, tasks, events, journal, …) are intentionally **not** resolved server-side; those missions depend on the foreground runner which decrypts client-side. See `resolvers/types.ts` for the privacy rationale.
- [x] **Materialized mission snapshots** — `mana_ai.mission_snapshots` table with per-tick incremental refresh (`db/snapshot-refresh.ts`). `listDueMissions` is now a single indexed SELECT; the prior O(N changes) LWW replay stays only in `mergeAndFilter` for tests. Idempotent `migrate()` on boot creates the schema.
- [x] **Prometheus metrics** on `/metrics` — process defaults with
`mana_ai_` prefix + counters (`mana_ai_ticks_total`,
`mana_ai_plans_produced_total`, `mana_ai_plans_written_back_total`,
`mana_ai_parse_failures_total`, `mana_ai_mission_errors_total`,
`mana_ai_snapshots_*`) and histograms (`mana_ai_tick_duration_seconds`,
`mana_ai_planner_request_duration_seconds`,
`mana_ai_http_request_duration_seconds`). Scraped 30s by
`docker/prometheus/prometheus.yml`'s `mana-ai` job. `/health` is
also blackbox-probed and surfaces on **status.mana.how** under
"Internal" as "Mana AI Runner".
All roadmap items shipped. Future polish (not blockers):
- Multi-instance deploy with advisory locks on snapshot refresh (today single-process)
- Read-only `/internal/missions/:userId` endpoint for ops inspection
- ~~Metrics endpoint for Prometheus (tick latency, plans/hour, parse-failure rate)~~ — shipped in this commit (see the Prometheus metrics item above)
## Port: 3066

View file

@ -12,7 +12,8 @@
"@mana/shared-ai": "workspace:*",
"@mana/shared-hono": "workspace:*",
"hono": "^4.7.0",
"postgres": "^3.4.5"
"postgres": "^3.4.5",
"prom-client": "^15.1.3"
},
"devDependencies": {
"typescript": "^5.9.3",

View file

@ -27,6 +27,17 @@ import { appendServerIteration, planToIteration } from '../db/iteration-writer';
import { refreshSnapshots } from '../db/snapshot-refresh';
import { PlannerClient } from '../planner/client';
import { AI_AVAILABLE_TOOLS, AI_AVAILABLE_TOOL_NAMES } from '../planner/tools';
import {
ticksTotal,
tickDuration,
plansProducedTotal,
plansWrittenBackTotal,
parseFailuresTotal,
missionErrorsTotal,
snapshotsNewTotal,
snapshotsUpdatedTotal,
snapshotRowsAppliedTotal,
} from '../metrics';
import type { Config } from '../config';
export interface TickStats {
@ -53,6 +64,8 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
};
}
running = true;
ticksTotal.inc();
const tickEndTimer = tickDuration.startTimer();
const errors: string[] = [];
let dueMissionCount = 0;
let plansProduced = 0;
@ -65,6 +78,9 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
// Bring the snapshot table up to date before querying it —
// cheap incremental pass, O(new changes since last tick).
const refresh = await refreshSnapshots(sql);
snapshotsNewTotal.inc(refresh.newSnapshots);
snapshotsUpdatedTotal.inc(refresh.updatedSnapshots);
snapshotRowsAppliedTotal.inc(refresh.rowsApplied);
if (refresh.rowsApplied > 0) {
console.log(
`[mana-ai tick] snapshot refresh: ${refresh.rowsApplied} rows → ${refresh.newSnapshots} new + ${refresh.updatedSnapshots} updated`
@ -90,9 +106,11 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
const plan = await planOneMission(m, planner, sql);
if (plan === null) {
parseFailures++;
parseFailuresTotal.inc();
continue;
}
plansProduced++;
plansProducedTotal.inc();
const nowIso = new Date().toISOString();
const iterationId = crypto.randomUUID();
@ -107,6 +125,7 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
nowIso,
});
plansWrittenBack++;
plansWrittenBackTotal.inc();
console.log(
`[mana-ai tick] mission=${m.id} user=${m.userId} plan=${plan.steps.length}step(s) iteration=${iterationId}`
@ -114,6 +133,7 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
errors.push(`mission=${m.id}: ${msg}`);
missionErrorsTotal.inc();
console.error(`[mana-ai tick] mission=${m.id} run failed:`, msg);
}
}
@ -123,6 +143,7 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
console.error('[mana-ai tick] scan error:', msg);
} finally {
running = false;
tickEndTimer();
}
return { scannedAt, dueMissionCount, plansProduced, plansWrittenBack, parseFailures, errors };

View file

@ -16,6 +16,7 @@ import { closeSql, getSql } from './db/connection';
import { migrate } from './db/migrate';
import { runTickOnce, startTick, stopTick, isTickRunning } from './cron/tick';
import { serviceAuth } from './middleware/service-auth';
import { register, httpRequestsTotal, httpRequestDuration } from './metrics';
const config = loadConfig();
@ -25,15 +26,33 @@ await migrate(getSql(config.syncDatabaseUrl));
const app = new Hono();
// HTTP instrumentation — labels every request by method/path/status and
// records wall-clock latency; both surface on /metrics.
app.use('*', async (c, next) => {
  const start = Date.now();
  let failed = false;
  try {
    await next();
  } catch (err) {
    // Without this, a throwing handler skips the metric lines below and
    // error traffic is invisible on /metrics. We label it 500 here.
    // NOTE(review): if an app-level onError maps errors to non-500
    // statuses, those requests are still labelled 500 — confirm acceptable.
    failed = true;
    throw err;
  } finally {
    const duration = (Date.now() - start) / 1000;
    // Prefer the matched route pattern over the raw path to keep label
    // cardinality bounded.
    const path = c.req.routePath || c.req.path;
    const labels = { method: c.req.method, path, status: failed ? 500 : c.res.status };
    httpRequestsTotal.inc(labels);
    httpRequestDuration.observe(labels, duration);
  }
});
app.get('/health', (c) =>
c.json({
ok: true,
service: 'mana-ai',
version: '0.1.0',
version: '0.4.0',
tick: { enabled: config.tickEnabled, running: isTickRunning() },
})
);
// Prometheus text-format exposition. The `mana-ai` job in
// docker/prometheus/prometheus.yml scrapes this every 30s.
app.get('/metrics', async (c) => {
  const body = await register.metrics();
  c.header('Content-Type', register.contentType);
  return c.text(body);
});
// Service-to-service: manually fire a tick for CI / ops / debugging
// without waiting for the interval.
app.use('/internal/*', serviceAuth(config.serviceKey));

View file

@ -0,0 +1,100 @@
/**
 * Prometheus metrics exported on GET /metrics.
 *
 * Mirrors the mana-media layout (process-default metrics behind a service
 * prefix, plus a small set of service-specific counters and histograms) so
 * the existing Grafana dashboards and the status.mana.how generator pick
 * this service up without special-casing.
 *
 * All metric names carry the `mana_ai_` prefix; the underscore separator
 * stays inside Prometheus's metric-name regex `[a-zA-Z_:][a-zA-Z0-9_:]*`.
 */
import { Counter, Histogram, Registry, collectDefaultMetrics } from 'prom-client';

export const register = new Registry();
register.setDefaultLabels({ service: 'mana-ai' });
collectDefaultMetrics({ register, prefix: 'mana_ai_' });

// Local factories: every metric below lands on the shared registry, so
// `registers: [register]` is spelled once instead of per declaration.
const counter = (name: string, help: string) =>
  new Counter({ name, help, registers: [register] });
const histogram = (name: string, help: string, buckets: number[]) =>
  new Histogram({ name, help, buckets, registers: [register] });

// ── HTTP surface ──────────────────────────────────────────
// Labelled by the Hono middleware; declared explicitly (not via the
// factories) so the label-name tuple stays in the exported types.
export const httpRequestsTotal = new Counter({
  name: 'mana_ai_http_requests_total',
  help: 'Total HTTP requests received.',
  labelNames: ['method', 'path', 'status'] as const,
  registers: [register],
});

export const httpRequestDuration = new Histogram({
  name: 'mana_ai_http_request_duration_seconds',
  help: 'Latency per HTTP request.',
  labelNames: ['method', 'path', 'status'] as const,
  buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  registers: [register],
});

// ── Mission runner — service-specific ─────────────────────
export const ticksTotal = counter(
  'mana_ai_ticks_total',
  'Total tick loop runs (all completions, including empty ones).'
);

export const tickDuration = histogram(
  'mana_ai_tick_duration_seconds',
  'Wall-clock time spent inside one tick pass.',
  [0.1, 0.5, 1, 2, 5, 10, 30, 60, 120]
);

export const plansProducedTotal = counter(
  'mana_ai_plans_produced_total',
  'Total plans the Planner returned parseable output for.'
);

export const plansWrittenBackTotal = counter(
  'mana_ai_plans_written_back_total',
  'Total plans persisted as server iterations on aiMissions.'
);

export const parseFailuresTotal = counter(
  'mana_ai_parse_failures_total',
  'Planner responses that failed JSON / shape validation.'
);

export const missionErrorsTotal = counter(
  'mana_ai_mission_errors_total',
  'Errors thrown while processing a single mission within a tick.'
);

export const plannerLatency = histogram(
  'mana_ai_planner_request_duration_seconds',
  'Latency of calls to the mana-llm backend.',
  [0.25, 0.5, 1, 2, 5, 10, 30, 60]
);

// ── Snapshot refresh ──────────────────────────────────────
export const snapshotsNewTotal = counter(
  'mana_ai_snapshots_new_total',
  'Mission-snapshot rows created on first sighting.'
);

export const snapshotsUpdatedTotal = counter(
  'mana_ai_snapshots_updated_total',
  'Mission-snapshot rows updated with a delta.'
);

export const snapshotRowsAppliedTotal = counter(
  'mana_ai_snapshot_rows_applied_total',
  'Sync-changes rows folded into the snapshot cache.'
);

View file

@ -7,6 +7,8 @@
* webapp as source of truth for now while the service matures.
*/
import { plannerLatency } from '../metrics';
export interface PlannerMessages {
system: string;
user: string;
@ -26,6 +28,18 @@ export class PlannerClient {
async complete(
  messages: PlannerMessages,
  opts: { model?: string; temperature?: number } = {}
): Promise<PlannerResult> {
  // Time the full planner round-trip (success or failure) so tail latency
  // lands in mana_ai_planner_request_duration_seconds, independent of
  // overall tick duration.
  const stop = plannerLatency.startTimer();
  return this.doComplete(messages, opts).finally(() => {
    stop();
  });
}
private async doComplete(
messages: PlannerMessages,
opts: { model?: string; temperature?: number }
): Promise<PlannerResult> {
const res = await fetch(`${this.baseUrl}/v1/chat/completions`, {
method: 'POST',