diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index 190eba4e8..96f74e18d 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -123,6 +123,15 @@ scrape_configs: metrics_path: '/metrics' scrape_interval: 30s + # Mana AI Service (Bun) — background Mission Runner for the AI Workbench. + # Exposes tick stats, planner-request latencies, snapshot refresh + # counters, and standard HTTP metrics at /metrics. + - job_name: 'mana-ai' + static_configs: + - targets: ['mana-ai:3066'] + metrics_path: '/metrics' + scrape_interval: 30s + # ============================================ # GPU Server (Windows PC, LAN: 192.168.178.11) # ============================================ @@ -297,6 +306,8 @@ scrape_configs: # Upstream Pelias health, proxied through the wrapper so the # blackbox-exporter doesn't need host.docker.internal access. - http://mana-geocoding:3018/health/pelias + # mana-ai (Mission Runner) — internal-only, no CF tunnel. + - http://mana-ai:3066/health relabel_configs: - source_labels: [__address__] target_label: __param_target diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 02938a4bc..6e2470d04 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -3297,6 +3297,9 @@ importers: postgres: specifier: ^3.4.5 version: 3.4.9 + prom-client: + specifier: ^15.1.3 + version: 15.1.3 devDependencies: '@types/bun': specifier: latest diff --git a/scripts/generate-status-page.sh b/scripts/generate-status-page.sh index 65588cd5a..c83c9598b 100755 --- a/scripts/generate-status-page.sh +++ b/scripts/generate-status-page.sh @@ -75,6 +75,9 @@ friendly_name() { mana-geocoding:*) name="Mana Geocoding" ;; + mana-ai:*) + name="Mana AI Runner" + ;; mana.how/*) name="${name#mana.how/}" ;; diff --git a/services/mana-ai/CLAUDE.md b/services/mana-ai/CLAUDE.md index a30dd1f89..47a41ab97 100644 --- a/services/mana-ai/CLAUDE.md +++ b/services/mana-ai/CLAUDE.md @@ -19,11 +19,20 @@ What works end-to-end: - [x] Webapp staging effect 
(`server-iteration-staging.ts`) picks up the synced iteration and translates each PlanStep into a local Proposal with full AI-actor attribution (missionId + iterationId + rationale). Idempotent via durable `proposalId` markers. - [x] **Server-side input resolvers** for plaintext tables — `db/resolvers/` with a pluggable registry + single-record LWW replay (`record-replay.ts`). `goals` resolver ships by default. Encrypted tables (notes, kontext, tasks, events, journal, …) are intentionally **not** resolved server-side; those missions depend on the foreground runner which decrypts client-side. See `resolvers/types.ts` for the privacy rationale. - [x] **Materialized mission snapshots** — `mana_ai.mission_snapshots` table with per-tick incremental refresh (`db/snapshot-refresh.ts`). `listDueMissions` is now a single indexed SELECT; the prior O(N changes) LWW replay stays only in `mergeAndFilter` for tests. Idempotent `migrate()` on boot creates the schema. +- [x] **Prometheus metrics** on `/metrics` — process defaults with + `mana_ai_` prefix + counters (`mana_ai_ticks_total`, + `mana_ai_plans_produced_total`, `mana_ai_plans_written_back_total`, + `mana_ai_parse_failures_total`, `mana_ai_mission_errors_total`, + `mana_ai_snapshot*_total`) and histograms (`mana_ai_tick_duration_seconds`, + `mana_ai_planner_request_duration_seconds`, + `mana_ai_http_request_duration_seconds`). Scraped every 30s by + `docker/prometheus/prometheus.yml`'s `mana-ai` job. `/health` is + also blackbox-probed and surfaces on **status.mana.how** under + "Internal" as "Mana AI Runner". All roadmap items shipped. 
Future polish (not blockers): - Multi-instance deploy with advisory locks on snapshot refresh (today single-process) - Read-only `/internal/missions/:userId` endpoint for ops inspection -- Metrics endpoint for Prometheus (tick latency, plans/hour, parse-failure rate) ## Port: 3066 diff --git a/services/mana-ai/package.json b/services/mana-ai/package.json index d9fb264a9..2abdc4df5 100644 --- a/services/mana-ai/package.json +++ b/services/mana-ai/package.json @@ -12,7 +12,8 @@ "@mana/shared-ai": "workspace:*", "@mana/shared-hono": "workspace:*", "hono": "^4.7.0", - "postgres": "^3.4.5" + "postgres": "^3.4.5", + "prom-client": "^15.1.3" }, "devDependencies": { "typescript": "^5.9.3", diff --git a/services/mana-ai/src/cron/tick.ts b/services/mana-ai/src/cron/tick.ts index 99acb7d19..e1d0b8d67 100644 --- a/services/mana-ai/src/cron/tick.ts +++ b/services/mana-ai/src/cron/tick.ts @@ -27,6 +27,17 @@ import { appendServerIteration, planToIteration } from '../db/iteration-writer'; import { refreshSnapshots } from '../db/snapshot-refresh'; import { PlannerClient } from '../planner/client'; import { AI_AVAILABLE_TOOLS, AI_AVAILABLE_TOOL_NAMES } from '../planner/tools'; +import { + ticksTotal, + tickDuration, + plansProducedTotal, + plansWrittenBackTotal, + parseFailuresTotal, + missionErrorsTotal, + snapshotsNewTotal, + snapshotsUpdatedTotal, + snapshotRowsAppliedTotal, +} from '../metrics'; import type { Config } from '../config'; export interface TickStats { @@ -53,6 +64,8 @@ export async function runTickOnce(config: Config): Promise { }; } running = true; + ticksTotal.inc(); + const tickEndTimer = tickDuration.startTimer(); const errors: string[] = []; let dueMissionCount = 0; let plansProduced = 0; @@ -65,6 +78,9 @@ export async function runTickOnce(config: Config): Promise { // Bring the snapshot table up to date before querying it — // cheap incremental pass, O(new changes since last tick). 
const refresh = await refreshSnapshots(sql); + snapshotsNewTotal.inc(refresh.newSnapshots); + snapshotsUpdatedTotal.inc(refresh.updatedSnapshots); + snapshotRowsAppliedTotal.inc(refresh.rowsApplied); if (refresh.rowsApplied > 0) { console.log( `[mana-ai tick] snapshot refresh: ${refresh.rowsApplied} rows → ${refresh.newSnapshots} new + ${refresh.updatedSnapshots} updated` @@ -90,9 +106,11 @@ export async function runTickOnce(config: Config): Promise { const plan = await planOneMission(m, planner, sql); if (plan === null) { parseFailures++; + parseFailuresTotal.inc(); continue; } plansProduced++; + plansProducedTotal.inc(); const nowIso = new Date().toISOString(); const iterationId = crypto.randomUUID(); @@ -107,6 +125,7 @@ export async function runTickOnce(config: Config): Promise { nowIso, }); plansWrittenBack++; + plansWrittenBackTotal.inc(); console.log( `[mana-ai tick] mission=${m.id} user=${m.userId} plan=${plan.steps.length}step(s) iteration=${iterationId}` @@ -114,6 +133,7 @@ export async function runTickOnce(config: Config): Promise { } catch (err) { const msg = err instanceof Error ? 
err.message : String(err); errors.push(`mission=${m.id}: ${msg}`); + missionErrorsTotal.inc(); console.error(`[mana-ai tick] mission=${m.id} run failed:`, msg); } } @@ -123,6 +143,7 @@ export async function runTickOnce(config: Config): Promise { console.error('[mana-ai tick] scan error:', msg); } finally { running = false; + tickEndTimer(); } return { scannedAt, dueMissionCount, plansProduced, plansWrittenBack, parseFailures, errors }; diff --git a/services/mana-ai/src/index.ts b/services/mana-ai/src/index.ts index 87b426824..2c819e22a 100644 --- a/services/mana-ai/src/index.ts +++ b/services/mana-ai/src/index.ts @@ -16,6 +16,7 @@ import { closeSql, getSql } from './db/connection'; import { migrate } from './db/migrate'; import { runTickOnce, startTick, stopTick, isTickRunning } from './cron/tick'; import { serviceAuth } from './middleware/service-auth'; +import { register, httpRequestsTotal, httpRequestDuration } from './metrics'; const config = loadConfig(); @@ -25,15 +26,33 @@ await migrate(getSql(config.syncDatabaseUrl)); const app = new Hono(); +// HTTP instrumentation — labels by method/path/status, surfaced on /metrics. +app.use('*', async (c, next) => { + const start = Date.now(); + await next(); + const duration = (Date.now() - start) / 1000; + const path = c.req.routePath || c.req.path; + const labels = { method: c.req.method, path, status: c.res.status }; + httpRequestsTotal.inc(labels); + httpRequestDuration.observe(labels, duration); +}); + app.get('/health', (c) => c.json({ ok: true, service: 'mana-ai', - version: '0.1.0', + version: '0.4.0', tick: { enabled: config.tickEnabled, running: isTickRunning() }, }) ); +// Prometheus scrape target. Scraped by docker/prometheus/prometheus.yml's +// `mana-ai` job every 30s. 
+app.get('/metrics', async (c) => { + c.header('Content-Type', register.contentType); + return c.text(await register.metrics()); +}); + // Service-to-service: manually fire a tick for CI / ops / debugging // without waiting for the interval. app.use('/internal/*', serviceAuth(config.serviceKey)); diff --git a/services/mana-ai/src/metrics.ts b/services/mana-ai/src/metrics.ts new file mode 100644 index 000000000..498ae0907 --- /dev/null +++ b/services/mana-ai/src/metrics.ts @@ -0,0 +1,100 @@ +/** + * Prometheus metrics — exported on GET /metrics. + * + * Follows the same shape as mana-media (default metrics with a service + * prefix, plus a handful of service-specific counters + histograms) so + * the existing Grafana dashboards and the status.mana.how generator + * recognise this service without special-casing. + * + * Metric naming: `mana_ai_*`. Underscore separator keeps Prometheus's + * standard-compliant regex `[a-zA-Z_:][a-zA-Z0-9_:]*` happy. + */ + +import { Counter, Histogram, Registry, collectDefaultMetrics } from 'prom-client'; + +export const register = new Registry(); +register.setDefaultLabels({ service: 'mana-ai' }); +collectDefaultMetrics({ register, prefix: 'mana_ai_' }); + +// ── HTTP surface ────────────────────────────────────────── + +export const httpRequestsTotal = new Counter({ + name: 'mana_ai_http_requests_total', + help: 'Total HTTP requests received.', + labelNames: ['method', 'path', 'status'] as const, + registers: [register], +}); + +export const httpRequestDuration = new Histogram({ + name: 'mana_ai_http_request_duration_seconds', + help: 'Latency per HTTP request.', + labelNames: ['method', 'path', 'status'] as const, + buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10], + registers: [register], +}); + +// ── Mission runner — service-specific ───────────────────── + +export const ticksTotal = new Counter({ + name: 'mana_ai_ticks_total', + help: 'Total tick loop runs (all completions, including empty ones).', + 
registers: [register], +}); + +export const tickDuration = new Histogram({ + name: 'mana_ai_tick_duration_seconds', + help: 'Wall-clock time spent inside one tick pass.', + buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60, 120], + registers: [register], +}); + +export const plansProducedTotal = new Counter({ + name: 'mana_ai_plans_produced_total', + help: 'Total plans the Planner returned parseable output for.', + registers: [register], +}); + +export const plansWrittenBackTotal = new Counter({ + name: 'mana_ai_plans_written_back_total', + help: 'Total plans persisted as server iterations on aiMissions.', + registers: [register], +}); + +export const parseFailuresTotal = new Counter({ + name: 'mana_ai_parse_failures_total', + help: 'Planner responses that failed JSON / shape validation.', + registers: [register], +}); + +export const missionErrorsTotal = new Counter({ + name: 'mana_ai_mission_errors_total', + help: 'Errors thrown while processing a single mission within a tick.', + registers: [register], +}); + +export const plannerLatency = new Histogram({ + name: 'mana_ai_planner_request_duration_seconds', + help: 'Latency of calls to the mana-llm backend.', + buckets: [0.25, 0.5, 1, 2, 5, 10, 30, 60], + registers: [register], +}); + +// ── Snapshot refresh ────────────────────────────────────── + +export const snapshotsNewTotal = new Counter({ + name: 'mana_ai_snapshots_new_total', + help: 'Mission-snapshot rows created on first sighting.', + registers: [register], +}); + +export const snapshotsUpdatedTotal = new Counter({ + name: 'mana_ai_snapshots_updated_total', + help: 'Mission-snapshot rows updated with a delta.', + registers: [register], +}); + +export const snapshotRowsAppliedTotal = new Counter({ + name: 'mana_ai_snapshot_rows_applied_total', + help: 'Sync-changes rows folded into the snapshot cache.', + registers: [register], +}); diff --git a/services/mana-ai/src/planner/client.ts b/services/mana-ai/src/planner/client.ts index 2bea51033..8bc12c1a7 100644 --- 
a/services/mana-ai/src/planner/client.ts +++ b/services/mana-ai/src/planner/client.ts @@ -7,6 +7,8 @@ * webapp as source of truth for now while the service matures. */ +import { plannerLatency } from '../metrics'; + export interface PlannerMessages { system: string; user: string; @@ -26,6 +28,18 @@ export class PlannerClient { async complete( messages: PlannerMessages, opts: { model?: string; temperature?: number } = {} + ): Promise { + const endTimer = plannerLatency.startTimer(); + try { + return await this.doComplete(messages, opts); + } finally { + endTimer(); + } + } + + private async doComplete( + messages: PlannerMessages, + opts: { model?: string; temperature?: number } ): Promise { const res = await fetch(`${this.baseUrl}/v1/chat/completions`, { method: 'POST',