feat(mana-ai): Prometheus /metrics endpoint + status.mana.how integration

Wires mana-ai into the existing observability stack so tick throughput,
plan-failure rates, planner latencies, and snapshot refresh health are
visible in Grafana + Prometheus, and the service's uptime surfaces on
status.mana.how under the "Internal" section.

- `src/metrics.ts` — prom-client Registry with `mana_ai_` prefix.
  Counters: ticks_total, plans_produced_total, plans_written_back_total,
  parse_failures_total, mission_errors_total, snapshots_new/updated,
  snapshot_rows_applied_total, http_requests_total.
  Histograms: tick_duration_seconds (0.1–120s), planner_request_
  duration_seconds (0.25–60s), http_request_duration_seconds (0.005–10s).
- `src/index.ts` — HTTP middleware labels every request by
  method/path/status; `/metrics` serves the Prometheus text format.
- `src/cron/tick.ts` — increments counters + wraps the tick with
  `tickDuration.startTimer()`. Snapshot stats fold through.
- `src/planner/client.ts` — wraps `complete()` in a latency histogram
  timer so planner tail latency shows up separately from tick duration.
- `docker/prometheus/prometheus.yml` —
  1. New `mana-ai` scrape job against `mana-ai:3066/metrics` (30s).
  2. `/health` added to the `blackbox-internal` job so uptime shows on
     status.mana.how alongside mana-geocoding.
- `scripts/generate-status-page.sh` — friendly label for the new probe:
  `mana-ai:3066/health` → "Mana AI Runner" (generator already iterates
  `blackbox-internal`, no other changes needed).
- `package.json` — prom-client ^15.1.3

All 17 Bun tests still pass; tsc clean.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-15 01:41:40 +02:00
parent 767b64cdd4
commit 0bf01f434e
9 changed files with 184 additions and 3 deletions

View file

@ -123,6 +123,15 @@ scrape_configs:
metrics_path: '/metrics'
scrape_interval: 30s
# Mana AI Service (Bun) — background Mission Runner for the AI Workbench.
# Exposes tick stats, planner-request latencies, snapshot refresh
# counters, and standard HTTP metrics at /metrics.
- job_name: 'mana-ai'
static_configs:
- targets: ['mana-ai:3066']
metrics_path: '/metrics'
scrape_interval: 30s
# ============================================
# GPU Server (Windows PC, LAN: 192.168.178.11)
# ============================================
@ -297,6 +306,8 @@ scrape_configs:
# Upstream Pelias health, proxied through the wrapper so the
# blackbox-exporter doesn't need host.docker.internal access.
- http://mana-geocoding:3018/health/pelias
# mana-ai (Mission Runner) — internal-only, no CF tunnel.
- http://mana-ai:3066/health
relabel_configs:
- source_labels: [__address__]
target_label: __param_target

3
pnpm-lock.yaml generated
View file

@ -3297,6 +3297,9 @@ importers:
postgres:
specifier: ^3.4.5
version: 3.4.9
prom-client:
specifier: ^15.1.3
version: 15.1.3
devDependencies:
'@types/bun':
specifier: latest

View file

@ -75,6 +75,9 @@ friendly_name() {
mana-geocoding:*)
name="Mana Geocoding"
;;
mana-ai:*)
name="Mana AI Runner"
;;
mana.how/*)
name="${name#mana.how/}"
;;

View file

@ -19,11 +19,20 @@ What works end-to-end:
- [x] Webapp staging effect (`server-iteration-staging.ts`) picks up the synced iteration and translates each PlanStep into a local Proposal with full AI-actor attribution (missionId + iterationId + rationale). Idempotent via durable `proposalId` markers.
- [x] **Server-side input resolvers** for plaintext tables — `db/resolvers/` with a pluggable registry + single-record LWW replay (`record-replay.ts`). `goals` resolver ships by default. Encrypted tables (notes, kontext, tasks, events, journal, …) are intentionally **not** resolved server-side; those missions depend on the foreground runner which decrypts client-side. See `resolvers/types.ts` for the privacy rationale.
- [x] **Materialized mission snapshots** — `mana_ai.mission_snapshots` table with per-tick incremental refresh (`db/snapshot-refresh.ts`). `listDueMissions` is now a single indexed SELECT; the prior O(N changes) LWW replay stays only in `mergeAndFilter` for tests. Idempotent `migrate()` on boot creates the schema.
- [x] **Prometheus metrics** on `/metrics` — process defaults with
`mana_ai_` prefix + counters (`mana_ai_ticks_total`,
`mana_ai_plans_produced_total`, `mana_ai_plans_written_back_total`,
`mana_ai_parse_failures_total`, `mana_ai_mission_errors_total`,
`mana_ai_snapshots_*`) and histograms (`mana_ai_tick_duration_seconds`,
`mana_ai_planner_request_duration_seconds`,
`mana_ai_http_request_duration_seconds`). Scraped 30s by
`docker/prometheus/prometheus.yml`'s `mana-ai` job. `/health` is
also blackbox-probed and surfaces on **status.mana.how** under
"Internal" as "Mana AI Runner".
All roadmap items shipped. Future polish (not blockers):
- Multi-instance deploy with advisory locks on snapshot refresh (today single-process)
- Read-only `/internal/missions/:userId` endpoint for ops inspection
- ~~Metrics endpoint for Prometheus (tick latency, plans/hour, parse-failure rate)~~ — shipped in this commit (see the Prometheus metrics item above)
## Port: 3066

View file

@ -12,7 +12,8 @@
"@mana/shared-ai": "workspace:*",
"@mana/shared-hono": "workspace:*",
"hono": "^4.7.0",
"postgres": "^3.4.5"
"postgres": "^3.4.5",
"prom-client": "^15.1.3"
},
"devDependencies": {
"typescript": "^5.9.3",

View file

@ -27,6 +27,17 @@ import { appendServerIteration, planToIteration } from '../db/iteration-writer';
import { refreshSnapshots } from '../db/snapshot-refresh';
import { PlannerClient } from '../planner/client';
import { AI_AVAILABLE_TOOLS, AI_AVAILABLE_TOOL_NAMES } from '../planner/tools';
import {
ticksTotal,
tickDuration,
plansProducedTotal,
plansWrittenBackTotal,
parseFailuresTotal,
missionErrorsTotal,
snapshotsNewTotal,
snapshotsUpdatedTotal,
snapshotRowsAppliedTotal,
} from '../metrics';
import type { Config } from '../config';
export interface TickStats {
@ -53,6 +64,8 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
};
}
running = true;
ticksTotal.inc();
const tickEndTimer = tickDuration.startTimer();
const errors: string[] = [];
let dueMissionCount = 0;
let plansProduced = 0;
@ -65,6 +78,9 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
// Bring the snapshot table up to date before querying it —
// cheap incremental pass, O(new changes since last tick).
const refresh = await refreshSnapshots(sql);
snapshotsNewTotal.inc(refresh.newSnapshots);
snapshotsUpdatedTotal.inc(refresh.updatedSnapshots);
snapshotRowsAppliedTotal.inc(refresh.rowsApplied);
if (refresh.rowsApplied > 0) {
console.log(
`[mana-ai tick] snapshot refresh: ${refresh.rowsApplied} rows → ${refresh.newSnapshots} new + ${refresh.updatedSnapshots} updated`
@ -90,9 +106,11 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
const plan = await planOneMission(m, planner, sql);
if (plan === null) {
parseFailures++;
parseFailuresTotal.inc();
continue;
}
plansProduced++;
plansProducedTotal.inc();
const nowIso = new Date().toISOString();
const iterationId = crypto.randomUUID();
@ -107,6 +125,7 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
nowIso,
});
plansWrittenBack++;
plansWrittenBackTotal.inc();
console.log(
`[mana-ai tick] mission=${m.id} user=${m.userId} plan=${plan.steps.length}step(s) iteration=${iterationId}`
@ -114,6 +133,7 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
errors.push(`mission=${m.id}: ${msg}`);
missionErrorsTotal.inc();
console.error(`[mana-ai tick] mission=${m.id} run failed:`, msg);
}
}
@ -123,6 +143,7 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
console.error('[mana-ai tick] scan error:', msg);
} finally {
running = false;
tickEndTimer();
}
return { scannedAt, dueMissionCount, plansProduced, plansWrittenBack, parseFailures, errors };

View file

@ -16,6 +16,7 @@ import { closeSql, getSql } from './db/connection';
import { migrate } from './db/migrate';
import { runTickOnce, startTick, stopTick, isTickRunning } from './cron/tick';
import { serviceAuth } from './middleware/service-auth';
import { register, httpRequestsTotal, httpRequestDuration } from './metrics';
const config = loadConfig();
@ -25,15 +26,33 @@ await migrate(getSql(config.syncDatabaseUrl));
const app = new Hono();
// HTTP instrumentation — labels every request by method/path/status and
// records wall-clock latency; both surface on /metrics.
app.use('*', async (c, next) => {
  const start = Date.now();
  let failed = false;
  try {
    await next();
  } catch (err) {
    // Without this, a throwing handler skips the metric lines below and
    // error traffic is invisible on /metrics. We label it 500 here.
    // NOTE(review): if an app-level onError maps errors to non-500
    // statuses, those requests are still labelled 500 — confirm acceptable.
    failed = true;
    throw err;
  } finally {
    const duration = (Date.now() - start) / 1000;
    // Prefer the matched route pattern over the raw path to keep label
    // cardinality bounded.
    const path = c.req.routePath || c.req.path;
    const labels = { method: c.req.method, path, status: failed ? 500 : c.res.status };
    httpRequestsTotal.inc(labels);
    httpRequestDuration.observe(labels, duration);
  }
});
app.get('/health', (c) =>
c.json({
ok: true,
service: 'mana-ai',
version: '0.1.0',
version: '0.4.0',
tick: { enabled: config.tickEnabled, running: isTickRunning() },
})
);
// Prometheus text-format exposition. The `mana-ai` job in
// docker/prometheus/prometheus.yml scrapes this every 30s.
app.get('/metrics', async (c) => {
  const body = await register.metrics();
  c.header('Content-Type', register.contentType);
  return c.text(body);
});
// Service-to-service: manually fire a tick for CI / ops / debugging
// without waiting for the interval.
app.use('/internal/*', serviceAuth(config.serviceKey));

View file

@ -0,0 +1,100 @@
/**
 * Prometheus metrics exported on GET /metrics.
 *
 * Mirrors the mana-media layout (process-default metrics behind a service
 * prefix, plus a small set of service-specific counters and histograms) so
 * the existing Grafana dashboards and the status.mana.how generator pick
 * this service up without special-casing.
 *
 * All metric names carry the `mana_ai_` prefix; the underscore separator
 * stays inside Prometheus's metric-name regex `[a-zA-Z_:][a-zA-Z0-9_:]*`.
 */
import { Counter, Histogram, Registry, collectDefaultMetrics } from 'prom-client';

export const register = new Registry();
register.setDefaultLabels({ service: 'mana-ai' });
collectDefaultMetrics({ register, prefix: 'mana_ai_' });

// Local factories: every metric below lands on the shared registry, so
// `registers: [register]` is spelled once instead of per declaration.
const counter = (name: string, help: string) =>
  new Counter({ name, help, registers: [register] });
const histogram = (name: string, help: string, buckets: number[]) =>
  new Histogram({ name, help, buckets, registers: [register] });

// ── HTTP surface ──────────────────────────────────────────
// Labelled by the Hono middleware; declared explicitly (not via the
// factories) so the label-name tuple stays in the exported types.
export const httpRequestsTotal = new Counter({
  name: 'mana_ai_http_requests_total',
  help: 'Total HTTP requests received.',
  labelNames: ['method', 'path', 'status'] as const,
  registers: [register],
});

export const httpRequestDuration = new Histogram({
  name: 'mana_ai_http_request_duration_seconds',
  help: 'Latency per HTTP request.',
  labelNames: ['method', 'path', 'status'] as const,
  buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  registers: [register],
});

// ── Mission runner — service-specific ─────────────────────
export const ticksTotal = counter(
  'mana_ai_ticks_total',
  'Total tick loop runs (all completions, including empty ones).'
);

export const tickDuration = histogram(
  'mana_ai_tick_duration_seconds',
  'Wall-clock time spent inside one tick pass.',
  [0.1, 0.5, 1, 2, 5, 10, 30, 60, 120]
);

export const plansProducedTotal = counter(
  'mana_ai_plans_produced_total',
  'Total plans the Planner returned parseable output for.'
);

export const plansWrittenBackTotal = counter(
  'mana_ai_plans_written_back_total',
  'Total plans persisted as server iterations on aiMissions.'
);

export const parseFailuresTotal = counter(
  'mana_ai_parse_failures_total',
  'Planner responses that failed JSON / shape validation.'
);

export const missionErrorsTotal = counter(
  'mana_ai_mission_errors_total',
  'Errors thrown while processing a single mission within a tick.'
);

export const plannerLatency = histogram(
  'mana_ai_planner_request_duration_seconds',
  'Latency of calls to the mana-llm backend.',
  [0.25, 0.5, 1, 2, 5, 10, 30, 60]
);

// ── Snapshot refresh ──────────────────────────────────────
export const snapshotsNewTotal = counter(
  'mana_ai_snapshots_new_total',
  'Mission-snapshot rows created on first sighting.'
);

export const snapshotsUpdatedTotal = counter(
  'mana_ai_snapshots_updated_total',
  'Mission-snapshot rows updated with a delta.'
);

export const snapshotRowsAppliedTotal = counter(
  'mana_ai_snapshot_rows_applied_total',
  'Sync-changes rows folded into the snapshot cache.'
);

View file

@ -7,6 +7,8 @@
* webapp as source of truth for now while the service matures.
*/
import { plannerLatency } from '../metrics';
export interface PlannerMessages {
system: string;
user: string;
@ -26,6 +28,18 @@ export class PlannerClient {
async complete(
  messages: PlannerMessages,
  opts: { model?: string; temperature?: number } = {}
): Promise<PlannerResult> {
  // Time the full planner round-trip (success or failure) so tail latency
  // lands in mana_ai_planner_request_duration_seconds, independent of
  // overall tick duration.
  const stop = plannerLatency.startTimer();
  return this.doComplete(messages, opts).finally(() => {
    stop();
  });
}
private async doComplete(
messages: PlannerMessages,
opts: { model?: string; temperature?: number }
): Promise<PlannerResult> {
const res = await fetch(`${this.baseUrl}/v1/chat/completions`, {
method: 'POST',