mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 20:01:09 +02:00
feat(mana-ai): Prometheus /metrics endpoint + status.mana.how integration
Wires mana-ai into the existing observability stack so tick throughput,
plan-failure rates, planner latencies, and snapshot refresh health are
visible in Grafana + Prometheus, and the service's uptime surfaces on
status.mana.how under the "Internal" section.
- `src/metrics.ts` — prom-client Registry with `mana_ai_` prefix.
Counters: ticks_total, plans_produced_total, plans_written_back_total,
parse_failures_total, mission_errors_total, snapshots_new_total,
snapshots_updated_total, snapshot_rows_applied_total, http_requests_total.
Histograms: tick_duration_seconds (0.1–120s),
planner_request_duration_seconds (0.25–60s),
http_request_duration_seconds (0.005–10s).
- `src/index.ts` — HTTP middleware labels every request by
method/path/status; `/metrics` serves the Prometheus text format.
- `src/cron/tick.ts` — increments counters + wraps the tick with
`tickDuration.startTimer()`. Snapshot stats fold through.
- `src/planner/client.ts` — wraps `complete()` in a latency histogram
timer so planner tail latency shows up separately from tick duration.
- `docker/prometheus/prometheus.yml` —
1. New `mana-ai` scrape job against `mana-ai:3066/metrics` (30s scrape interval).
2. `http://mana-ai:3066/health` added to the `blackbox-internal` job so uptime
shows on status.mana.how alongside mana-geocoding.
- `scripts/generate-status-page.sh` — friendly label for the new probe:
`mana-ai:3066/health` → "Mana AI Runner" (generator already iterates
`blackbox-internal`, no other changes needed).
- `package.json` — prom-client ^15.1.3
All 17 Bun tests still pass; tsc clean.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
767b64cdd4
commit
0bf01f434e
9 changed files with 184 additions and 3 deletions
|
|
@ -123,6 +123,15 @@ scrape_configs:
|
|||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
# Mana AI Service (Bun) — background Mission Runner for the AI Workbench.
|
||||
# Exposes tick stats, planner-request latencies, snapshot refresh
|
||||
# counters, and standard HTTP metrics at /metrics.
|
||||
- job_name: 'mana-ai'
|
||||
static_configs:
|
||||
- targets: ['mana-ai:3066']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
# ============================================
|
||||
# GPU Server (Windows PC, LAN: 192.168.178.11)
|
||||
# ============================================
|
||||
|
|
@ -297,6 +306,8 @@ scrape_configs:
|
|||
# Upstream Pelias health, proxied through the wrapper so the
|
||||
# blackbox-exporter doesn't need host.docker.internal access.
|
||||
- http://mana-geocoding:3018/health/pelias
|
||||
# mana-ai (Mission Runner) — internal-only, no CF tunnel.
|
||||
- http://mana-ai:3066/health
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
|
|
|
|||
3
pnpm-lock.yaml
generated
3
pnpm-lock.yaml
generated
|
|
@ -3297,6 +3297,9 @@ importers:
|
|||
postgres:
|
||||
specifier: ^3.4.5
|
||||
version: 3.4.9
|
||||
prom-client:
|
||||
specifier: ^15.1.3
|
||||
version: 15.1.3
|
||||
devDependencies:
|
||||
'@types/bun':
|
||||
specifier: latest
|
||||
|
|
|
|||
|
|
@ -75,6 +75,9 @@ friendly_name() {
|
|||
mana-geocoding:*)
|
||||
name="Mana Geocoding"
|
||||
;;
|
||||
mana-ai:*)
|
||||
name="Mana AI Runner"
|
||||
;;
|
||||
mana.how/*)
|
||||
name="${name#mana.how/}"
|
||||
;;
|
||||
|
|
|
|||
|
|
@ -19,11 +19,20 @@ What works end-to-end:
|
|||
- [x] Webapp staging effect (`server-iteration-staging.ts`) picks up the synced iteration and translates each PlanStep into a local Proposal with full AI-actor attribution (missionId + iterationId + rationale). Idempotent via durable `proposalId` markers.
|
||||
- [x] **Server-side input resolvers** for plaintext tables — `db/resolvers/` with a pluggable registry + single-record LWW replay (`record-replay.ts`). `goals` resolver ships by default. Encrypted tables (notes, kontext, tasks, events, journal, …) are intentionally **not** resolved server-side; those missions depend on the foreground runner which decrypts client-side. See `resolvers/types.ts` for the privacy rationale.
|
||||
- [x] **Materialized mission snapshots** — `mana_ai.mission_snapshots` table with per-tick incremental refresh (`db/snapshot-refresh.ts`). `listDueMissions` is now a single indexed SELECT; the prior O(N changes) LWW replay stays only in `mergeAndFilter` for tests. Idempotent `migrate()` on boot creates the schema.
|
||||
- [x] **Prometheus metrics** on `/metrics` — process defaults with
|
||||
`mana_ai_` prefix + counters (`mana_ai_ticks_total`,
|
||||
`mana_ai_plans_produced_total`, `mana_ai_plans_written_back_total`,
|
||||
`mana_ai_parse_failures_total`, `mana_ai_mission_errors_total`,
|
||||
`mana_ai_snapshot*_total`) and histograms (`mana_ai_tick_duration_seconds`,
|
||||
`mana_ai_planner_request_duration_seconds`,
|
||||
`mana_ai_http_request_duration_seconds`). Scraped every 30s by
|
||||
`docker/prometheus/prometheus.yml`'s `mana-ai` job. `/health` is
|
||||
also blackbox-probed and surfaces on **status.mana.how** under
|
||||
"Internal" as "Mana AI Runner".
|
||||
|
||||
All roadmap items shipped. Future polish (not blockers):
|
||||
- Multi-instance deploy with advisory locks on snapshot refresh (today single-process)
|
||||
- Read-only `/internal/missions/:userId` endpoint for ops inspection
|
||||
- Metrics endpoint for Prometheus (tick latency, plans/hour, parse-failure rate)
|
||||
|
||||
## Port: 3066
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,8 @@
|
|||
"@mana/shared-ai": "workspace:*",
|
||||
"@mana/shared-hono": "workspace:*",
|
||||
"hono": "^4.7.0",
|
||||
"postgres": "^3.4.5"
|
||||
"postgres": "^3.4.5",
|
||||
"prom-client": "^15.1.3"
|
||||
},
|
||||
"devDependencies": {
|
||||
"typescript": "^5.9.3",
|
||||
|
|
|
|||
|
|
@ -27,6 +27,17 @@ import { appendServerIteration, planToIteration } from '../db/iteration-writer';
|
|||
import { refreshSnapshots } from '../db/snapshot-refresh';
|
||||
import { PlannerClient } from '../planner/client';
|
||||
import { AI_AVAILABLE_TOOLS, AI_AVAILABLE_TOOL_NAMES } from '../planner/tools';
|
||||
import {
|
||||
ticksTotal,
|
||||
tickDuration,
|
||||
plansProducedTotal,
|
||||
plansWrittenBackTotal,
|
||||
parseFailuresTotal,
|
||||
missionErrorsTotal,
|
||||
snapshotsNewTotal,
|
||||
snapshotsUpdatedTotal,
|
||||
snapshotRowsAppliedTotal,
|
||||
} from '../metrics';
|
||||
import type { Config } from '../config';
|
||||
|
||||
export interface TickStats {
|
||||
|
|
@ -53,6 +64,8 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
|
|||
};
|
||||
}
|
||||
running = true;
|
||||
ticksTotal.inc();
|
||||
const tickEndTimer = tickDuration.startTimer();
|
||||
const errors: string[] = [];
|
||||
let dueMissionCount = 0;
|
||||
let plansProduced = 0;
|
||||
|
|
@ -65,6 +78,9 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
|
|||
// Bring the snapshot table up to date before querying it —
|
||||
// cheap incremental pass, O(new changes since last tick).
|
||||
const refresh = await refreshSnapshots(sql);
|
||||
snapshotsNewTotal.inc(refresh.newSnapshots);
|
||||
snapshotsUpdatedTotal.inc(refresh.updatedSnapshots);
|
||||
snapshotRowsAppliedTotal.inc(refresh.rowsApplied);
|
||||
if (refresh.rowsApplied > 0) {
|
||||
console.log(
|
||||
`[mana-ai tick] snapshot refresh: ${refresh.rowsApplied} rows → ${refresh.newSnapshots} new + ${refresh.updatedSnapshots} updated`
|
||||
|
|
@ -90,9 +106,11 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
|
|||
const plan = await planOneMission(m, planner, sql);
|
||||
if (plan === null) {
|
||||
parseFailures++;
|
||||
parseFailuresTotal.inc();
|
||||
continue;
|
||||
}
|
||||
plansProduced++;
|
||||
plansProducedTotal.inc();
|
||||
|
||||
const nowIso = new Date().toISOString();
|
||||
const iterationId = crypto.randomUUID();
|
||||
|
|
@ -107,6 +125,7 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
|
|||
nowIso,
|
||||
});
|
||||
plansWrittenBack++;
|
||||
plansWrittenBackTotal.inc();
|
||||
|
||||
console.log(
|
||||
`[mana-ai tick] mission=${m.id} user=${m.userId} plan=${plan.steps.length}step(s) iteration=${iterationId}`
|
||||
|
|
@ -114,6 +133,7 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
|
|||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
errors.push(`mission=${m.id}: ${msg}`);
|
||||
missionErrorsTotal.inc();
|
||||
console.error(`[mana-ai tick] mission=${m.id} run failed:`, msg);
|
||||
}
|
||||
}
|
||||
|
|
@ -123,6 +143,7 @@ export async function runTickOnce(config: Config): Promise<TickStats> {
|
|||
console.error('[mana-ai tick] scan error:', msg);
|
||||
} finally {
|
||||
running = false;
|
||||
tickEndTimer();
|
||||
}
|
||||
|
||||
return { scannedAt, dueMissionCount, plansProduced, plansWrittenBack, parseFailures, errors };
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ import { closeSql, getSql } from './db/connection';
|
|||
import { migrate } from './db/migrate';
|
||||
import { runTickOnce, startTick, stopTick, isTickRunning } from './cron/tick';
|
||||
import { serviceAuth } from './middleware/service-auth';
|
||||
import { register, httpRequestsTotal, httpRequestDuration } from './metrics';
|
||||
|
||||
const config = loadConfig();
|
||||
|
||||
|
|
@ -25,15 +26,33 @@ await migrate(getSql(config.syncDatabaseUrl));
|
|||
|
||||
const app = new Hono();
|
||||
|
||||
// HTTP instrumentation — labels by method/path/status, surfaced on /metrics.
|
||||
app.use('*', async (c, next) => {
|
||||
const start = Date.now();
|
||||
await next();
|
||||
const duration = (Date.now() - start) / 1000;
|
||||
const path = c.req.routePath || c.req.path;
|
||||
const labels = { method: c.req.method, path, status: c.res.status };
|
||||
httpRequestsTotal.inc(labels);
|
||||
httpRequestDuration.observe(labels, duration);
|
||||
});
|
||||
|
||||
app.get('/health', (c) =>
|
||||
c.json({
|
||||
ok: true,
|
||||
service: 'mana-ai',
|
||||
version: '0.1.0',
|
||||
version: '0.4.0',
|
||||
tick: { enabled: config.tickEnabled, running: isTickRunning() },
|
||||
})
|
||||
);
|
||||
|
||||
// Prometheus scrape target. Scraped by docker/prometheus/prometheus.yml's
|
||||
// `mana-ai` job every 30s.
|
||||
app.get('/metrics', async (c) => {
|
||||
c.header('Content-Type', register.contentType);
|
||||
return c.text(await register.metrics());
|
||||
});
|
||||
|
||||
// Service-to-service: manually fire a tick for CI / ops / debugging
|
||||
// without waiting for the interval.
|
||||
app.use('/internal/*', serviceAuth(config.serviceKey));
|
||||
|
|
|
|||
100
services/mana-ai/src/metrics.ts
Normal file
100
services/mana-ai/src/metrics.ts
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
/**
|
||||
* Prometheus metrics — exported on GET /metrics.
|
||||
*
|
||||
* Follows the same shape as mana-media (default metrics with a service
|
||||
* prefix, plus a handful of service-specific counters + histograms) so
|
||||
* the existing Grafana dashboards and the status.mana.how generator
|
||||
* recognise this service without special-casing.
|
||||
*
|
||||
* Metric naming: `mana_ai_*`. Underscores (rather than hyphens or dots)
|
||||
* keep every name valid under Prometheus's metric-name regex `[a-zA-Z_:][a-zA-Z0-9_:]*`.
|
||||
*/
|
||||
|
||||
import { Counter, Histogram, Registry, collectDefaultMetrics } from 'prom-client';
|
||||
|
||||
export const register = new Registry();
|
||||
register.setDefaultLabels({ service: 'mana-ai' });
|
||||
collectDefaultMetrics({ register, prefix: 'mana_ai_' });
|
||||
|
||||
// ── HTTP surface ──────────────────────────────────────────
|
||||
|
||||
export const httpRequestsTotal = new Counter({
|
||||
name: 'mana_ai_http_requests_total',
|
||||
help: 'Total HTTP requests received.',
|
||||
labelNames: ['method', 'path', 'status'] as const,
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
export const httpRequestDuration = new Histogram({
|
||||
name: 'mana_ai_http_request_duration_seconds',
|
||||
help: 'Latency per HTTP request.',
|
||||
labelNames: ['method', 'path', 'status'] as const,
|
||||
buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
// ── Mission runner — service-specific ─────────────────────
|
||||
|
||||
export const ticksTotal = new Counter({
|
||||
name: 'mana_ai_ticks_total',
|
||||
help: 'Total tick loop runs (all completions, including empty ones).',
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
export const tickDuration = new Histogram({
|
||||
name: 'mana_ai_tick_duration_seconds',
|
||||
help: 'Wall-clock time spent inside one tick pass.',
|
||||
buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60, 120],
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
export const plansProducedTotal = new Counter({
|
||||
name: 'mana_ai_plans_produced_total',
|
||||
help: 'Total plans the Planner returned parseable output for.',
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
export const plansWrittenBackTotal = new Counter({
|
||||
name: 'mana_ai_plans_written_back_total',
|
||||
help: 'Total plans persisted as server iterations on aiMissions.',
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
export const parseFailuresTotal = new Counter({
|
||||
name: 'mana_ai_parse_failures_total',
|
||||
help: 'Planner responses that failed JSON / shape validation.',
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
export const missionErrorsTotal = new Counter({
|
||||
name: 'mana_ai_mission_errors_total',
|
||||
help: 'Errors thrown while processing a single mission within a tick.',
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
export const plannerLatency = new Histogram({
|
||||
name: 'mana_ai_planner_request_duration_seconds',
|
||||
help: 'Latency of calls to the mana-llm backend.',
|
||||
buckets: [0.25, 0.5, 1, 2, 5, 10, 30, 60],
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
// ── Snapshot refresh ──────────────────────────────────────
|
||||
|
||||
export const snapshotsNewTotal = new Counter({
|
||||
name: 'mana_ai_snapshots_new_total',
|
||||
help: 'Mission-snapshot rows created on first sighting.',
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
export const snapshotsUpdatedTotal = new Counter({
|
||||
name: 'mana_ai_snapshots_updated_total',
|
||||
help: 'Mission-snapshot rows updated with a delta.',
|
||||
registers: [register],
|
||||
});
|
||||
|
||||
export const snapshotRowsAppliedTotal = new Counter({
|
||||
name: 'mana_ai_snapshot_rows_applied_total',
|
||||
help: 'Sync-changes rows folded into the snapshot cache.',
|
||||
registers: [register],
|
||||
});
|
||||
|
|
@ -7,6 +7,8 @@
|
|||
* webapp as source of truth for now while the service matures.
|
||||
*/
|
||||
|
||||
import { plannerLatency } from '../metrics';
|
||||
|
||||
export interface PlannerMessages {
|
||||
system: string;
|
||||
user: string;
|
||||
|
|
@ -26,6 +28,18 @@ export class PlannerClient {
|
|||
async complete(
|
||||
messages: PlannerMessages,
|
||||
opts: { model?: string; temperature?: number } = {}
|
||||
): Promise<PlannerResult> {
|
||||
const endTimer = plannerLatency.startTimer();
|
||||
try {
|
||||
return await this.doComplete(messages, opts);
|
||||
} finally {
|
||||
endTimer();
|
||||
}
|
||||
}
|
||||
|
||||
private async doComplete(
|
||||
messages: PlannerMessages,
|
||||
opts: { model?: string; temperature?: number }
|
||||
): Promise<PlannerResult> {
|
||||
const res = await fetch(`${this.baseUrl}/v1/chat/completions`, {
|
||||
method: 'POST',
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue