mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-15 01:41:08 +02:00
Wires mana-ai into the existing observability stack so tick throughput,
plan-failure rates, planner latencies, and snapshot refresh health are
visible in Grafana + Prometheus, and the service's uptime surfaces on
status.mana.how under the "Internal" section.
- `src/metrics.ts` — prom-client Registry with `mana_ai_` prefix.
Counters: ticks_total, plans_produced_total, plans_written_back_total,
parse_failures_total, mission_errors_total, snapshots_new/updated,
snapshot_rows_applied_total, http_requests_total.
Histograms: tick_duration_seconds (0.1–120s), planner_request_duration_seconds
(0.25–60s), http_request_duration_seconds (0.005–10s).
- `src/index.ts` — HTTP middleware labels every request by
method/path/status; `/metrics` serves the Prometheus text format.
- `src/cron/tick.ts` — increments counters + wraps the tick with
`tickDuration.startTimer()`. Snapshot stats fold through.
- `src/planner/client.ts` — wraps `complete()` in a latency histogram
timer so planner tail latency shows up separately from tick duration.
- `docker/prometheus/prometheus.yml` —
1. New `mana-ai` scrape job against `mana-ai:3066/metrics` (30s).
2. `http://mana-ai:3066/health` added to the `blackbox-internal` job's probe
targets so uptime shows on status.mana.how alongside mana-geocoding.
- `scripts/generate-status-page.sh` — friendly label for the new probe:
`mana-ai:3066/health` → "Mana AI Runner" (generator already iterates
`blackbox-internal`, no other changes needed).
- `package.json` — prom-client ^15.1.3
All 17 Bun tests still pass; tsc clean.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
382 lines
11 KiB
YAML
382 lines
11 KiB
YAML
# Mana Prometheus Configuration
|
|
# Scrapes metrics from all services
|
|
|
|
# Defaults that apply to every scrape job and rule group unless a job
# overrides them.
global:
  # How often the loaded rule files are evaluated.
  evaluation_interval: 15s
  # Default interval between scrapes of a target.
  scrape_interval: 15s
|
|
|
|
# Alerting rule definitions read by Prometheus.
rule_files:
- '/etc/prometheus/alerts.yml'
|
|
|
|
# Alertmanager instance that receives the alerts raised by the rules above.
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 'alertmanager:9093'
|
|
|
|
scrape_configs:
|
|
# Self-scrape: Prometheus collects its own metrics.
- job_name: 'prometheus'
  static_configs:
  - targets:
    - 'localhost:9090'
|
|
|
|
# node-exporter: metrics of the host machine itself.
- job_name: 'node'
  relabel_configs:
  # Replace the instance label (which would otherwise be the exporter's
  # address) with the fixed host name 'mac-mini'.
  - source_labels:
    - __address__
    target_label: instance
    replacement: 'mac-mini'
  static_configs:
  - targets:
    - 'node-exporter:9100'
|
|
|
|
# cAdvisor: per-container resource metrics for the Docker host.
- job_name: 'cadvisor'
  static_configs:
  - targets:
    - 'cadvisor:8080'
|
|
|
|
# PostgreSQL, scraped through postgres-exporter.
- job_name: 'postgres'
  static_configs:
  - targets:
    - 'postgres-exporter:9187'
|
|
|
|
# Redis, scraped through redis-exporter.
- job_name: 'redis'
  static_configs:
  - targets:
    - 'redis-exporter:9121'
|
|
|
|
# ============================================
|
|
# Core Services (Hono/Bun + Go)
|
|
# ============================================
|
|
|
|
# mana-auth: authentication service. Scraped every 30s instead of the
# global default interval.
- job_name: 'mana-auth'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-auth:3001'
|
|
|
|
# mana-credits: credits service. Scraped every 30s instead of the global
# default interval.
- job_name: 'mana-credits'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-credits:3002'
|
|
|
|
# mana-user: user service. Scraped every 30s instead of the global default
# interval.
- job_name: 'mana-user'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-user:3062'
|
|
|
|
# mana-subscriptions: subscriptions service. Scraped every 30s instead of
# the global default interval.
- job_name: 'mana-subscriptions'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-subscriptions:3063'
|
|
|
|
# mana-analytics: analytics service. Scraped every 30s instead of the
# global default interval.
- job_name: 'mana-analytics'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-analytics:3064'
|
|
|
|
# ULoad app backend server. Scraped every 30s instead of the global default
# interval.
- job_name: 'uload-server'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-app-uload-server:3070'
|
|
|
|
# Memoro app backend server. Scraped every 30s instead of the global default
# interval.
- job_name: 'memoro-server'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-app-memoro-server:3015'
|
|
|
|
# NOTE: Individual app backends (chat, todo, calendar, contacts, storage,
|
|
# food, music, plants, picture) have been REMOVED — all migrated to
|
|
# local-first architecture. Only uload-server and memoro-server remain.
|
|
|
|
# mana-llm: LLM gateway (Ollama with Google fallback), scraped every 15s.
- job_name: 'mana-llm'
  scrape_interval: 15s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-llm:3020'
|
|
|
|
# mana-search: search service. Scraped every 30s instead of the global
# default interval.
- job_name: 'mana-search'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-search:3012'
|
|
|
|
# mana-media: media service. Scraped every 30s instead of the global default
# interval.
- job_name: 'mana-media'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-media:3011'
|
|
|
|
# mana-ai (Bun): background Mission Runner behind the AI Workbench.
# Its /metrics endpoint publishes tick statistics, planner-request
# latencies, snapshot-refresh counters and standard HTTP metrics.
- job_name: 'mana-ai'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-ai:3066'
|
|
|
|
# ============================================
|
|
# GPU Server (Windows PC, LAN: 192.168.178.11)
|
|
# ============================================
|
|
|
|
# GPU server: LLM gateway, scraped every 15s.
# The static instance label makes the target show up as 'gpu-server'
# rather than as its LAN address.
- job_name: 'gpu-llm'
  scrape_interval: 15s
  metrics_path: '/metrics'
  static_configs:
  - labels:
      instance: 'gpu-server'
    targets:
    - '192.168.178.11:3025'
|
|
|
|
# GPU server: speech-to-text (WhisperX).
# NOTE(review): this job scrapes '/health' as its metrics path. Prometheus
# expects the text exposition format from a scrape target — confirm the
# endpoint serves it; if it does not, `up` will report 0 for this target.
- job_name: 'gpu-stt'
  scrape_interval: 30s
  metrics_path: '/health'
  static_configs:
  - labels:
      instance: 'gpu-server'
    targets:
    - '192.168.178.11:3020'
|
|
|
|
# GPU server: text-to-speech.
# NOTE(review): this job scrapes '/health' as its metrics path. Prometheus
# expects the text exposition format from a scrape target — confirm the
# endpoint serves it; if it does not, `up` will report 0 for this target.
- job_name: 'gpu-tts'
  scrape_interval: 30s
  metrics_path: '/health'
  static_configs:
  - labels:
      instance: 'gpu-server'
    targets:
    - '192.168.178.11:3022'
|
|
|
|
# GPU server: image generation (FLUX.2).
# NOTE(review): this job scrapes '/health' as its metrics path. Prometheus
# expects the text exposition format from a scrape target — confirm the
# endpoint serves it; if it does not, `up` will report 0 for this target.
- job_name: 'gpu-image-gen'
  scrape_interval: 30s
  metrics_path: '/health'
  static_configs:
  - labels:
      instance: 'gpu-server'
    targets:
    - '192.168.178.11:3023'
|
|
|
|
# GPU server: video generation (LTX-Video).
# NOTE(review): this job scrapes '/health' as its metrics path. Prometheus
# expects the text exposition format from a scrape target — confirm the
# endpoint serves it; if it does not, `up` will report 0 for this target.
- job_name: 'gpu-video-gen'
  scrape_interval: 30s
  metrics_path: '/health'
  static_configs:
  - labels:
      instance: 'gpu-server'
    targets:
    - '192.168.178.11:3026'
|
|
|
|
# ============================================
|
|
# Go Infrastructure Services
|
|
# ============================================
|
|
|
|
# mana-api-gateway (Go): API gateway, scraped every 15s.
- job_name: 'mana-api-gateway'
  scrape_interval: 15s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-api-gateway:3016'
|
|
|
|
# mana-sync (Go): sync server for local-first data. Scraped every 30s
# instead of the global default interval.
- job_name: 'mana-sync'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-core-sync:3051'
|
|
|
|
# mana-notify (Go): notification service (email, push, webhook). Scraped
# every 30s instead of the global default interval.
- job_name: 'mana-notify'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-core-notify:3013'
|
|
|
|
# mana-crawler (Go): crawler service. Scraped every 30s instead of the
# global default interval.
- job_name: 'mana-crawler'
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
  - targets:
    - 'mana-crawler:3014'
|
|
|
|
# ============================================
|
|
# Blackbox Exporter — HTTP Uptime Probes
|
|
# ============================================
|
|
|
|
# Public web front-ends: the unified Mana app at mana.how (one route per
# module) plus the standalone games. Each URL is probed through the
# blackbox exporter with the http_2xx module.
- job_name: 'blackbox-web'
  metrics_path: /probe
  params:
    module:
    - http_2xx
  relabel_configs:
  # Hand the listed URL to the exporter as the ?target= query parameter.
  - source_labels:
    - __address__
    target_label: __param_target
  # Keep the probed URL as the instance label.
  - source_labels:
    - __param_target
    target_label: instance
  # Direct the actual scrape at the blackbox exporter.
  - target_label: __address__
    replacement: blackbox-exporter:9115
  static_configs:
  - targets:
    # Unified Mana app (all modules as routes)
    - https://mana.how
    - https://mana.how/chat
    - https://mana.how/todo
    - https://mana.how/calendar
    - https://mana.how/contacts
    - https://mana.how/times
    - https://mana.how/photos
    - https://mana.how/picture
    - https://mana.how/storage
    - https://mana.how/presi
    - https://mana.how/food
    - https://mana.how/plants
    - https://mana.how/calc
    - https://mana.how/quotes
    - https://mana.how/cards
    - https://mana.how/skilltree
    - https://mana.how/music
    - https://mana.how/citycorners
    - https://mana.how/memoro
    - https://mana.how/moodlit
    - https://mana.how/context
    - https://mana.how/questions
    - https://mana.how/uload
    - https://mana.how/notes
    - https://mana.how/habits
    - https://mana.how/guides
    - https://mana.how/inventory
    - https://mana.how/body
    - https://mana.how/journal
    - https://mana.how/dreams
    - https://mana.how/firsts
    - https://mana.how/period
    - https://mana.how/events
    - https://mana.how/finance
    - https://mana.how/places
    - https://mana.how/who
    - https://mana.how/news
    - https://mana.how/mail
    - https://mana.how/playground
    # Standalone games (separate containers)
    - https://whopxl.mana.how
    - https://arcade.mana.how
|
|
|
|
# Public API health endpoints (only services with running containers),
# probed through the blackbox exporter with the http_health module.
- job_name: 'blackbox-api'
  metrics_path: /probe
  params:
    module:
    - http_health
  relabel_configs:
  # Hand the listed URL to the exporter as the ?target= query parameter.
  - source_labels:
    - __address__
    target_label: __param_target
  # Keep the probed URL as the instance label.
  - source_labels:
    - __param_target
    target_label: instance
  # Direct the actual scrape at the blackbox exporter.
  - target_label: __address__
    replacement: blackbox-exporter:9115
  static_configs:
  - targets:
    - https://auth.mana.how/health
    - https://api.mana.how/health
|
|
|
|
# Services that are not exposed via Cloudflare. The blackbox exporter
# reaches them by container name over the Docker network.
- job_name: 'blackbox-internal'
  metrics_path: /probe
  params:
    module:
    - http_2xx
  relabel_configs:
  # Hand the listed URL to the exporter as the ?target= query parameter.
  - source_labels:
    - __address__
    target_label: __param_target
  # Keep the probed URL as the instance label.
  - source_labels:
    - __param_target
    target_label: instance
  # Direct the actual scrape at the blackbox exporter.
  - target_label: __address__
    replacement: blackbox-exporter:9115
  static_configs:
  - targets:
    # Health of mana-geocoding itself (Hono wrapper).
    - http://mana-geocoding:3018/health
    # Health of the upstream Pelias, proxied through the wrapper so the
    # blackbox exporter needs no host.docker.internal access.
    - http://mana-geocoding:3018/health/pelias
    # mana-ai (Mission Runner): internal-only, no CF tunnel.
    - http://mana-ai:3066/health
|
|
|
|
# Infrastructure and monitoring tools, probed through the blackbox
# exporter with the http_2xx module.
- job_name: 'blackbox-infra'
  metrics_path: /probe
  params:
    module:
    - http_2xx
  relabel_configs:
  # Hand the listed URL to the exporter as the ?target= query parameter.
  - source_labels:
    - __address__
    target_label: __param_target
  # Keep the probed URL as the instance label.
  - source_labels:
    - __param_target
    target_label: instance
  # Direct the actual scrape at the blackbox exporter.
  - target_label: __address__
    replacement: blackbox-exporter:9115
  static_configs:
  - targets:
    - https://git.mana.how
    - https://grafana.mana.how
    - https://stats.mana.how
    - https://glitchtip.mana.how
|
|
|
|
# GPU server services are probed on /health, not on /.
# The whisper STT, TTS and FLUX image-gen services answer 2xx only on
# /health; their root path returns 401/403/404 by design (auth or
# API-only). Ollama is the exception: its / returns 200 but it has no
# /health endpoint, so it is probed on / by a separate job.
# NOTE(review): gpu-video is probed the same way; presumably it behaves
# like the other services — confirm.
- job_name: 'blackbox-gpu'
  metrics_path: /probe
  params:
    module:
    - http_health
  relabel_configs:
  # Hand the listed URL to the exporter as the ?target= query parameter.
  - source_labels:
    - __address__
    target_label: __param_target
  # Keep the probed URL as the instance label.
  - source_labels:
    - __param_target
    target_label: instance
  # Direct the actual scrape at the blackbox exporter.
  - target_label: __address__
    replacement: blackbox-exporter:9115
  static_configs:
  - targets:
    - https://gpu-stt.mana.how/health
    - https://gpu-tts.mana.how/health
    - https://gpu-img.mana.how/health
    - https://gpu-video.mana.how/health
|
|
|
|
# GPU service probed on its root path with the http_2xx module.
- job_name: 'blackbox-gpu-root'
  metrics_path: /probe
  params:
    module:
    - http_2xx
  relabel_configs:
  # Hand the listed URL to the exporter as the ?target= query parameter.
  - source_labels:
    - __address__
    target_label: __param_target
  # Keep the probed URL as the instance label.
  - source_labels:
    - __param_target
    target_label: instance
  # Direct the actual scrape at the blackbox exporter.
  - target_label: __address__
    replacement: blackbox-exporter:9115
  static_configs:
  - targets:
    - https://gpu-ollama.mana.how
|
|
|
|
# ============================================
|
|
# Pushgateway (deploy metrics, batch jobs)
|
|
# ============================================
|
|
# Pushgateway scrape. honor_labels keeps the labels carried by the pushed
# metrics when they conflict with the labels this scrape would attach.
- job_name: 'pushgateway'
  static_configs:
  - targets:
    - 'pushgateway:9091'
  honor_labels: true
|