# Mirror of https://github.com/Memo-2023/mana-monorepo.git
# Synced 2026-05-14 20:21:09 +02:00
#
# Commit message:
#
#   Wire mana-llm service into the monitoring stack:
#
#   Prometheus (docker/prometheus/prometheus.yml):
#   - Add mana-llm scrape job (port 3025, 15s interval)
#   - Include mana-llm in ServiceDown alert expression
#
#   Alerts (docker/prometheus/alerts.yml):
#   - New llm_alerts group with 4 rules:
#     - LLMServiceDown: mana-llm down > 1 min (critical)
#     - LLMHighErrorRate: > 10% errors for 5 min (warning)
#     - OllamaProviderDown: > 50% requests via Google fallback (warning)
#     - LLMSlowResponses: p95 > 30s for 5 min (warning)
#
#   Grafana Dashboard (docker/grafana/dashboards/mana-llm.json):
#   - 6 stat panels: status, req/min, error rate, fallback rate, latency, tokens/min
#   - Requests by Provider (stacked area: Ollama vs Google vs OpenRouter)
#   - Tokens by Type (prompt vs completion)
#   - Latency Percentiles (p50, p90, p99)
#   - Latency by Provider comparison
#   - Requests by Model breakdown
#   - Errors by Type
#   - Google Fallback Rate over time (with threshold coloring)
#   - Provider Distribution pie chart (24h)
#
#   Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
#
# Mirrored file: 195 lines, 4.6 KiB, YAML
# ManaCore Prometheus Configuration
# Scrapes metrics from all services

# Defaults applied to every scrape job and rule group unless a job
# sets its own scrape_interval below.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Load alerting rules
rule_files:
  - /etc/prometheus/alerts.yml

# Alertmanager configuration
# Firing alerts are sent to the single Alertmanager target listed here.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# One entry per scrape job. Jobs that do not set scrape_interval use the
# global 15s default; jobs that set it explicitly override that default.
scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Host system metrics via node-exporter
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
    relabel_configs:
      # Overwrite the instance label with a fixed name instead of the
      # scrape address.
      - source_labels: [__address__]
        target_label: instance
        replacement: 'mac-mini'

  # Docker container metrics via cAdvisor
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']

  # PostgreSQL metrics
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']

  # Redis metrics
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']

  # ============================================
  # Application Backends (after /metrics added)
  # ============================================

  # Auth Service
  - job_name: 'mana-core-auth'
    static_configs:
      - targets: ['mana-core-auth:3001']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Chat Backend
  - job_name: 'chat-backend'
    static_configs:
      - targets: ['chat-backend:3030']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Todo Backend
  - job_name: 'todo-backend'
    static_configs:
      - targets: ['todo-backend:3031']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Calendar Backend
  - job_name: 'calendar-backend'
    static_configs:
      - targets: ['calendar-backend:3032']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Clock Backend
  - job_name: 'clock-backend'
    static_configs:
      - targets: ['clock-backend:3033']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Contacts Backend
  - job_name: 'contacts-backend'
    static_configs:
      - targets: ['contacts-backend:3034']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Storage Backend
  - job_name: 'storage-backend'
    static_configs:
      - targets: ['storage-backend:3035']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Presi Backend
  - job_name: 'presi-backend'
    static_configs:
      - targets: ['presi-backend:3036']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Nutriphi Backend
  - job_name: 'nutriphi-backend'
    static_configs:
      - targets: ['nutriphi-backend:3037']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # SkillTree Backend
  - job_name: 'skilltree-backend'
    static_configs:
      - targets: ['skilltree-backend:3038']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Photos Backend
  - job_name: 'photos-backend'
    static_configs:
      - targets: ['photos-backend:3039']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Zitare Backend
  - job_name: 'zitare-backend'
    static_configs:
      - targets: ['zitare-backend:3007']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Mukke Backend
  - job_name: 'mukke-backend'
    static_configs:
      - targets: ['mukke-backend:3010']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Planta Backend
  - job_name: 'planta-backend'
    static_configs:
      - targets: ['planta-backend:3022']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Picture Backend
  - job_name: 'picture-backend'
    static_configs:
      - targets: ['picture-backend:3040']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # ============================================
  # Core Services
  # ============================================

  # Mana LLM Gateway (Ollama + Google Fallback)
  # Scraped every 15s, unlike the 30s used by the other service jobs.
  - job_name: 'mana-llm'
    static_configs:
      - targets: ['mana-llm:3025']
    metrics_path: '/metrics'
    scrape_interval: 15s

  # Mana Search Service
  - job_name: 'mana-search'
    static_configs:
      - targets: ['mana-search:3020']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Mana Media Service
  - job_name: 'mana-media'
    static_configs:
      - targets: ['mana-media:3015']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Matrix Synapse
  # Metrics are served under /_synapse/metrics rather than /metrics.
  - job_name: 'synapse'
    static_configs:
      - targets: ['synapse:9002']
    metrics_path: '/_synapse/metrics'
    scrape_interval: 30s

  # ============================================
  # Pushgateway (deploy metrics, batch jobs)
  # ============================================
  - job_name: 'pushgateway'
    # Keep the labels carried by the pushed metrics instead of replacing
    # them with this scrape target's labels.
    honor_labels: true
    static_configs:
      - targets: ['pushgateway:9091']