feat(observability): add metrics and monitoring for all 15 backends

- Add MetricsModule to 8 backends missing it (photos, zitare, mukke,
  planta, picture, storage, presi, nutriphi)
- Enable Prometheus scraping for all 15 backends in prometheus.yml
  (previously only 6 were scraped; 3 were commented out and 6 were
  missing entirely)
- Update ServiceDown alert rule to cover all 15 backends
- Update Grafana dashboards (backends, master-overview, system-overview)
  with all backend services in health panels
- Fix imprecise regex in application-details dashboard

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-03-23 09:09:04 +01:00
parent 073c216652
commit 6fa6509fa5
23 changed files with 690 additions and 472 deletions

View file

@@ -3,7 +3,7 @@ groups:
rules:
# Service Down Alert
- alert: ServiceDown
expr: up{job=~"mana-core-auth|chat-backend|todo-backend|calendar-backend|clock-backend|contacts-backend"} == 0
expr: up{job=~"mana-core-auth|chat-backend|todo-backend|calendar-backend|clock-backend|contacts-backend|storage-backend|presi-backend|nutriphi-backend|skilltree-backend|photos-backend|zitare-backend|mukke-backend|planta-backend|picture-backend"} == 0
for: 1m
labels:
severity: critical

View file

@@ -91,26 +91,68 @@ scrape_configs:
metrics_path: '/metrics'
scrape_interval: 30s
# Storage Backend (disabled - no /metrics endpoint yet)
# - job_name: 'storage-backend'
# static_configs:
# - targets: ['storage-backend:3035']
# metrics_path: '/metrics'
# scrape_interval: 30s
# Storage Backend
- job_name: 'storage-backend'
static_configs:
- targets: ['storage-backend:3035']
metrics_path: '/metrics'
scrape_interval: 30s
# Presi Backend (disabled - no /metrics endpoint yet)
# - job_name: 'presi-backend'
# static_configs:
# - targets: ['presi-backend:3036']
# metrics_path: '/metrics'
# scrape_interval: 30s
# Presi Backend
- job_name: 'presi-backend'
static_configs:
- targets: ['presi-backend:3036']
metrics_path: '/metrics'
scrape_interval: 30s
# Nutriphi Backend (disabled - no /metrics endpoint yet)
# - job_name: 'nutriphi-backend'
# static_configs:
# - targets: ['nutriphi-backend:3037']
# metrics_path: '/metrics'
# scrape_interval: 30s
# Nutriphi Backend
- job_name: 'nutriphi-backend'
static_configs:
- targets: ['nutriphi-backend:3037']
metrics_path: '/metrics'
scrape_interval: 30s
# SkillTree Backend
- job_name: 'skilltree-backend'
static_configs:
- targets: ['skilltree-backend:3038']
metrics_path: '/metrics'
scrape_interval: 30s
# Photos Backend
- job_name: 'photos-backend'
static_configs:
- targets: ['photos-backend:3039']
metrics_path: '/metrics'
scrape_interval: 30s
# Zitare Backend
- job_name: 'zitare-backend'
static_configs:
- targets: ['zitare-backend:3007']
metrics_path: '/metrics'
scrape_interval: 30s
# Mukke Backend
- job_name: 'mukke-backend'
static_configs:
- targets: ['mukke-backend:3010']
metrics_path: '/metrics'
scrape_interval: 30s
# Planta Backend
- job_name: 'planta-backend'
static_configs:
- targets: ['planta-backend:3022']
metrics_path: '/metrics'
scrape_interval: 30s
# Picture Backend
- job_name: 'picture-backend'
static_configs:
- targets: ['picture-backend:3040']
metrics_path: '/metrics'
scrape_interval: 30s
# ============================================
# Pushgateway (deploy metrics, batch jobs)