feat(observability): add metrics and monitoring for all 15 backends

- Add MetricsModule to 8 backends missing it (photos, zitare, mukke,
  planta, picture, storage, presi, nutriphi)
- Enable Prometheus scraping for all 15 backends in prometheus.yml
  (previously only 6 were scraped; 3 were commented out and 6 had no
  entry at all)
- Update ServiceDown alert rule to cover all 15 backends
- Update Grafana dashboards (backends, master-overview, system-overview)
  with all backend services in health panels
- Fix imprecise job regex in application-details dashboard
  (`.*backend` -> `.*-backend`, so only `-backend`-suffixed jobs match)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-03-23 09:09:04 +01:00
parent 073c216652
commit 6fa6509fa5
23 changed files with 690 additions and 472 deletions

View file

@ -850,14 +850,14 @@
"allValue": ".*",
"current": { "selected": true, "text": "All", "value": "$__all" },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"definition": "label_values(up{job=~\".*backend|mana-core-auth\"}, job)",
"definition": "label_values(up{job=~\".*-backend|mana-core-auth\"}, job)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "service",
"options": [],
"query": {
"query": "label_values(up{job=~\".*backend|mana-core-auth\"}, job)",
"query": "label_values(up{job=~\".*-backend|mana-core-auth\"}, job)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 2,

View file

@ -58,7 +58,7 @@
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=~\"mana-core-auth|chat-backend|todo-backend|calendar-backend|clock-backend|contacts-backend\"}",
"expr": "up{job=~\"mana-core-auth|chat-backend|todo-backend|calendar-backend|clock-backend|contacts-backend|storage-backend|presi-backend|nutriphi-backend|skilltree-backend|photos-backend|zitare-backend|mukke-backend|planta-backend|picture-backend\"}",
"legendFormat": "{{job}}",
"refId": "A"
}

View file

@ -444,6 +444,60 @@
"legendFormat": "Contacts",
"refId": "F"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"storage-backend\"}",
"legendFormat": "Storage",
"refId": "J"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"presi-backend\"}",
"legendFormat": "Presi",
"refId": "K"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"nutriphi-backend\"}",
"legendFormat": "NutriPhi",
"refId": "L"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"skilltree-backend\"}",
"legendFormat": "SkillTree",
"refId": "M"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"photos-backend\"}",
"legendFormat": "Photos",
"refId": "N"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"zitare-backend\"}",
"legendFormat": "Zitare",
"refId": "O"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"mukke-backend\"}",
"legendFormat": "Mukke",
"refId": "P"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"planta-backend\"}",
"legendFormat": "Planta",
"refId": "Q"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"picture-backend\"}",
"legendFormat": "Picture",
"refId": "R"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "pg_up",

View file

@ -528,6 +528,60 @@
"legendFormat": "Contacts",
"refId": "F"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"storage-backend\"}",
"legendFormat": "Storage",
"refId": "J"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"presi-backend\"}",
"legendFormat": "Presi",
"refId": "K"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"nutriphi-backend\"}",
"legendFormat": "NutriPhi",
"refId": "L"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"skilltree-backend\"}",
"legendFormat": "SkillTree",
"refId": "M"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"photos-backend\"}",
"legendFormat": "Photos",
"refId": "N"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"zitare-backend\"}",
"legendFormat": "Zitare",
"refId": "O"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"mukke-backend\"}",
"legendFormat": "Mukke",
"refId": "P"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"planta-backend\"}",
"legendFormat": "Planta",
"refId": "Q"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=\"picture-backend\"}",
"legendFormat": "Picture",
"refId": "R"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "pg_up",

View file

@ -3,7 +3,7 @@ groups:
rules:
# Service Down Alert
- alert: ServiceDown
expr: up{job=~"mana-core-auth|chat-backend|todo-backend|calendar-backend|clock-backend|contacts-backend"} == 0
expr: up{job=~"mana-core-auth|chat-backend|todo-backend|calendar-backend|clock-backend|contacts-backend|storage-backend|presi-backend|nutriphi-backend|skilltree-backend|photos-backend|zitare-backend|mukke-backend|planta-backend|picture-backend"} == 0
for: 1m
labels:
severity: critical

View file

@ -91,26 +91,68 @@ scrape_configs:
metrics_path: '/metrics'
scrape_interval: 30s
# Storage Backend (disabled - no /metrics endpoint yet)
# - job_name: 'storage-backend'
# static_configs:
# - targets: ['storage-backend:3035']
# metrics_path: '/metrics'
# scrape_interval: 30s
# Storage Backend
- job_name: 'storage-backend'
static_configs:
- targets: ['storage-backend:3035']
metrics_path: '/metrics'
scrape_interval: 30s
# Presi Backend (disabled - no /metrics endpoint yet)
# - job_name: 'presi-backend'
# static_configs:
# - targets: ['presi-backend:3036']
# metrics_path: '/metrics'
# scrape_interval: 30s
# Presi Backend
- job_name: 'presi-backend'
static_configs:
- targets: ['presi-backend:3036']
metrics_path: '/metrics'
scrape_interval: 30s
# Nutriphi Backend (disabled - no /metrics endpoint yet)
# - job_name: 'nutriphi-backend'
# static_configs:
# - targets: ['nutriphi-backend:3037']
# metrics_path: '/metrics'
# scrape_interval: 30s
# NutriPhi Backend
- job_name: 'nutriphi-backend'
static_configs:
- targets: ['nutriphi-backend:3037']
metrics_path: '/metrics'
scrape_interval: 30s
# SkillTree Backend
- job_name: 'skilltree-backend'
static_configs:
- targets: ['skilltree-backend:3038']
metrics_path: '/metrics'
scrape_interval: 30s
# Photos Backend
- job_name: 'photos-backend'
static_configs:
- targets: ['photos-backend:3039']
metrics_path: '/metrics'
scrape_interval: 30s
# Zitare Backend
- job_name: 'zitare-backend'
static_configs:
- targets: ['zitare-backend:3007']
metrics_path: '/metrics'
scrape_interval: 30s
# Mukke Backend
- job_name: 'mukke-backend'
static_configs:
- targets: ['mukke-backend:3010']
metrics_path: '/metrics'
scrape_interval: 30s
# Planta Backend
- job_name: 'planta-backend'
static_configs:
- targets: ['planta-backend:3022']
metrics_path: '/metrics'
scrape_interval: 30s
# Picture Backend
- job_name: 'picture-backend'
static_configs:
- targets: ['picture-backend:3040']
metrics_path: '/metrics'
scrape_interval: 30s
# ============================================
# Pushgateway (deploy metrics, batch jobs)