feat(monitoring): add LLM Grafana dashboard, Prometheus scraping, and alerts

Wire mana-llm service into the monitoring stack:

Prometheus (docker/prometheus/prometheus.yml):
- Add mana-llm scrape job (port 3025, 15s interval)
- Include mana-llm in ServiceDown alert expression

Alerts (docker/prometheus/alerts.yml):
- New llm_alerts group with 4 rules:
  - LLMServiceDown: mana-llm down > 1 min (critical)
  - LLMHighErrorRate: > 10% errors for 5 min (warning)
  - OllamaProviderDown: > 50% requests via Google fallback (warning)
  - LLMSlowResponses: p95 > 30s for 5 min (warning)

Grafana Dashboard (docker/grafana/dashboards/mana-llm.json):
- 6 stat panels: status, req/min, error rate, fallback rate, latency, tokens/min
- Requests by Provider (stacked area: Ollama vs Google vs OpenRouter)
- Tokens by Type (prompt vs completion)
- Latency Percentiles (p50, p90, p99)
- Latency by Provider comparison
- Requests by Model breakdown
- Errors by Type
- Google Fallback Rate over time (with threshold coloring)
- Provider Distribution pie chart (24h)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-03-24 11:16:27 +01:00
parent 57a2841168
commit 169821de1a
3 changed files with 477 additions and 1 deletion

View file

@@ -0,0 +1,422 @@
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [],
"targetBlank": false,
"title": "Master Overview",
"type": "link",
"url": "/d/master-overview/master-overview"
}
],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 1,
"panels": [],
"title": "Service Health",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{
"options": {
"0": { "color": "red", "index": 1, "text": "DOWN" },
"1": { "color": "green", "index": 0, "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "mana-llm Status",
"type": "stat",
"targets": [{ "expr": "up{job=\"mana-llm\"}", "legendFormat": "Status" }]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "Requests / min",
"type": "stat",
"targets": [
{ "expr": "sum(rate(mana_llm_llm_requests_total[5m])) * 60", "legendFormat": "req/min" }
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 5 },
{ "color": "red", "value": 20 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "Error Rate",
"type": "stat",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_errors_total[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
"legendFormat": "errors"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 10 },
{ "color": "orange", "value": 30 },
{ "color": "red", "value": 50 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "Google Fallback Rate",
"type": "stat",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
"legendFormat": "fallback %"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 5 },
{ "color": "red", "value": 30 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "Avg Latency (p50)",
"type": "stat",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
"legendFormat": "p50"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
"id": 7,
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "Tokens / min",
"type": "stat",
"targets": [
{ "expr": "sum(rate(mana_llm_tokens_total[5m])) * 60", "legendFormat": "tok/min" }
]
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 10,
"panels": [],
"title": "Request Traffic",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 30, "stacking": { "mode": "normal" } },
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
"id": 11,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum", "mean"] }
},
"title": "LLM Requests by Provider",
"type": "timeseries",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"ollama\"}[5m])) by (provider)",
"legendFormat": "Ollama"
},
{
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) by (provider)",
"legendFormat": "Google (Fallback)"
},
{
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"openrouter\"}[5m])) by (provider)",
"legendFormat": "OpenRouter"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 30, "stacking": { "mode": "normal" } },
"unit": "short"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
"id": 12,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum", "mean"] }
},
"title": "Tokens by Type",
"type": "timeseries",
"targets": [
{
"expr": "sum(rate(mana_llm_tokens_total{type=\"prompt\"}[5m])) * 60",
"legendFormat": "Prompt Tokens/min"
},
{
"expr": "sum(rate(mana_llm_tokens_total{type=\"completion\"}[5m])) * 60",
"legendFormat": "Completion Tokens/min"
}
]
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 20,
"panels": [],
"title": "Latency & Performance",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "s", "custom": { "fillOpacity": 10 } } },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
"id": 21,
"options": { "legend": { "displayMode": "table", "placement": "bottom" } },
"title": "LLM Latency Percentiles",
"type": "timeseries",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.90, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
"legendFormat": "p90"
},
{
"expr": "histogram_quantile(0.99, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
"legendFormat": "p99"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": { "unit": "s", "custom": { "fillOpacity": 30, "stacking": { "mode": "none" } } }
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
"id": 22,
"options": { "legend": { "displayMode": "table", "placement": "bottom" } },
"title": "Latency by Provider (p50)",
"type": "timeseries",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"ollama\"}[5m])) by (le))",
"legendFormat": "Ollama p50"
},
{
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"google\"}[5m])) by (le))",
"legendFormat": "Google p50"
}
]
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
"id": 30,
"panels": [],
"title": "Models & Errors",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "reqps", "custom": { "fillOpacity": 30 } } },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
"id": 31,
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
"title": "Requests by Model",
"type": "timeseries",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_requests_total[5m])) by (model)",
"legendFormat": "{{ model }}"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short", "custom": { "fillOpacity": 30 } } },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
"id": 32,
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
"title": "Errors by Type",
"type": "timeseries",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_errors_total[5m])) by (error_type)",
"legendFormat": "{{ error_type }}"
}
]
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 },
"id": 40,
"panels": [],
"title": "Fallback Analysis",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"custom": { "fillOpacity": 20 },
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 10 },
{ "color": "red", "value": 50 }
]
}
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 },
"id": 41,
"title": "Google Fallback Rate Over Time",
"type": "timeseries",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
"legendFormat": "Fallback %"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 },
"id": 42,
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
"title": "Provider Request Distribution (Pie)",
"type": "piechart",
"targets": [
{
"expr": "sum(increase(mana_llm_llm_requests_total{provider=\"ollama\"}[24h]))",
"legendFormat": "Ollama"
},
{
"expr": "sum(increase(mana_llm_llm_requests_total{provider=\"google\"}[24h]))",
"legendFormat": "Google"
},
{
"expr": "sum(increase(mana_llm_llm_requests_total{provider=\"openrouter\"}[24h]))",
"legendFormat": "OpenRouter"
}
]
}
],
"schemaVersion": 39,
"tags": ["mana", "llm", "ai"],
"templating": {
"list": [
{
"current": { "selected": false, "text": "VictoriaMetrics", "value": "victoria-metrics" },
"hide": 0,
"includeAll": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"type": "datasource"
}
]
},
"time": { "from": "now-6h", "to": "now" },
"title": "Mana LLM Gateway",
"uid": "mana-llm",
"version": 1
}

View file

@@ -3,7 +3,7 @@ groups:
rules:
# Service Down Alert
- alert: ServiceDown
expr: up{job=~"mana-core-auth|.*-backend|mana-search|mana-media|synapse"} == 0
expr: up{job=~"mana-core-auth|.*-backend|mana-search|mana-media|mana-llm|synapse"} == 0
for: 1m
labels:
severity: critical
@@ -356,3 +356,50 @@ groups:
annotations:
summary: "OIDC token endpoint errors"
description: "OIDC token endpoint is returning 5xx errors. SSO may be affected."
- name: llm_alerts
rules:
# mana-llm Down
- alert: LLMServiceDown
expr: up{job="mana-llm"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "mana-llm service is down"
description: "mana-llm has been down for more than 1 minute. All AI features will fail."
# High LLM Error Rate (> 10%)
- alert: LLMHighErrorRate
expr: |
sum(rate(mana_llm_llm_errors_total[5m]))
/ (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.1
for: 5m
labels:
severity: warning
annotations:
summary: "High LLM error rate"
description: "{{ $value | humanizePercentage }} of LLM requests are failing."
# Ollama Provider Down (all requests going to fallback)
- alert: OllamaProviderDown
expr: |
sum(rate(mana_llm_llm_requests_total{provider="google"}[5m]))
/ (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.5
for: 10m
labels:
severity: warning
annotations:
summary: "Ollama appears down — most requests going to Google fallback"
description: "{{ $value | humanizePercentage }} of LLM requests are using Google Gemini fallback."
# LLM Slow Responses (p95 > 30s)
- alert: LLMSlowResponses
expr: |
histogram_quantile(0.95, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le)) > 30
for: 5m
labels:
severity: warning
annotations:
summary: "LLM responses are slow"
description: "LLM p95 latency is {{ $value | humanizeDuration }}."

View file

@@ -158,6 +158,13 @@ scrape_configs:
# Core Services
# ============================================
# Mana LLM Gateway (Ollama + Google Fallback)
- job_name: 'mana-llm'
static_configs:
- targets: ['mana-llm:3025']
metrics_path: '/metrics'
scrape_interval: 15s
# Mana Search Service
- job_name: 'mana-search'
static_configs: