mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 19:41:09 +02:00
feat(monitoring): add LLM Grafana dashboard, Prometheus scraping, and alerts
Wire mana-llm service into the monitoring stack: Prometheus (docker/prometheus/prometheus.yml): - Add mana-llm scrape job (port 3025, 15s interval) - Include mana-llm in ServiceDown alert expression Alerts (docker/prometheus/alerts.yml): - New llm_alerts group with 4 rules: - LLMServiceDown: mana-llm down > 1 min (critical) - LLMHighErrorRate: > 10% errors for 5 min (warning) - OllamaProviderDown: > 50% requests via Google fallback (warning) - LLMSlowResponses: p95 > 30s for 5 min (warning) Grafana Dashboard (docker/grafana/dashboards/mana-llm.json): - 6 stat panels: status, req/min, error rate, fallback rate, latency, tokens/min - Requests by Provider (stacked area: Ollama vs Google vs OpenRouter) - Tokens by Type (prompt vs completion) - Latency Percentiles (p50, p90, p99) - Latency by Provider comparison - Requests by Model breakdown - Errors by Type - Google Fallback Rate over time (with threshold coloring) - Provider Distribution pie chart (24h) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
57a2841168
commit
169821de1a
3 changed files with 477 additions and 1 deletions
422
docker/grafana/dashboards/mana-llm.json
Normal file
422
docker/grafana/dashboards/mana-llm.json
Normal file
|
|
@ -0,0 +1,422 @@
|
|||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [
|
||||
{
|
||||
"asDropdown": false,
|
||||
"icon": "external link",
|
||||
"includeVars": true,
|
||||
"keepTime": true,
|
||||
"tags": [],
|
||||
"targetBlank": false,
|
||||
"title": "Master Overview",
|
||||
"type": "link",
|
||||
"url": "/d/master-overview/master-overview"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"panels": [],
|
||||
"title": "Service Health",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": { "color": "red", "index": 1, "text": "DOWN" },
|
||||
"1": { "color": "green", "index": 0, "text": "UP" }
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"title": "mana-llm Status",
|
||||
"type": "stat",
|
||||
"targets": [{ "expr": "up{job=\"mana-llm\"}", "legendFormat": "Status" }]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": { "defaults": { "unit": "short" } },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"title": "Requests / min",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{ "expr": "sum(rate(mana_llm_llm_requests_total[5m])) * 60", "legendFormat": "req/min" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 20 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"title": "Error Rate",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(mana_llm_llm_errors_total[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
|
||||
"legendFormat": "errors"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "orange", "value": 30 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
|
||||
"id": 5,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"title": "Google Fallback Rate",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
|
||||
"legendFormat": "fallback %"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 30 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
|
||||
"id": 6,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"title": "Median Latency (p50)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p50"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": { "defaults": { "unit": "short" } },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
|
||||
"id": 7,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"title": "Tokens / min",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{ "expr": "sum(rate(mana_llm_tokens_total[5m])) * 60", "legendFormat": "tok/min" }
|
||||
]
|
||||
},
|
||||
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
|
||||
"id": 10,
|
||||
"panels": [],
|
||||
"title": "Request Traffic",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "fillOpacity": 30, "stacking": { "mode": "normal" } },
|
||||
"unit": "reqps"
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum", "mean"] }
|
||||
},
|
||||
"title": "LLM Requests by Provider",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"ollama\"}[5m])) by (provider)",
|
||||
"legendFormat": "Ollama"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) by (provider)",
|
||||
"legendFormat": "Google (Fallback)"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"openrouter\"}[5m])) by (provider)",
|
||||
"legendFormat": "OpenRouter"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "fillOpacity": 30, "stacking": { "mode": "normal" } },
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
|
||||
"id": 12,
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum", "mean"] }
|
||||
},
|
||||
"title": "Tokens by Type",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(mana_llm_tokens_total{type=\"prompt\"}[5m])) * 60",
|
||||
"legendFormat": "Prompt Tokens/min"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(mana_llm_tokens_total{type=\"completion\"}[5m])) * 60",
|
||||
"legendFormat": "Completion Tokens/min"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 20,
|
||||
"panels": [],
|
||||
"title": "Latency & Performance",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": { "defaults": { "unit": "s", "custom": { "fillOpacity": 10 } } },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
|
||||
"id": 21,
|
||||
"options": { "legend": { "displayMode": "table", "placement": "bottom" } },
|
||||
"title": "LLM Latency Percentiles",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.90, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p90"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p99"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
  "datasource": { "type": "prometheus", "uid": "${datasource}" },
  "fieldConfig": {
    "defaults": { "unit": "s", "custom": { "fillOpacity": 30, "stacking": { "mode": "none" } } }
  },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
  "id": 22,
  "options": { "legend": { "displayMode": "table", "placement": "bottom" } },
  "title": "Latency by Provider (p50)",
  "type": "timeseries",
  "targets": [
    {
      "expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"ollama\"}[5m])) by (le))",
      "legendFormat": "Ollama p50"
    },
    {
      "expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"google\"}[5m])) by (le))",
      "legendFormat": "Google p50"
    },
    {
      "expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"openrouter\"}[5m])) by (le))",
      "legendFormat": "OpenRouter p50"
    }
  ]
},
|
||||
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
|
||||
"id": 30,
|
||||
"panels": [],
|
||||
"title": "Models & Errors",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": { "defaults": { "unit": "reqps", "custom": { "fillOpacity": 30 } } },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
|
||||
"id": 31,
|
||||
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
|
||||
"title": "Requests by Model",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(mana_llm_llm_requests_total[5m])) by (model)",
|
||||
"legendFormat": "{{ model }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": { "defaults": { "unit": "short", "custom": { "fillOpacity": 30 } } },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
|
||||
"id": 32,
|
||||
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
|
||||
"title": "Errors by Type",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(mana_llm_llm_errors_total[5m])) by (error_type)",
|
||||
"legendFormat": "{{ error_type }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 },
|
||||
"id": 40,
|
||||
"panels": [],
|
||||
"title": "Fallback Analysis",
|
||||
"type": "row"
|
||||
},
|
||||
{
  "datasource": { "type": "prometheus", "uid": "${datasource}" },
  "fieldConfig": {
    "defaults": {
      "unit": "percent",
      "min": 0,
      "max": 100,
      "color": { "mode": "thresholds" },
      "custom": {
        "fillOpacity": 20,
        "gradientMode": "scheme",
        "thresholdsStyle": { "mode": "dashed" }
      },
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 10 },
          { "color": "red", "value": 50 }
        ]
      }
    }
  },
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 },
  "id": 41,
  "title": "Google Fallback Rate Over Time",
  "type": "timeseries",
  "targets": [
    {
      "expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
      "legendFormat": "Fallback %"
    }
  ]
},
|
||||
{
  "datasource": { "type": "prometheus", "uid": "${datasource}" },
  "fieldConfig": { "defaults": { "unit": "short" } },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 },
  "id": 42,
  "options": {
    "legend": { "displayMode": "table", "placement": "bottom", "values": ["value", "percent"] },
    "reduceOptions": { "calcs": ["lastNotNull"] }
  },
  "title": "Provider Request Distribution (last 24h)",
  "type": "piechart",
  "targets": [
    {
      "expr": "sum(increase(mana_llm_llm_requests_total{provider=\"ollama\"}[24h]))",
      "instant": true,
      "range": false,
      "legendFormat": "Ollama"
    },
    {
      "expr": "sum(increase(mana_llm_llm_requests_total{provider=\"google\"}[24h]))",
      "instant": true,
      "range": false,
      "legendFormat": "Google"
    },
    {
      "expr": "sum(increase(mana_llm_llm_requests_total{provider=\"openrouter\"}[24h]))",
      "instant": true,
      "range": false,
      "legendFormat": "OpenRouter"
    }
  ]
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["mana", "llm", "ai"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": { "selected": false, "text": "VictoriaMetrics", "value": "victoria-metrics" },
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"title": "Mana LLM Gateway",
|
||||
"uid": "mana-llm",
|
||||
"version": 1
|
||||
}
|
||||
|
|
@ -3,7 +3,7 @@ groups:
|
|||
rules:
|
||||
# Service Down Alert
|
||||
- alert: ServiceDown
|
||||
expr: up{job=~"mana-core-auth|.*-backend|mana-search|mana-media|synapse"} == 0
|
||||
expr: up{job=~"mana-core-auth|.*-backend|mana-search|mana-media|mana-llm|synapse"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -356,3 +356,50 @@ groups:
|
|||
annotations:
|
||||
summary: "OIDC token endpoint errors"
|
||||
description: "OIDC token endpoint is returning 5xx errors. SSO may be affected."
|
||||
|
||||
- name: llm_alerts
|
||||
rules:
|
||||
# mana-llm Down
|
||||
- alert: LLMServiceDown
|
||||
expr: up{job="mana-llm"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "mana-llm service is down"
|
||||
description: "mana-llm has been down for more than 1 minute. All AI features will fail."
|
||||
|
||||
# High LLM Error Rate (> 10%)
|
||||
- alert: LLMHighErrorRate
|
||||
expr: |
|
||||
sum(rate(mana_llm_llm_errors_total[5m]))
|
||||
/ (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High LLM error rate"
|
||||
description: "{{ $value | humanizePercentage }} of LLM requests are failing."
|
||||
|
||||
# Ollama Provider Down (more than 50% of requests going to the Google fallback)
|
||||
- alert: OllamaProviderDown
|
||||
expr: |
|
||||
sum(rate(mana_llm_llm_requests_total{provider="google"}[5m]))
|
||||
/ (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Ollama appears down — most requests going to Google fallback"
|
||||
description: "{{ $value | humanizePercentage }} of LLM requests are using Google Gemini fallback."
|
||||
|
||||
# LLM Slow Responses (p95 > 30s)
|
||||
- alert: LLMSlowResponses
|
||||
expr: |
|
||||
histogram_quantile(0.95, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le)) > 30
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "LLM responses are slow"
|
||||
description: "LLM p95 latency is {{ $value | humanizeDuration }}."
|
||||
|
|
|
|||
|
|
@ -158,6 +158,13 @@ scrape_configs:
|
|||
# Core Services
|
||||
# ============================================
|
||||
|
||||
# Mana LLM Gateway (Ollama + Google Fallback)
|
||||
- job_name: 'mana-llm'
|
||||
static_configs:
|
||||
- targets: ['mana-llm:3025']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 15s
|
||||
|
||||
# Mana Search Service
|
||||
- job_name: 'mana-search'
|
||||
static_configs:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue