mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-16 16:19:40 +02:00
Wire mana-llm service into the monitoring stack:

Prometheus (docker/prometheus/prometheus.yml):
- Add mana-llm scrape job (port 3025, 15s interval)
- Include mana-llm in ServiceDown alert expression

Alerts (docker/prometheus/alerts.yml):
- New llm_alerts group with 4 rules:
  - LLMServiceDown: mana-llm down > 1 min (critical)
  - LLMHighErrorRate: > 10% errors for 5 min (warning)
  - OllamaProviderDown: > 50% requests via Google fallback (warning)
  - LLMSlowResponses: p95 > 30s for 5 min (warning)

Grafana Dashboard (docker/grafana/dashboards/mana-llm.json):
- 6 stat panels: status, req/min, error rate, fallback rate, latency, tokens/min
- Requests by Provider (stacked area: Ollama vs Google vs OpenRouter)
- Tokens by Type (prompt vs completion)
- Latency Percentiles (p50, p90, p99)
- Latency by Provider comparison
- Requests by Model breakdown
- Errors by Type
- Google Fallback Rate over time (with threshold coloring)
- Provider Distribution pie chart (24h)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
422 lines
11 KiB
JSON
422 lines
11 KiB
JSON
{
|
|
"annotations": { "list": [] },
|
|
"editable": true,
|
|
"fiscalYearStartMonth": 0,
|
|
"graphTooltip": 1,
|
|
"id": null,
|
|
"links": [
|
|
{
|
|
"asDropdown": false,
|
|
"icon": "external link",
|
|
"includeVars": true,
|
|
"keepTime": true,
|
|
"tags": [],
|
|
"targetBlank": false,
|
|
"title": "Master Overview",
|
|
"type": "link",
|
|
"url": "/d/master-overview/master-overview"
|
|
}
|
|
],
|
|
"panels": [
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
|
"id": 1,
|
|
"panels": [],
|
|
"title": "Service Health",
|
|
"type": "row"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": { "mode": "thresholds" },
|
|
"mappings": [
|
|
{
|
|
"options": {
|
|
"0": { "color": "red", "index": 1, "text": "DOWN" },
|
|
"1": { "color": "green", "index": 0, "text": "UP" }
|
|
},
|
|
"type": "value"
|
|
}
|
|
],
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "red", "value": null },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
|
|
"id": 2,
|
|
"options": {
|
|
"colorMode": "background",
|
|
"graphMode": "none",
|
|
"reduceOptions": { "calcs": ["lastNotNull"] }
|
|
},
|
|
"title": "mana-llm Status",
|
|
"type": "stat",
|
|
"targets": [{ "expr": "up{job=\"mana-llm\"}", "legendFormat": "Status" }]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": { "defaults": { "unit": "short" } },
|
|
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
|
|
"id": 3,
|
|
"options": {
|
|
"colorMode": "value",
|
|
"graphMode": "area",
|
|
"reduceOptions": { "calcs": ["lastNotNull"] }
|
|
},
|
|
"title": "Requests / min",
|
|
"type": "stat",
|
|
"targets": [
|
|
{ "expr": "sum(rate(mana_llm_llm_requests_total[5m])) * 60", "legendFormat": "req/min" }
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent",
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 5 },
|
|
{ "color": "red", "value": 20 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
|
|
"id": 4,
|
|
"options": {
|
|
"colorMode": "value",
|
|
"graphMode": "area",
|
|
"reduceOptions": { "calcs": ["lastNotNull"] }
|
|
},
|
|
"title": "Error Rate",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(mana_llm_llm_errors_total[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
|
|
"legendFormat": "errors"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent",
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 10 },
|
|
{ "color": "orange", "value": 30 },
|
|
{ "color": "red", "value": 50 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
|
|
"id": 5,
|
|
"options": {
|
|
"colorMode": "value",
|
|
"graphMode": "area",
|
|
"reduceOptions": { "calcs": ["lastNotNull"] }
|
|
},
|
|
"title": "Google Fallback Rate",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
|
|
"legendFormat": "fallback %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "s",
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 5 },
|
|
{ "color": "red", "value": 30 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
|
|
"id": 6,
|
|
"options": {
|
|
"colorMode": "value",
|
|
"graphMode": "area",
|
|
"reduceOptions": { "calcs": ["lastNotNull"] }
|
|
},
|
|
"title": "Median Latency (p50)",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
|
|
"legendFormat": "p50"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": { "defaults": { "unit": "short" } },
|
|
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
|
|
"id": 7,
|
|
"options": {
|
|
"colorMode": "value",
|
|
"graphMode": "area",
|
|
"reduceOptions": { "calcs": ["lastNotNull"] }
|
|
},
|
|
"title": "Tokens / min",
|
|
"type": "stat",
|
|
"targets": [
|
|
{ "expr": "sum(rate(mana_llm_tokens_total[5m])) * 60", "legendFormat": "tok/min" }
|
|
]
|
|
},
|
|
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
|
|
"id": 10,
|
|
"panels": [],
|
|
"title": "Request Traffic",
|
|
"type": "row"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 30, "stacking": { "mode": "normal" } },
|
|
"unit": "reqps"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
|
|
"id": 11,
|
|
"options": {
|
|
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum", "mean"] }
|
|
},
|
|
"title": "LLM Requests by Provider",
|
|
"type": "timeseries",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"ollama\"}[5m])) by (provider)",
|
|
"legendFormat": "Ollama"
|
|
},
|
|
{
|
|
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) by (provider)",
|
|
"legendFormat": "Google (Fallback)"
|
|
},
|
|
{
|
|
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"openrouter\"}[5m])) by (provider)",
|
|
"legendFormat": "OpenRouter"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": { "fillOpacity": 30, "stacking": { "mode": "normal" } },
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
|
|
"id": 12,
|
|
"options": {
|
|
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum", "mean"] }
|
|
},
|
|
"title": "Tokens by Type",
|
|
"type": "timeseries",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(mana_llm_tokens_total{type=\"prompt\"}[5m])) * 60",
|
|
"legendFormat": "Prompt Tokens/min"
|
|
},
|
|
{
|
|
"expr": "sum(rate(mana_llm_tokens_total{type=\"completion\"}[5m])) * 60",
|
|
"legendFormat": "Completion Tokens/min"
|
|
}
|
|
]
|
|
},
|
|
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
|
|
"id": 20,
|
|
"panels": [],
|
|
"title": "Latency & Performance",
|
|
"type": "row"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": { "defaults": { "unit": "s", "custom": { "fillOpacity": 10 } } },
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
|
|
"id": 21,
|
|
"options": { "legend": { "displayMode": "table", "placement": "bottom" } },
|
|
"title": "LLM Latency Percentiles",
|
|
"type": "timeseries",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
|
|
"legendFormat": "p50"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.90, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
|
|
"legendFormat": "p90"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.99, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
|
|
"legendFormat": "p99"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": {
|
|
"defaults": { "unit": "s", "custom": { "fillOpacity": 30, "stacking": { "mode": "none" } } }
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
|
|
"id": 22,
|
|
"options": { "legend": { "displayMode": "table", "placement": "bottom" } },
|
|
"title": "Latency by Provider (p50)",
|
|
"type": "timeseries",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"ollama\"}[5m])) by (le))",
|
|
"legendFormat": "Ollama p50"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"google\"}[5m])) by (le))",
|
|
"legendFormat": "Google p50"
|
|
}
|
|
]
|
|
},
|
|
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
|
|
"id": 30,
|
|
"panels": [],
|
|
"title": "Models & Errors",
|
|
"type": "row"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": { "defaults": { "unit": "reqps", "custom": { "fillOpacity": 30 } } },
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
|
|
"id": 31,
|
|
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
|
|
"title": "Requests by Model",
|
|
"type": "timeseries",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(mana_llm_llm_requests_total[5m])) by (model)",
|
|
"legendFormat": "{{ model }}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": { "defaults": { "unit": "short", "custom": { "fillOpacity": 30 } } },
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
|
|
"id": 32,
|
|
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
|
|
"title": "Errors by Type (per second)",
|
|
"type": "timeseries",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(mana_llm_llm_errors_total[5m])) by (error_type)",
|
|
"legendFormat": "{{ error_type }}"
|
|
}
|
|
]
|
|
},
|
|
|
|
{
|
|
"collapsed": false,
|
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 },
|
|
"id": 40,
|
|
"panels": [],
|
|
"title": "Fallback Analysis",
|
|
"type": "row"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent",
|
|
"min": 0,
|
|
"max": 100,
|
|
"custom": { "fillOpacity": 20 },
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "green", "value": null },
|
|
{ "color": "yellow", "value": 10 },
|
|
{ "color": "red", "value": 50 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 },
|
|
"id": 41,
|
|
"title": "Google Fallback Rate Over Time",
|
|
"type": "timeseries",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
|
|
"legendFormat": "Fallback %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
|
"fieldConfig": { "defaults": { "unit": "short" } },
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 },
|
|
"id": 42,
|
|
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
|
|
"title": "Provider Request Distribution (24h)",
|
|
"type": "piechart",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(increase(mana_llm_llm_requests_total{provider=\"ollama\"}[24h]))",
|
|
"legendFormat": "Ollama"
|
|
},
|
|
{
|
|
"expr": "sum(increase(mana_llm_llm_requests_total{provider=\"google\"}[24h]))",
|
|
"legendFormat": "Google"
|
|
},
|
|
{
|
|
"expr": "sum(increase(mana_llm_llm_requests_total{provider=\"openrouter\"}[24h]))",
|
|
"legendFormat": "OpenRouter"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"schemaVersion": 39,
|
|
"tags": ["mana", "llm", "ai"],
|
|
"templating": {
|
|
"list": [
|
|
{
|
|
"current": { "selected": false, "text": "VictoriaMetrics", "value": "victoria-metrics" },
|
|
"hide": 0,
|
|
"includeAll": false,
|
|
"name": "datasource",
|
|
"options": [],
|
|
"query": "prometheus",
|
|
"type": "datasource"
|
|
}
|
|
]
|
|
},
|
|
"time": { "from": "now-6h", "to": "now" },
|
|
"title": "Mana LLM Gateway",
|
|
"uid": "mana-llm",
|
|
"version": 1
|
|
}
|