feat(monitoring): add LLM Grafana dashboard, Prometheus scraping, and alerts

Wire mana-llm service into the monitoring stack:

Prometheus (docker/prometheus/prometheus.yml):
- Add mana-llm scrape job (port 3025, 15s interval)
- Include mana-llm in ServiceDown alert expression

Alerts (docker/prometheus/alerts.yml):
- New llm_alerts group with 4 rules:
  - LLMServiceDown: mana-llm down > 1 min (critical)
  - LLMHighErrorRate: > 10% errors for 5 min (warning)
  - OllamaProviderDown: > 50% requests via Google fallback (warning)
  - LLMSlowResponses: p95 > 30s for 5 min (warning)

Grafana Dashboard (docker/grafana/dashboards/mana-llm.json):
- 6 stat panels: status, req/min, error rate, fallback rate, latency, tokens/min
- Requests by Provider (stacked area: Ollama vs Google vs OpenRouter)
- Tokens by Type (prompt vs completion)
- Latency Percentiles (p50, p90, p99)
- Latency by Provider comparison
- Requests by Model breakdown
- Errors by Type
- Google Fallback Rate over time (with threshold coloring)
- Provider Distribution pie chart (24h)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-03-24 11:16:27 +01:00
parent 57a2841168
commit 169821de1a
3 changed files with 477 additions and 1 deletion

View file

@@ -0,0 +1,422 @@
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [],
"targetBlank": false,
"title": "Master Overview",
"type": "link",
"url": "/d/master-overview/master-overview"
}
],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 1,
"panels": [],
"title": "Service Health",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{
"options": {
"0": { "color": "red", "index": 1, "text": "DOWN" },
"1": { "color": "green", "index": 0, "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "mana-llm Status",
"type": "stat",
"targets": [{ "expr": "up{job=\"mana-llm\"}", "legendFormat": "Status" }]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "Requests / min",
"type": "stat",
"targets": [
{ "expr": "sum(rate(mana_llm_llm_requests_total[5m])) * 60", "legendFormat": "req/min" }
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 5 },
{ "color": "red", "value": 20 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "Error Rate",
"type": "stat",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_errors_total[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
"legendFormat": "errors"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 10 },
{ "color": "orange", "value": 30 },
{ "color": "red", "value": 50 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "Google Fallback Rate",
"type": "stat",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
"legendFormat": "fallback %"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 5 },
{ "color": "red", "value": 30 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "Avg Latency (p50)",
"type": "stat",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
"legendFormat": "p50"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
"id": 7,
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"title": "Tokens / min",
"type": "stat",
"targets": [
{ "expr": "sum(rate(mana_llm_tokens_total[5m])) * 60", "legendFormat": "tok/min" }
]
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 10,
"panels": [],
"title": "Request Traffic",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 30, "stacking": { "mode": "normal" } },
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
"id": 11,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum", "mean"] }
},
"title": "LLM Requests by Provider",
"type": "timeseries",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"ollama\"}[5m])) by (provider)",
"legendFormat": "Ollama"
},
{
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) by (provider)",
"legendFormat": "Google (Fallback)"
},
{
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"openrouter\"}[5m])) by (provider)",
"legendFormat": "OpenRouter"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"custom": { "fillOpacity": 30, "stacking": { "mode": "normal" } },
"unit": "short"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
"id": 12,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum", "mean"] }
},
"title": "Tokens by Type",
"type": "timeseries",
"targets": [
{
"expr": "sum(rate(mana_llm_tokens_total{type=\"prompt\"}[5m])) * 60",
"legendFormat": "Prompt Tokens/min"
},
{
"expr": "sum(rate(mana_llm_tokens_total{type=\"completion\"}[5m])) * 60",
"legendFormat": "Completion Tokens/min"
}
]
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 20,
"panels": [],
"title": "Latency & Performance",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "s", "custom": { "fillOpacity": 10 } } },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
"id": 21,
"options": { "legend": { "displayMode": "table", "placement": "bottom" } },
"title": "LLM Latency Percentiles",
"type": "timeseries",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.90, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
"legendFormat": "p90"
},
{
"expr": "histogram_quantile(0.99, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))",
"legendFormat": "p99"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": { "unit": "s", "custom": { "fillOpacity": 30, "stacking": { "mode": "none" } } }
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
"id": 22,
"options": { "legend": { "displayMode": "table", "placement": "bottom" } },
"title": "Latency by Provider (p50)",
"type": "timeseries",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"ollama\"}[5m])) by (le))",
"legendFormat": "Ollama p50"
},
{
"expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"google\"}[5m])) by (le))",
"legendFormat": "Google p50"
}
]
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
"id": 30,
"panels": [],
"title": "Models & Errors",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "reqps", "custom": { "fillOpacity": 30 } } },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
"id": 31,
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
"title": "Requests by Model",
"type": "timeseries",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_requests_total[5m])) by (model)",
"legendFormat": "{{ model }}"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short", "custom": { "fillOpacity": 30 } } },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
"id": 32,
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
"title": "Errors by Type",
"type": "timeseries",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_errors_total[5m])) by (error_type)",
"legendFormat": "{{ error_type }}"
}
]
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 },
"id": 40,
"panels": [],
"title": "Fallback Analysis",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"custom": { "fillOpacity": 20 },
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 10 },
{ "color": "red", "value": 50 }
]
}
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 },
"id": 41,
"title": "Google Fallback Rate Over Time",
"type": "timeseries",
"targets": [
{
"expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100",
"legendFormat": "Fallback %"
}
]
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "short" } },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 },
"id": 42,
"options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } },
"title": "Provider Request Distribution (Pie)",
"type": "piechart",
"targets": [
{
"expr": "sum(increase(mana_llm_llm_requests_total{provider=\"ollama\"}[24h]))",
"legendFormat": "Ollama"
},
{
"expr": "sum(increase(mana_llm_llm_requests_total{provider=\"google\"}[24h]))",
"legendFormat": "Google"
},
{
"expr": "sum(increase(mana_llm_llm_requests_total{provider=\"openrouter\"}[24h]))",
"legendFormat": "OpenRouter"
}
]
}
],
"schemaVersion": 39,
"tags": ["mana", "llm", "ai"],
"templating": {
"list": [
{
"current": { "selected": false, "text": "VictoriaMetrics", "value": "victoria-metrics" },
"hide": 0,
"includeAll": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"type": "datasource"
}
]
},
"time": { "from": "now-6h", "to": "now" },
"title": "Mana LLM Gateway",
"uid": "mana-llm",
"version": 1
}

View file

@@ -3,7 +3,7 @@ groups:
rules:
# Service Down Alert
- alert: ServiceDown
expr: up{job=~"mana-core-auth|.*-backend|mana-search|mana-media|synapse"} == 0
expr: up{job=~"mana-core-auth|.*-backend|mana-search|mana-media|mana-llm|synapse"} == 0
for: 1m
labels:
severity: critical
@@ -356,3 +356,50 @@ groups:
annotations:
summary: "OIDC token endpoint errors"
description: "OIDC token endpoint is returning 5xx errors. SSO may be affected."
- name: llm_alerts
rules:
# mana-llm Down
- alert: LLMServiceDown
expr: up{job="mana-llm"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "mana-llm service is down"
description: "mana-llm has been down for more than 1 minute. All AI features will fail."
# High LLM Error Rate (> 10%)
- alert: LLMHighErrorRate
expr: |
sum(rate(mana_llm_llm_errors_total[5m]))
/ (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.1
for: 5m
labels:
severity: warning
annotations:
summary: "High LLM error rate"
description: "{{ $value | humanizePercentage }} of LLM requests are failing."
# Ollama Provider Down (all requests going to fallback)
- alert: OllamaProviderDown
expr: |
sum(rate(mana_llm_llm_requests_total{provider="google"}[5m]))
/ (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.5
for: 10m
labels:
severity: warning
annotations:
summary: "Ollama appears down — most requests going to Google fallback"
description: "{{ $value | humanizePercentage }} of LLM requests are using Google Gemini fallback."
# LLM Slow Responses (p95 > 30s)
- alert: LLMSlowResponses
expr: |
histogram_quantile(0.95, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le)) > 30
for: 5m
labels:
severity: warning
annotations:
summary: "LLM responses are slow"
description: "LLM p95 latency is {{ $value | humanizeDuration }}."

View file

@@ -158,6 +158,13 @@ scrape_configs:
# Core Services
# ============================================
# Mana LLM Gateway (Ollama + Google Fallback)
- job_name: 'mana-llm'
static_configs:
- targets: ['mana-llm:3025']
metrics_path: '/metrics'
scrape_interval: 15s
# Mana Search Service
- job_name: 'mana-search'
static_configs: