diff --git a/docker/grafana/dashboards/mana-llm.json b/docker/grafana/dashboards/mana-llm.json new file mode 100644 index 000000000..8c2df74e2 --- /dev/null +++ b/docker/grafana/dashboards/mana-llm.json @@ -0,0 +1,422 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Master Overview", + "type": "link", + "url": "/d/master-overview/master-overview" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Service Health", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { + "options": { + "0": { "color": "red", "index": 1, "text": "DOWN" }, + "1": { "color": "green", "index": 0, "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "mana-llm Status", + "type": "stat", + "targets": [{ "expr": "up{job=\"mana-llm\"}", "legendFormat": "Status" }] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Requests / min", + "type": "stat", + "targets": [ + { "expr": "sum(rate(mana_llm_llm_requests_total[5m])) * 60", "legendFormat": "req/min" } + ] + }, + { + "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 5 }, + { "color": "red", "value": 20 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Error Rate", + "type": "stat", + "targets": [ + { + "expr": "sum(rate(mana_llm_llm_errors_total[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100", + "legendFormat": "errors" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10 }, + { "color": "orange", "value": 30 }, + { "color": "red", "value": 50 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Google Fallback Rate", + "type": "stat", + "targets": [ + { + "expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100", + "legendFormat": "fallback %" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 5 }, + { "color": "red", "value": 30 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Avg Latency (p50)", + "type": "stat", + "targets": [ + { + "expr": "histogram_quantile(0.50, 
sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Tokens / min", + "type": "stat", + "targets": [ + { "expr": "sum(rate(mana_llm_tokens_total[5m])) * 60", "legendFormat": "tok/min" } + ] + }, + + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 10, + "panels": [], + "title": "Request Traffic", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "custom": { "fillOpacity": 30, "stacking": { "mode": "normal" } }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 11, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum", "mean"] } + }, + "title": "LLM Requests by Provider", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(mana_llm_llm_requests_total{provider=\"ollama\"}[5m])) by (provider)", + "legendFormat": "Ollama" + }, + { + "expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) by (provider)", + "legendFormat": "Google (Fallback)" + }, + { + "expr": "sum(rate(mana_llm_llm_requests_total{provider=\"openrouter\"}[5m])) by (provider)", + "legendFormat": "OpenRouter" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "custom": { "fillOpacity": 30, "stacking": { "mode": "normal" } }, + "unit": "short" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 12, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum", "mean"] } + }, + "title": "Tokens by Type", + "type": "timeseries", + 
"targets": [ + { + "expr": "sum(rate(mana_llm_tokens_total{type=\"prompt\"}[5m])) * 60", + "legendFormat": "Prompt Tokens/min" + }, + { + "expr": "sum(rate(mana_llm_tokens_total{type=\"completion\"}[5m])) * 60", + "legendFormat": "Completion Tokens/min" + } + ] + }, + + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 20, + "panels": [], + "title": "Latency & Performance", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "s", "custom": { "fillOpacity": 10 } } }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, + "id": 21, + "options": { "legend": { "displayMode": "table", "placement": "bottom" } }, + "title": "LLM Latency Percentiles", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.90, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "p90" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "p99" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { "unit": "s", "custom": { "fillOpacity": 30, "stacking": { "mode": "none" } } } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }, + "id": 22, + "options": { "legend": { "displayMode": "table", "placement": "bottom" } }, + "title": "Latency by Provider (p50)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"ollama\"}[5m])) by (le))", + "legendFormat": "Ollama p50" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(mana_llm_llm_latency_seconds_bucket{provider=\"google\"}[5m])) by (le))", + "legendFormat": "Google p50" + } + ] + }, + + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, 
"x": 0, "y": 23 }, + "id": 30, + "panels": [], + "title": "Models & Errors", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "reqps", "custom": { "fillOpacity": 30 } } }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "id": 31, + "options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } }, + "title": "Requests by Model", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(mana_llm_llm_requests_total[5m])) by (model)", + "legendFormat": "{{ model }}" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "unit": "short", "custom": { "fillOpacity": 30 } } }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 32, + "options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } }, + "title": "Errors by Type", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(mana_llm_llm_errors_total[5m])) by (error_type)", + "legendFormat": "{{ error_type }}" + } + ] + }, + + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 }, + "id": 40, + "panels": [], + "title": "Fallback Analysis", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "custom": { "fillOpacity": 20 }, + "thresholds": { + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10 }, + { "color": "red", "value": 50 } + ] + } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 }, + "id": 41, + "title": "Google Fallback Rate Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(mana_llm_llm_requests_total{provider=\"google\"}[5m])) / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) * 100", + "legendFormat": "Fallback %" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": 
"${datasource}" }, + "fieldConfig": { "defaults": { "unit": "short" } }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 }, + "id": 42, + "options": { "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } }, + "title": "Provider Request Distribution (Pie)", + "type": "piechart", + "targets": [ + { + "expr": "sum(increase(mana_llm_llm_requests_total{provider=\"ollama\"}[24h]))", + "legendFormat": "Ollama" + }, + { + "expr": "sum(increase(mana_llm_llm_requests_total{provider=\"google\"}[24h]))", + "legendFormat": "Google" + }, + { + "expr": "sum(increase(mana_llm_llm_requests_total{provider=\"openrouter\"}[24h]))", + "legendFormat": "OpenRouter" + } + ] + } + ], + "schemaVersion": 39, + "tags": ["mana", "llm", "ai"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "VictoriaMetrics", "value": "victoria-metrics" }, + "hide": 0, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "type": "datasource" + } + ] + }, + "time": { "from": "now-6h", "to": "now" }, + "title": "Mana LLM Gateway", + "uid": "mana-llm", + "version": 1 +} diff --git a/docker/prometheus/alerts.yml b/docker/prometheus/alerts.yml index 789357edc..95bbd67d3 100644 --- a/docker/prometheus/alerts.yml +++ b/docker/prometheus/alerts.yml @@ -3,7 +3,7 @@ groups: rules: # Service Down Alert - alert: ServiceDown - expr: up{job=~"mana-core-auth|.*-backend|mana-search|mana-media|synapse"} == 0 + expr: up{job=~"mana-core-auth|.*-backend|mana-search|mana-media|mana-llm|synapse"} == 0 for: 1m labels: severity: critical @@ -356,3 +356,50 @@ groups: annotations: summary: "OIDC token endpoint errors" description: "OIDC token endpoint is returning 5xx errors. SSO may be affected." 
+ + - name: llm_alerts + rules: + # mana-llm Down + - alert: LLMServiceDown + expr: up{job="mana-llm"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "mana-llm service is down" + description: "mana-llm has been down for more than 1 minute. All AI features will fail." + + # High LLM Error Rate (> 10%) + - alert: LLMHighErrorRate + expr: | + sum(rate(mana_llm_llm_errors_total[5m])) + / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "High LLM error rate" + description: "{{ $value | humanizePercentage }} of LLM requests are failing." + + # Ollama Provider Down (more than 50% of requests going to Google fallback) + - alert: OllamaProviderDown + expr: | + sum(rate(mana_llm_llm_requests_total{provider="google"}[5m])) + / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.5 + for: 10m + labels: + severity: warning + annotations: + summary: "Ollama appears down — most requests going to Google fallback" + description: "{{ $value | humanizePercentage }} of LLM requests are using Google Gemini fallback." + + # LLM Slow Responses (p95 > 30s) + - alert: LLMSlowResponses + expr: | + histogram_quantile(0.95, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le)) > 30 + for: 5m + labels: + severity: warning + annotations: + summary: "LLM responses are slow" + description: "LLM p95 latency is {{ $value | humanizeDuration }}." diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index 01149b445..d8aa77b2f 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -158,6 +158,13 @@ scrape_configs: # Core Services # ============================================ + # Mana LLM Gateway (Ollama + Google Fallback) + - job_name: 'mana-llm' + static_configs: + - targets: ['mana-llm:3025'] + metrics_path: '/metrics' + scrape_interval: 15s + # Mana Search Service - job_name: 'mana-search' static_configs: