managarten/docker/prometheus/alerts.yml
Till JS 169821de1a feat(monitoring): add LLM Grafana dashboard, Prometheus scraping, and alerts
Wire mana-llm service into the monitoring stack:

Prometheus (docker/prometheus/prometheus.yml):
- Add mana-llm scrape job (port 3025, 15s interval)
- Include mana-llm in ServiceDown alert expression

Alerts (docker/prometheus/alerts.yml):
- New llm_alerts group with 4 rules:
  - LLMServiceDown: mana-llm down > 1 min (critical)
  - LLMHighErrorRate: > 10% errors for 5 min (warning)
  - OllamaProviderDown: > 50% of requests via Google fallback for 10 min (warning)
  - LLMSlowResponses: p95 > 30s for 5 min (warning)

Grafana Dashboard (docker/grafana/dashboards/mana-llm.json):
- 6 stat panels: status, req/min, error rate, fallback rate, latency, tokens/min
- Requests by Provider (stacked area: Ollama vs Google vs OpenRouter)
- Tokens by Type (prompt vs completion)
- Latency Percentiles (p50, p90, p99)
- Latency by Provider comparison
- Requests by Model breakdown
- Errors by Type
- Google Fallback Rate over time (with threshold coloring)
- Provider Distribution pie chart (24h)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 11:16:27 +01:00

405 lines
15 KiB
YAML

groups:
# Application-level alerts, mostly evaluated per scrape job: target
# availability, HTTP 5xx ratio, HTTP latency, and Node.js runtime health.
- name: service_alerts
  rules:

  # Target availability: a matched job has reported up == 0 for 1 minute.
  - alert: ServiceDown
    expr: up{job=~"mana-core-auth|.*-backend|mana-search|mana-media|mana-llm|synapse"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Service {{ $labels.job }} is down"
      description: "{{ $labels.job }} has been down for more than 1 minute."

  # 5xx ratio per job above 5% for 5 minutes.
  - alert: HighErrorRate
    expr: |
      sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)
      / sum(rate(http_requests_total[5m])) by (job) > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High error rate on {{ $labels.job }}"
      description: "{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 5%)"

  # 5xx ratio per job above 20% for 2 minutes (critical tier of the rule above).
  - alert: VeryHighErrorRate
    expr: |
      sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)
      / sum(rate(http_requests_total[5m])) by (job) > 0.20
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Very high error rate on {{ $labels.job }}"
      description: "{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 20%)"

  # p95 request duration per job above 2 seconds for 5 minutes.
  - alert: SlowResponseTime
    expr: |
      histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Slow response time on {{ $labels.job }}"
      description: "{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}"

  # p95 request duration per job above 5 seconds for 2 minutes (critical tier).
  - alert: VerySlowResponseTime
    expr: |
      histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 5
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Very slow response time on {{ $labels.job }}"
      description: "{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}"

  # Node.js used heap above 500 MiB (500 * 1024 * 1024 bytes) for 10 minutes.
  - alert: HighHeapMemory
    expr: nodejs_heap_size_used_bytes > 500 * 1024 * 1024
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High heap memory on {{ $labels.job }}"
      description: "{{ $labels.job }} heap usage is {{ $value | humanize1024 }}B"

  # Node.js event loop lag above 0.1 s (100 ms) for 5 minutes.
  - alert: HighEventLoopLag
    expr: nodejs_eventloop_lag_seconds > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High event loop lag on {{ $labels.job }}"
      description: "{{ $labels.job }} event loop lag is {{ $value | humanizeDuration }}"
# Host-level alerts built on node_* metrics: CPU, memory, and disk usage.
# Each resource has a warning tier and a stricter, faster-firing critical tier.
- name: infrastructure_alerts
  rules:

  # CPU busy percentage (100 minus the averaged idle rate) above 80% for 10 min.
  - alert: HighCPUUsage
    expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on host"
      description: "CPU usage is {{ $value | humanize }}%"

  # CPU busy percentage above 95% for 5 minutes.
  - alert: VeryHighCPUUsage
    expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Very high CPU usage on host"
      description: "CPU usage is {{ $value | humanize }}%"

  # Memory in use (1 - MemAvailable / MemTotal) above 85% for 10 minutes.
  - alert: HighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on host"
      description: "Memory usage is {{ $value | humanize }}%"

  # Memory in use above 95% for 5 minutes.
  - alert: VeryHighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Very high memory usage on host"
      description: "Memory usage is {{ $value | humanize }}%"

  # Disk space used above 80% for 10 minutes, checked only on the three
  # mountpoints named in the selector.
  - alert: HighDiskUsage
    expr: |
      (1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"}
      / node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"})) * 100 > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High disk usage on {{ $labels.mountpoint }}"
      description: "Disk usage is {{ $value | humanize }}%"

  # Disk space used above 90% for 5 minutes on the same mountpoints.
  - alert: VeryHighDiskUsage
    expr: |
      (1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"}
      / node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"})) * 100 > 90
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Very high disk usage on {{ $labels.mountpoint }}"
      description: "Disk usage is {{ $value | humanize }}%"
# Datastore alerts for PostgreSQL (pg_* metrics) and Redis (redis_* metrics).
- name: database_alerts
  rules:

  # PostgreSQL availability: pg_up == 0 for 1 minute.
  - alert: PostgreSQLDown
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "PostgreSQL is down"
      description: "PostgreSQL has been down for more than 1 minute."

  # Redis availability: redis_up == 0 for 1 minute.
  - alert: RedisDown
    expr: redis_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Redis is down"
      description: "Redis has been down for more than 1 minute."

  # Total pg_stat_activity_count across all series above 80 for 5 minutes.
  - alert: PostgreSQLHighConnections
    expr: sum(pg_stat_activity_count) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High PostgreSQL connections"
      description: "PostgreSQL has {{ $value }} connections (> 80)"

  # Cache hit ratio below 90% for 10 minutes. Computed per database as
  # blks_hit / (blks_hit + blks_read) on the raw metric values (no rate()
  # window), then averaged; template* and postgres databases are excluded.
  # The + 0.0001 term keeps the denominator non-zero.
  - alert: PostgreSQLLowCacheHitRatio
    expr: |
      avg(pg_stat_database_blks_hit{datname!~"template.*|postgres"}
      / (pg_stat_database_blks_hit{datname!~"template.*|postgres"}
      + pg_stat_database_blks_read{datname!~"template.*|postgres"} + 0.0001)) * 100 < 90
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "PostgreSQL low cache hit ratio"
      description: "PostgreSQL cache hit ratio is {{ $value | humanize }}%"

  # Redis memory above 1 GiB (1024 * 1024 * 1024 bytes) for 10 minutes.
  - alert: RedisHighMemory
    expr: redis_memory_used_bytes > 1024 * 1024 * 1024
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Redis high memory usage"
      description: "Redis memory usage is {{ $value | humanize1024 }}B"

  # At least one blocked Redis client continuously for 5 minutes.
  - alert: RedisBlockedClients
    expr: redis_blocked_clients > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Redis has blocked clients"
      description: "Redis has {{ $value }} blocked clients"
# Per-container alerts built on container_* metrics, restricted to Docker
# containers by the id=~"/docker/.+" selector.
- name: container_alerts
  rules:

  # Container High CPU (> 80% of one CPU core) for 10 minutes.
  # rate() of CPU seconds is the number of cores in use, so * 100 is a
  # percentage of a single core, not of any configured CPU limit.
  - alert: ContainerHighCPU
    expr: |
      sum(rate(container_cpu_usage_seconds_total{id=~"/docker/.+"}[5m])) by (name) * 100 > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Container {{ $labels.name }} high CPU"
      description: "Container {{ $labels.name }} CPU usage is {{ $value | humanize }}%"

  # Container High Memory (> 80% of limit) for 10 minutes.
  # The `> 0` filter drops series whose limit is 0 (reported for containers
  # that have no memory limit). Without it the division yields +Inf, which
  # satisfies `> 80` and fires permanently for every unlimited container.
  - alert: ContainerHighMemory
    expr: |
      container_memory_usage_bytes{id=~"/docker/.+"}
      / (container_spec_memory_limit_bytes{id=~"/docker/.+"} > 0) * 100 > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Container {{ $labels.name }} high memory"
      description: "Container {{ $labels.name }} memory usage is {{ $value | humanize }}%"

  # Container Restart: the start timestamp changed within the last 5 minutes.
  # container_start_time_seconds is a gauge, so changes() is used; increase()
  # is a counter function and is not meant for gauges.
  - alert: ContainerRestarted
    expr: |
      changes(container_start_time_seconds{id=~"/docker/.+"}[5m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: "Container {{ $labels.name }} restarted"
      description: "Container {{ $labels.name }} has restarted."
# Alerts scoped to the mana-core-auth job: availability, login and token
# failure ratios, abuse indicators, latency, and OIDC token endpoint errors.
- name: auth_service_alerts
  rules:

  # Auth target reports up == 0 for 30 seconds.
  - alert: AuthServiceDown
    expr: up{job="mana-core-auth"} == 0
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Auth Service is down"
      description: "mana-core-auth has been down for more than 30 seconds. All authentication will fail."

  # More than 50% of /auth/login requests return 401, for 5 minutes.
  - alert: HighLoginFailureRate
    expr: |
      sum(rate(http_requests_total{job="mana-core-auth",route="/auth/login",status="401"}[5m]))
      / sum(rate(http_requests_total{job="mana-core-auth",route="/auth/login"}[5m])) > 0.5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High login failure rate"
      description: "{{ $value | humanizePercentage }} of login attempts are failing."

  # More than 1 HTTP 429 response per second, for 5 minutes.
  - alert: HighRateLimitHits
    expr: |
      sum(rate(http_requests_total{job="mana-core-auth",status="429"}[5m])) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Frequent rate limiting on Auth Service"
      description: "Rate limit (429) is being hit {{ $value | humanize }} times/second. Possible attack or misconfiguration."

  # More than 100 failed (401) logins within 5 minutes; fires immediately.
  - alert: PossibleBruteForce
    expr: |
      sum(increase(http_requests_total{job="mana-core-auth",route="/auth/login",status="401"}[5m])) > 100
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Possible brute force attack detected"
      description: "{{ $value | humanize }} failed login attempts in the last 5 minutes."

  # More than 1 successful (201) registration per second, for 5 minutes.
  - alert: RegistrationSpike
    expr: |
      sum(rate(http_requests_total{job="mana-core-auth",route="/auth/register",status="201"}[5m])) > 1
    for: 5m
    labels:
      severity: info
    annotations:
      summary: "High registration activity"
      description: "{{ $value | humanize }} registrations per second. Verify this is expected."

  # More than 30% of /auth/refresh requests return 4xx, for 10 minutes.
  - alert: HighTokenRefreshFailures
    expr: |
      sum(rate(http_requests_total{job="mana-core-auth",route="/auth/refresh",status=~"4.."}[5m]))
      / sum(rate(http_requests_total{job="mana-core-auth",route="/auth/refresh"}[5m])) > 0.3
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High token refresh failure rate"
      description: "{{ $value | humanizePercentage }} of token refresh attempts are failing."

  # More than 50 /auth/forgot-password requests within 5 minutes (possible
  # enumeration attack); fires immediately.
  - alert: PasswordResetFlood
    expr: |
      sum(increase(http_requests_total{job="mana-core-auth",route="/auth/forgot-password"}[5m])) > 50
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Unusual password reset activity"
      description: "{{ $value | humanize }} password reset requests in the last 5 minutes."

  # Fewer than 50% of users are verified, sustained for 1 hour.
  - alert: LowVerificationRate
    expr: |
      auth_users_verified{job="mana-core-auth"} / auth_users_total{job="mana-core-auth"} < 0.5
    for: 1h
    labels:
      severity: info
    annotations:
      summary: "Low email verification rate"
      description: "Only {{ $value | humanizePercentage }} of users have verified their email."

  # Auth p95 request duration above 0.5 s (500 ms) for 5 minutes.
  - alert: AuthServiceSlow
    expr: |
      histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job="mana-core-auth"}[5m])) by (le)) > 0.5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Auth Service responding slowly"
      description: "Auth service p95 latency is {{ $value | humanizeDuration }}. This may impact all services."

  # More than 0.1 5xx responses per second on the two token routes, for 5 min.
  - alert: OIDCTokenErrors
    expr: |
      sum(rate(http_requests_total{job="mana-core-auth",route=~"/api/auth/oauth2/token|/api/oidc/token",status=~"5.."}[5m])) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "OIDC token endpoint errors"
      description: "OIDC token endpoint is returning 5xx errors. SSO may be affected."
# Alerts for the mana-llm job: availability, error ratio, share of traffic
# served by the Google provider, and latency.
- name: llm_alerts
  rules:

  # mana-llm target reports up == 0 for 1 minute.
  - alert: LLMServiceDown
    expr: up{job="mana-llm"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "mana-llm service is down"
      description: "mana-llm has been down for more than 1 minute. All AI features will fail."

  # Errors exceed 10% of requests for 5 minutes.
  # The + 0.001 term keeps the denominator non-zero when there is no traffic.
  - alert: LLMHighErrorRate
    expr: |
      sum(rate(mana_llm_llm_errors_total[5m]))
      / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High LLM error rate"
      description: "{{ $value | humanizePercentage }} of LLM requests are failing."

  # More than 50% of requests carry provider="google" for 10 minutes, taken
  # as a sign that Ollama is unavailable. Same + 0.001 denominator guard.
  - alert: OllamaProviderDown
    expr: |
      sum(rate(mana_llm_llm_requests_total{provider="google"}[5m]))
      / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.5
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Ollama appears down — most requests going to Google fallback"
      description: "{{ $value | humanizePercentage }} of LLM requests are using Google Gemini fallback."

  # p95 LLM latency across all series above 30 seconds for 5 minutes.
  - alert: LLMSlowResponses
    expr: |
      histogram_quantile(0.95, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le)) > 30
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "LLM responses are slow"
      description: "LLM p95 latency is {{ $value | humanizeDuration }}."