groups:
  - name: service_alerts
    rules:
      # Service Down Alert
      - alert: ServiceDown
        expr: up{job=~"mana-core-auth|.*-backend|mana-search|mana-media|mana-llm|synapse"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.job }} has been down for more than 1 minute."

      # High Error Rate (> 5% of requests are 5xx)
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)
          /
          sum(rate(http_requests_total[5m])) by (job)
          > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 5%)"

      # Very High Error Rate (> 20% of requests are 5xx)
      - alert: VeryHighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)
          /
          sum(rate(http_requests_total[5m])) by (job)
          > 0.20
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Very high error rate on {{ $labels.job }}"
          description: "{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 20%)"

      # Slow Response Time (p95 > 2s)
      - alert: SlowResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow response time on {{ $labels.job }}"
          description: "{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}"

      # Very Slow Response Time (p95 > 5s)
      - alert: VerySlowResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Very slow response time on {{ $labels.job }}"
          description: "{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}"

      # High Memory Usage (Node.js heap > 500MB)
      - alert: HighHeapMemory
        expr: nodejs_heap_size_used_bytes > 500 * 1024 * 1024
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High heap memory on {{ $labels.job }}"
          description: "{{ $labels.job }} heap usage is {{ $value | humanize1024 }}B"

      # Event Loop Lag (> 100ms)
      - alert: HighEventLoopLag
        expr: nodejs_eventloop_lag_seconds > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High event loop lag on {{ $labels.job }}"
          description: "{{ $labels.job }} event loop lag is {{ $value | humanizeDuration }}"

  - name: infrastructure_alerts
    rules:
      # High CPU Usage (> 80%)
      - alert: HighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on host"
          description: "CPU usage is {{ $value | humanize }}%"

      # Very High CPU Usage (> 95%)
      - alert: VeryHighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Very high CPU usage on host"
          description: "CPU usage is {{ $value | humanize }}%"

      # High Memory Usage (> 85%)
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on host"
          description: "Memory usage is {{ $value | humanize }}%"

      # Very High Memory Usage (> 95%)
      - alert: VeryHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Very high memory usage on host"
          description: "Memory usage is {{ $value | humanize }}%"
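      # Note on the mac_disk_* metrics used below: they are not scraped from
      # node-exporter but pushed to the Pushgateway by the launchd job mentioned
      # in the next comment. A minimal sketch of such a push, assuming the
      # Pushgateway listens on localhost:9091 (the job name "mac_disk_metrics"
      # and all values are illustrative, not taken from the actual script):
      #
      #   cat <<'PUSH' | curl --data-binary @- http://localhost:9091/metrics/job/mac_disk_metrics
      #   mac_disk_used_percent{disk="internal",mountpoint="/",avail_human="120Gi"} 42
      #   PUSH
      #
      # The disk/mountpoint/avail_human labels mirror the ones the alerts and
      # their annotation templates reference.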
      # High Disk Usage (> 80%) — macOS host disks (via Pushgateway, since node-exporter runs in VM)
      # Metrics pushed by scripts/mac-mini/disk-metrics.sh (runs every 5 min via launchd)
      - alert: HighDiskUsage
        expr: |
          mac_disk_used_percent{disk=~"internal|manaData"} > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage on {{ $labels.disk }} ({{ $labels.mountpoint }})"
          description: "Disk usage is {{ $value | humanize }}% — {{ $labels.avail_human }} free"

      # Very High Disk Usage (> 90%) — immediate alert
      - alert: VeryHighDiskUsage
        expr: |
          mac_disk_used_percent{disk=~"internal|manaData"} > 90
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CRITICAL: Disk {{ $labels.disk }} almost full ({{ $labels.mountpoint }})"
          description: "Disk usage is {{ $value | humanize }}% — only {{ $labels.avail_human }} free. Server may crash."

      # Colima VM disk large (> 150GB actual usage on sparse datadisk)
      - alert: ColimaVMDiskLarge
        expr: |
          mac_colima_disk_used_gb > 150
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Colima VM disk is {{ $value | humanize }}GB — consider pruning Docker images"
          description: "Run: docker system prune -f && docker image prune -a"

  - name: database_alerts
    rules:
      # PostgreSQL Down
      - alert: PostgreSQLDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL has been down for more than 1 minute."

      # Redis Down
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis has been down for more than 1 minute."

      # PostgreSQL High Connections (> 80)
      - alert: PostgreSQLHighConnections
        expr: sum(pg_stat_activity_count) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High PostgreSQL connections"
          description: "PostgreSQL has {{ $value }} connections (> 80)"

      # PostgreSQL Low Cache Hit Ratio (< 90%)
      - alert: PostgreSQLLowCacheHitRatio
        expr: |
          avg(
            pg_stat_database_blks_hit{datname!~"template.*|postgres"}
            /
            (pg_stat_database_blks_hit{datname!~"template.*|postgres"}
             + pg_stat_database_blks_read{datname!~"template.*|postgres"}
             + 0.0001)
          ) * 100 < 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL low cache hit ratio"
          description: "PostgreSQL cache hit ratio is {{ $value | humanize }}%"

      # Redis High Memory (> 1GB)
      - alert: RedisHighMemory
        expr: redis_memory_used_bytes > 1024 * 1024 * 1024
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Redis high memory usage"
          description: "Redis memory usage is {{ $value | humanize1024 }}B"

      # Redis Blocked Clients
      - alert: RedisBlockedClients
        expr: redis_blocked_clients > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis has blocked clients"
          description: "Redis has {{ $value }} blocked clients"

  - name: container_alerts
    rules:
      # Container High CPU (> 80% of one core)
      - alert: ContainerHighCPU
        expr: |
          sum(rate(container_cpu_usage_seconds_total{id=~"/docker/.+"}[5m])) by (name) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: "Container {{ $labels.name }} CPU usage is {{ $value | humanize }}%"

      # Container High Memory (> 80% of limit)
      - alert: ContainerHighMemory
        expr: |
          container_memory_usage_bytes{id=~"/docker/.+"}
          /
          container_spec_memory_limit_bytes{id=~"/docker/.+"} * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory"
          description: "Container {{ $labels.name }} memory usage is {{ $value | humanize }}%"
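      # Note on ContainerHighMemory: cAdvisor typically reports
      # container_spec_memory_limit_bytes for containers without an explicit
      # memory limit as a very large (effectively unlimited) value, so the
      # ratio above stays near zero for them. In practice this alert only
      # meaningfully covers containers that have a memory limit configured.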
      # Container Restart
      - alert: ContainerRestarted
        expr: |
          increase(container_start_time_seconds{id=~"/docker/.+"}[5m]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: "Container {{ $labels.name }} restarted"
          description: "Container {{ $labels.name }} has restarted."

  - name: auth_service_alerts
    rules:
      # Auth Service Down
      - alert: AuthServiceDown
        expr: up{job="mana-core-auth"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Auth Service is down"
          description: "mana-core-auth has been down for more than 30 seconds. All authentication will fail."

      # High Login Failure Rate (> 50% of logins fail with 401)
      - alert: HighLoginFailureRate
        expr: |
          sum(rate(http_requests_total{job="mana-core-auth",route="/auth/login",status="401"}[5m]))
          /
          sum(rate(http_requests_total{job="mana-core-auth",route="/auth/login"}[5m]))
          > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High login failure rate"
          description: "{{ $value | humanizePercentage }} of login attempts are failing."

      # Rate Limiting Triggered Frequently
      - alert: HighRateLimitHits
        expr: |
          sum(rate(http_requests_total{job="mana-core-auth",status="429"}[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Frequent rate limiting on Auth Service"
          description: "Rate limit (429) is being hit {{ $value | humanize }} times/second. Possible attack or misconfiguration."

      # Brute Force Detection (> 100 failed logins in 5 min)
      - alert: PossibleBruteForce
        expr: |
          sum(increase(http_requests_total{job="mana-core-auth",route="/auth/login",status="401"}[5m])) > 100
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Possible brute force attack detected"
          description: "{{ $value | humanize }} failed login attempts in the last 5 minutes."

      # Registration Spike (unusual registration activity)
      - alert: RegistrationSpike
        expr: |
          sum(rate(http_requests_total{job="mana-core-auth",route="/auth/register",status="201"}[5m])) > 1
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "High registration activity"
          description: "{{ $value | humanize }} registrations per second. Verify this is expected."

      # Token Refresh Failures
      - alert: HighTokenRefreshFailures
        expr: |
          sum(rate(http_requests_total{job="mana-core-auth",route="/auth/refresh",status=~"4.."}[5m]))
          /
          sum(rate(http_requests_total{job="mana-core-auth",route="/auth/refresh"}[5m]))
          > 0.3
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High token refresh failure rate"
          description: "{{ $value | humanizePercentage }} of token refresh attempts are failing."

      # Password Reset Flood (possible enumeration attack)
      - alert: PasswordResetFlood
        expr: |
          sum(increase(http_requests_total{job="mana-core-auth",route="/auth/forgot-password"}[5m])) > 50
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Unusual password reset activity"
          description: "{{ $value | humanize }} password reset requests in the last 5 minutes."

      # Low User Verification Rate (less than 50% verified after 1 week)
      - alert: LowVerificationRate
        expr: |
          auth_users_verified{job="mana-core-auth"}
          /
          auth_users_total{job="mana-core-auth"}
          < 0.5
        for: 1h
        labels:
          severity: info
        annotations:
          summary: "Low email verification rate"
          description: "Only {{ $value | humanizePercentage }} of users have verified their email."
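      # The p95 alerts in this file (AuthServiceSlow below, SlowResponseTime,
      # LLMSlowResponses) all use the same PromQL pattern: histogram_quantile()
      # interpolates the quantile from cumulative histogram buckets, so the
      # bucket-boundary label "le" must be kept in the by() clause or the
      # result is empty. A standalone sketch of the pattern (the job value is
      # illustrative):
      #
      #   histogram_quantile(0.95,
      #     sum(rate(http_request_duration_seconds_bucket{job="example"}[5m])) by (le))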
      # Auth Service Slow (p95 > 500ms)
      - alert: AuthServiceSlow
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket{job="mana-core-auth"}[5m])) by (le)) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Auth Service responding slowly"
          description: "Auth service p95 latency is {{ $value | humanizeDuration }}. This may impact all services."

      # OIDC Token Endpoint Errors
      - alert: OIDCTokenErrors
        expr: |
          sum(rate(http_requests_total{job="mana-core-auth",route=~"/api/auth/oauth2/token|/api/oidc/token",status=~"5.."}[5m])) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "OIDC token endpoint errors"
          description: "OIDC token endpoint is returning 5xx errors. SSO may be affected."

  - name: uptime_alerts
    rules:
      # Web App offline (HTTP probe failed)
      - alert: WebAppDown
        expr: probe_success{job="blackbox-web"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Web App offline: {{ $labels.instance }}"
          description: "{{ $labels.instance }} has not returned a valid HTTP response for 2 minutes."

      # API Health Endpoint offline
      - alert: APIDown
        expr: probe_success{job="blackbox-api"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "API offline: {{ $labels.instance }}"
          description: "{{ $labels.instance }} is not responding on the health endpoint."

      # Infra Tool offline (Grafana, Git, etc.)
      - alert: InfraToolDown
        expr: probe_success{job="blackbox-infra"} == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Infra service offline: {{ $labels.instance }}"
          description: "{{ $labels.instance }} has been unreachable for 3 minutes."

      # GPU Server Service offline
      - alert: GPUServiceDown
        expr: probe_success{job="blackbox-gpu"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "GPU service offline: {{ $labels.instance }}"
          description: "{{ $labels.instance }} (GPU server) has been unreachable for 5 minutes."

      # Slow HTTP response (> 5s)
      - alert: SlowHTTPResponse
        expr: probe_duration_seconds{job=~"blackbox-web|blackbox-api"} > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow HTTP response: {{ $labels.instance }}"
          description: "{{ $labels.instance }} is responding in {{ $value | humanizeDuration }} (> 5s)."

  - name: llm_alerts
    rules:
      # mana-llm Down
      - alert: LLMServiceDown
        expr: up{job="mana-llm"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "mana-llm service is down"
          description: "mana-llm has been down for more than 1 minute. All AI features will fail."

      # High LLM Error Rate (> 10%)
      - alert: LLMHighErrorRate
        expr: |
          sum(rate(mana_llm_llm_errors_total[5m]))
          /
          (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001)
          > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High LLM error rate"
          description: "{{ $value | humanizePercentage }} of LLM requests are failing."

      # Ollama Provider Down (all requests going to fallback)
      - alert: OllamaProviderDown
        expr: |
          sum(rate(mana_llm_llm_requests_total{provider="google"}[5m]))
          /
          (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001)
          > 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Ollama appears down — most requests going to Google fallback"
          description: "{{ $value | humanizePercentage }} of LLM requests are using Google Gemini fallback."

      # LLM Slow Responses (p95 > 30s)
      - alert: LLMSlowResponses
        expr: |
          histogram_quantile(0.95,
            sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le)) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "LLM responses are slow"
          description: "LLM p95 latency is {{ $value | humanizeDuration }}."
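
# To sanity-check this file before reloading Prometheus, a typical workflow is
# (the file path is illustrative; the reload endpoint requires Prometheus to
# run with --web.enable-lifecycle):
#
#   promtool check rules /etc/prometheus/rules/alerts.yml
#   curl -X POST http://localhost:9090/-/reload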