# managarten/docker/prometheus/alerts.yml

groups:
  # Application-level rules: scrape health, HTTP 5xx ratio, request latency,
  # and Node.js runtime gauges.
  - name: service_alerts
    rules:
      # Fires when `up` is 0 for any job matched by the regex, held for 1 minute.
      - alert: ServiceDown
        expr: up{job=~"mana-auth|.*-backend|mana-search|mana-media|mana-llm"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.job }} is down'
          description: '{{ $labels.job }} has been down for more than 1 minute.'

      # Per-job share of requests with a 5xx status (5-minute rate window)
      # above 5%, held for 5 minutes.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)
          / sum(rate(http_requests_total[5m])) by (job) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High error rate on {{ $labels.job }}'
          description: '{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 5%)'

      # Same ratio as HighErrorRate, with a 20% threshold and a shorter
      # 2-minute hold; escalates to critical.
      - alert: VeryHighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)
          / sum(rate(http_requests_total[5m])) by (job) > 0.20
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Very high error rate on {{ $labels.job }}'
          description: '{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 20%)'

      # Per-job p95 of http_request_duration_seconds above 2 s, held for
      # 5 minutes.
      - alert: SlowResponseTime
        expr: |
          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Slow response time on {{ $labels.job }}'
          description: '{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}'

      # Same p95 as SlowResponseTime, with a 5 s threshold and a 2-minute hold;
      # escalates to critical.
      - alert: VerySlowResponseTime
        expr: |
          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Very slow response time on {{ $labels.job }}'
          description: '{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}'

      # Node.js heap in use above 500 * 1024 * 1024 bytes (500 MiB), held for
      # 10 minutes.
      - alert: HighHeapMemory
        expr: nodejs_heap_size_used_bytes > 500 * 1024 * 1024
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High heap memory on {{ $labels.job }}'
          description: '{{ $labels.job }} heap usage is {{ $value | humanize1024 }}B'

      # Node.js event loop lag above 0.1 s (100 ms), held for 5 minutes.
      - alert: HighEventLoopLag
        expr: nodejs_eventloop_lag_seconds > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High event loop lag on {{ $labels.job }}'
          description: '{{ $labels.job }} event loop lag is {{ $value | humanizeDuration }}'

  # Host-level rules: CPU, memory, macOS host disks, and the Colima VM disk.
  - name: infrastructure_alerts
    rules:
      # Non-idle CPU percentage above 80%, held for 10 minutes. The avg() has no
      # grouping, so every matched idle-mode series is averaged into one value.
      - alert: HighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High CPU usage on host'
          description: 'CPU usage is {{ $value | humanize }}%'

      # Same computation as HighCPUUsage, with a 95% threshold and a 5-minute
      # hold; escalates to critical.
      - alert: VeryHighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Very high CPU usage on host'
          description: 'CPU usage is {{ $value | humanize }}%'

      # Used memory, computed as 1 - MemAvailable / MemTotal, above 85%,
      # held for 10 minutes.
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High memory usage on host'
          description: 'Memory usage is {{ $value | humanize }}%'

      # Same computation as HighMemoryUsage, with a 95% threshold and a
      # 5-minute hold; escalates to critical.
      - alert: VeryHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Very high memory usage on host'
          description: 'Memory usage is {{ $value | humanize }}%'

      # macOS host disk usage above 80% on the "internal" or "manaData" disk,
      # held for 10 minutes.
      # These metrics arrive via Pushgateway because node-exporter runs inside
      # the VM; scripts/mac-mini/disk-metrics.sh pushes them every 5 minutes
      # via launchd.
      - alert: HighDiskUsage
        expr: |
          mac_disk_used_percent{disk=~"internal|manaData"} > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High disk usage on {{ $labels.disk }} ({{ $labels.mountpoint }})'
          description: 'Disk usage is {{ $value | humanize }}% — {{ $labels.avail_human }} free'

      # Same metric as HighDiskUsage, with a 90% threshold and only a 2-minute
      # hold so it is raised quickly; escalates to critical.
      - alert: VeryHighDiskUsage
        expr: |
          mac_disk_used_percent{disk=~"internal|manaData"} > 90
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'CRITICAL: Disk {{ $labels.disk }} almost full ({{ $labels.mountpoint }})'
          description: 'Disk usage is {{ $value | humanize }}% — only {{ $labels.avail_human }} free. Server may crash.'

      # Actual usage of the sparse Colima VM datadisk above 150 GB, held for
      # 30 minutes.
      - alert: ColimaVMDiskLarge
        expr: |
          mac_colima_disk_used_gb > 150
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: 'Colima VM disk is {{ $value | humanize }}GB — consider pruning Docker images'
          description: 'Run: docker system prune -f && docker image prune -a'

  # Datastore rules for PostgreSQL and Redis: availability, connection count,
  # cache efficiency, memory, and blocked clients.
  - name: database_alerts
    rules:
      # pg_up reports 0, held for 1 minute.
      - alert: PostgreSQLDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'PostgreSQL is down'
          description: 'PostgreSQL has been down for more than 1 minute.'

      # redis_up reports 0, held for 1 minute.
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Redis is down'
          description: 'Redis has been down for more than 1 minute.'

      # Sum of pg_stat_activity_count across all series above 80, held for
      # 5 minutes.
      - alert: PostgreSQLHighConnections
        expr: sum(pg_stat_activity_count) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High PostgreSQL connections'
          description: 'PostgreSQL has {{ $value }} connections (> 80)'

      # Average block cache hit ratio (hit / (hit + read)) below 90%, held for
      # 10 minutes. Databases named "postgres" or matching "template.*" are
      # excluded. The 0.0001 added to the denominator keeps it non-zero.
      - alert: PostgreSQLLowCacheHitRatio
        expr: |
          avg(pg_stat_database_blks_hit{datname!~"template.*|postgres"}
          / (pg_stat_database_blks_hit{datname!~"template.*|postgres"}
          + pg_stat_database_blks_read{datname!~"template.*|postgres"} + 0.0001)) * 100 < 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'PostgreSQL low cache hit ratio'
          description: 'PostgreSQL cache hit ratio is {{ $value | humanize }}%'

      # Redis memory in use above 1024 * 1024 * 1024 bytes (1 GiB), held for
      # 10 minutes.
      - alert: RedisHighMemory
        expr: redis_memory_used_bytes > 1024 * 1024 * 1024
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Redis high memory usage'
          description: 'Redis memory usage is {{ $value | humanize1024 }}B'

      # At least one blocked Redis client, held for 5 minutes.
      - alert: RedisBlockedClients
        expr: redis_blocked_clients > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Redis has blocked clients'
          description: 'Redis has {{ $value }} blocked clients'

  # Per-container rules for Docker containers (series whose cgroup id matches
  # "/docker/.+"): CPU, memory relative to the configured limit, and restarts.
  - name: container_alerts
    rules:
      # Container High CPU (> 80% of one CPU core).
      # The expression sums CPU-seconds consumed per second for each container
      # name and scales by 100, so 100 means one fully used core. It is NOT
      # relative to any configured CPU limit.
      - alert: ContainerHighCPU
        expr: |
          sum(rate(container_cpu_usage_seconds_total{id=~"/docker/.+"}[5m])) by (name) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: "Container {{ $labels.name }} CPU usage is {{ $value | humanize }}%"

      # Container High Memory (> 80% of limit).
      # The `> 0` filter drops containers that have no memory limit: their
      # limit metric is 0, which would make the ratio +Inf and keep the alert
      # firing permanently for every unlimited container.
      - alert: ContainerHighMemory
        expr: |
          container_memory_usage_bytes{id=~"/docker/.+"}
          / (container_spec_memory_limit_bytes{id=~"/docker/.+"} > 0) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory"
          description: "Container {{ $labels.name }} memory usage is {{ $value | humanize }}%"

      # Container Restart.
      # container_start_time_seconds is a gauge holding a timestamp, so
      # changes() is used to detect a new start time within the last 5 minutes;
      # increase() is only meaningful for counters.
      - alert: ContainerRestarted
        expr: |
          changes(container_start_time_seconds{id=~"/docker/.+"}[5m]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: "Container {{ $labels.name }} restarted"
          description: "Container {{ $labels.name }} has restarted."

  # Rules specific to the mana-auth job: availability, login/refresh failure
  # ratios, abuse indicators, latency, and OIDC token endpoint errors.
  - name: auth_service_alerts
    rules:
      # mana-auth scrape target reports up == 0, held for 30 seconds.
      - alert: AuthServiceDown
        expr: up{job="mana-auth"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: 'Auth Service is down'
          description: 'mana-auth has been down for more than 30 seconds. All authentication will fail.'

      # More than 50% of requests to /auth/login answered with 401
      # (5-minute rate window), held for 5 minutes.
      - alert: HighLoginFailureRate
        expr: |
          sum(rate(http_requests_total{job="mana-auth",route="/auth/login",status="401"}[5m]))
          / sum(rate(http_requests_total{job="mana-auth",route="/auth/login"}[5m])) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High login failure rate'
          description: '{{ $value | humanizePercentage }} of login attempts are failing.'

      # More than 1 response per second with status 429 on mana-auth,
      # held for 5 minutes.
      - alert: HighRateLimitHits
        expr: |
          sum(rate(http_requests_total{job="mana-auth",status="429"}[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Frequent rate limiting on Auth Service'
          description: 'Rate limit (429) is being hit {{ $value | humanize }} times/second. Possible attack or misconfiguration.'

      # More than 100 responses with status 401 on /auth/login within 5 minutes.
      # `for: 0m` raises the alert on the first evaluation that matches.
      - alert: PossibleBruteForce
        expr: |
          sum(increase(http_requests_total{job="mana-auth",route="/auth/login",status="401"}[5m])) > 100
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Possible brute force attack detected'
          description: '{{ $value | humanize }} failed login attempts in the last 5 minutes.'

      # More than 1 successful (201) registration per second on /auth/register,
      # held for 5 minutes. Informational only.
      - alert: RegistrationSpike
        expr: |
          sum(rate(http_requests_total{job="mana-auth",route="/auth/register",status="201"}[5m])) > 1
        for: 5m
        labels:
          severity: info
        annotations:
          summary: 'High registration activity'
          description: '{{ $value | humanize }} registrations per second. Verify this is expected.'

      # More than 30% of requests to /auth/refresh answered with a 4xx status,
      # held for 10 minutes.
      - alert: HighTokenRefreshFailures
        expr: |
          sum(rate(http_requests_total{job="mana-auth",route="/auth/refresh",status=~"4.."}[5m]))
          / sum(rate(http_requests_total{job="mana-auth",route="/auth/refresh"}[5m])) > 0.3
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High token refresh failure rate'
          description: '{{ $value | humanizePercentage }} of token refresh attempts are failing.'

      # More than 50 requests to /auth/forgot-password within 5 minutes
      # (possible enumeration attack). `for: 0m` raises the alert immediately.
      - alert: PasswordResetFlood
        expr: |
          sum(increase(http_requests_total{job="mana-auth",route="/auth/forgot-password"}[5m])) > 50
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Unusual password reset activity'
          description: '{{ $value | humanize }} password reset requests in the last 5 minutes.'

      # Ratio of auth_users_verified to auth_users_total below 0.5, held for
      # 1 hour. The expression covers all users and has no account-age window.
      - alert: LowVerificationRate
        expr: |
          auth_users_verified{job="mana-auth"} / auth_users_total{job="mana-auth"} < 0.5
        for: 1h
        labels:
          severity: info
        annotations:
          summary: 'Low email verification rate'
          description: 'Only {{ $value | humanizePercentage }} of users have verified their email.'

      # p95 request duration of mana-auth above 0.5 s (500 ms), held for
      # 5 minutes.
      - alert: AuthServiceSlow
        expr: |
          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job="mana-auth"}[5m])) by (le)) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Auth Service responding slowly'
          description: 'Auth service p95 latency is {{ $value | humanizeDuration }}. This may impact all services.'

      # More than 0.1 responses per second with a 5xx status on either OIDC
      # token route, held for 5 minutes.
      - alert: OIDCTokenErrors
        expr: |
          sum(rate(http_requests_total{job="mana-auth",route=~"/api/auth/oauth2/token|/api/oidc/token",status=~"5.."}[5m])) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'OIDC token endpoint errors'
          description: 'OIDC token endpoint is returning 5xx errors. SSO may be affected.'

  # Probe-based rules built on probe_success / probe_duration_seconds from the
  # blackbox-* jobs. Annotation texts are user-facing and kept in German.
  - name: uptime_alerts
    rules:
      # HTTP probe of a web app (job blackbox-web) failing, held for 2 minutes.
      - alert: WebAppDown
        expr: probe_success{job="blackbox-web"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Web App offline: {{ $labels.instance }}'
          description: '{{ $labels.instance }} hat seit 2 Minuten keine gültige HTTP-Antwort zurückgegeben.'

      # Probe of an API health endpoint (job blackbox-api) failing, held for
      # 1 minute.
      - alert: APIDown
        expr: probe_success{job="blackbox-api"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'API offline: {{ $labels.instance }}'
          description: '{{ $labels.instance }} antwortet nicht auf den Health-Endpoint.'

      # Probe of an infrastructure tool such as Grafana or Git (job
      # blackbox-infra) failing, held for 3 minutes.
      - alert: InfraToolDown
        expr: probe_success{job="blackbox-infra"} == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: 'Infra-Dienst offline: {{ $labels.instance }}'
          description: '{{ $labels.instance }} ist seit 3 Minuten nicht erreichbar.'

      # Probe of a GPU server service (job blackbox-gpu) failing, held for
      # 5 minutes.
      - alert: GPUServiceDown
        expr: probe_success{job="blackbox-gpu"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'GPU-Dienst offline: {{ $labels.instance }}'
          description: '{{ $labels.instance }} (GPU-Server) ist seit 5 Minuten nicht erreichbar.'

      # Probe duration above 5 s for web or API targets, held for 5 minutes.
      - alert: SlowHTTPResponse
        expr: probe_duration_seconds{job=~"blackbox-web|blackbox-api"} > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Langsame HTTP-Antwort: {{ $labels.instance }}'
          description: '{{ $labels.instance }} antwortet mit {{ $value | humanizeDuration }} (> 5s).'

  # Rules for the mana-llm service: availability, error ratio, fallback
  # provider share, and latency.
  - name: llm_alerts
    rules:
      # mana-llm scrape target reports up == 0, held for 1 minute.
      - alert: LLMServiceDown
        expr: up{job="mana-llm"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'mana-llm service is down'
          description: 'mana-llm has been down for more than 1 minute. All AI features will fail.'

      # LLM errors divided by LLM requests above 10%, held for 5 minutes.
      # The 0.001 added to the denominator keeps it non-zero when there is
      # no traffic.
      - alert: LLMHighErrorRate
        expr: |
          sum(rate(mana_llm_llm_errors_total[5m]))
          / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High LLM error rate'
          description: '{{ $value | humanizePercentage }} of LLM requests are failing.'

      # More than 50% of LLM requests served by provider "google", held for
      # 10 minutes. Treated as a sign that Ollama is down and traffic is going
      # to the fallback provider.
      - alert: OllamaProviderDown
        expr: |
          sum(rate(mana_llm_llm_requests_total{provider="google"}[5m]))
          / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Ollama appears down — most requests going to Google fallback'
          description: '{{ $value | humanizePercentage }} of LLM requests are using Google Gemini fallback.'

      # p95 of mana_llm_llm_latency_seconds above 30 s, held for 5 minutes.
      - alert: LLMSlowResponses
        expr: |
          histogram_quantile(0.95, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le)) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'LLM responses are slow'
          description: 'LLM p95 latency is {{ $value | humanizeDuration }}.'