feat(monitoring): add alerting stack and maintenance scripts

Medium-priority stability improvements:

Alerting:
- Add vmalert for evaluating Prometheus alert rules
- Add alertmanager for alert routing and grouping
- Add alert-notifier service for Telegram/ntfy notifications
- Enable cadvisor scraping in prometheus config
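
A sketch of the cadvisor scrape job this enables in the Prometheus config; the job name, interval, and target are assumptions (8080 is cadvisor's default port, resolved here via Docker's service DNS):

scrape_configs:
  - job_name: 'cadvisor'              # job label assumed
    scrape_interval: 30s              # interval assumed
    static_configs:
      - targets: ['cadvisor:8080']    # cadvisor's default port via Docker DNS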

Disk Monitoring:
- Add check-disk-space.sh for hourly disk monitoring
- Alert on 80% (warning) and 90% (critical) thresholds
- Auto-cleanup Docker when disk is critical
- Add com.manacore.disk-check.plist for launchd
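
A minimal sketch of the threshold logic check-disk-space.sh could implement; the 80/90 thresholds come from the bullets above, while the filesystem path, output format, and exact cleanup command are assumptions:

#!/bin/sh
# Root filesystem usage as a bare percentage (strip the '%' sign)
usage=$(df -P / | awk 'NR==2 {print $5}' | tr -d '%')

if [ "$usage" -ge 90 ]; then
  echo "CRITICAL: disk at ${usage}%"
  # Assumed cleanup step: reclaim space from unused Docker data
  docker system prune -af
elif [ "$usage" -ge 80 ]; then
  echo "WARNING: disk at ${usage}%"
fi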

Weekly Reports:
- Add weekly-report.sh for system health summary
- Includes: backup status, disk usage, container health,
  database stats, error log summary
- Runs every Sunday at 10 AM via launchd
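
The Sunday 10 AM schedule maps naturally onto launchd's StartCalendarInterval. A sketch of the plist; the label and script path are assumptions, following the com.manacore.* naming used by the disk check:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
  "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>com.manacore.weekly-report</string>           <!-- assumed label -->
    <key>ProgramArguments</key>
    <array>
        <string>/usr/local/bin/weekly-report.sh</string>  <!-- assumed path -->
    </array>
    <key>StartCalendarInterval</key>
    <dict>
        <key>Weekday</key>
        <integer>0</integer>  <!-- 0 = Sunday -->
        <key>Hour</key>
        <integer>10</integer>
        <key>Minute</key>
        <integer>0</integer>
    </dict>
</dict>
</plist>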

Health Check Updates:
- Add checks for vmalert, alertmanager, alert-notifier
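
These checks can be plain HTTP probes; /-/healthy is Alertmanager's standard liveness endpoint and /health is the VictoriaMetrics convention that vmalert follows, while the alert-notifier path and all host/port values are assumptions:

# vmalert (8880 is its default listen port)
curl -fsS http://localhost:8880/health || echo "vmalert unhealthy"
# alertmanager (9093 is its default listen port)
curl -fsS http://localhost:9093/-/healthy || echo "alertmanager unhealthy"
# alert-notifier (custom service; endpoint assumed)
curl -fsS http://localhost:8080/health || echo "alert-notifier unhealthy"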

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Till-JS 2026-02-12 13:46:57 +01:00
parent 02a5172c7c
commit acc8de36ee
11 changed files with 996 additions and 10 deletions

@@ -0,0 +1,62 @@
# Alertmanager Configuration for ManaCore
# Sends alerts via webhook to custom notification handler

global:
  resolve_timeout: 5m

route:
  # Default receiver for all alerts
  receiver: 'webhook'
  # Group alerts by severity and service
  group_by: ['alertname', 'severity', 'job']
  # Wait before sending the first notification for a new group
  group_wait: 30s
  # Wait before sending follow-up notifications for the same group
  group_interval: 5m
  # Wait before re-sending a notification for a still-firing alert
  repeat_interval: 4h

  routes:
    # Critical alerts - immediate notification
    - match:
        severity: critical
      receiver: 'webhook'
      group_wait: 10s
      repeat_interval: 1h
    # Warning alerts - less frequent
    - match:
        severity: warning
      receiver: 'webhook'
      group_wait: 1m
      repeat_interval: 6h
    # Info alerts - batch together, notify at most daily
    - match:
        severity: info
      receiver: 'webhook'
      group_wait: 5m
      repeat_interval: 24h

receivers:
  - name: 'webhook'
    webhook_configs:
      - url: 'http://alert-notifier:8080/webhook'
        send_resolved: true
        max_alerts: 10

# Inhibition rules - prevent redundant alerts
inhibit_rules:
  # Don't alert on warnings if critical is firing for the same service
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'job']
  # Don't alert on service-specific issues if PostgreSQL is down
  - source_match:
      alertname: 'PostgreSQLDown'
    target_match_re:
      alertname: '.*(Backend|Service).*'
    equal: []
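
Since every route delivers to the same webhook receiver, alert-notifier has to parse Alertmanager's standard webhook payload (version 4 of the webhook format). A sketch of the shape it receives; the alert name and label values here are illustrative, not from this commit:

{
  "version": "4",
  "status": "firing",
  "receiver": "webhook",
  "groupLabels": { "alertname": "HighDiskUsage", "severity": "warning" },
  "alerts": [
    {
      "status": "firing",
      "labels": { "alertname": "HighDiskUsage", "severity": "warning", "job": "node" },
      "annotations": { "summary": "Disk usage above 80%" },
      "startsAt": "2026-02-12T12:00:00Z",
      "endsAt": "0001-01-01T00:00:00Z"
    }
  ]
}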