feat(monitoring): add alerting stack and maintenance scripts

Medium priority stability improvements:

Alerting:
- Add vmalert for evaluating Prometheus alert rules
- Add alertmanager for alert routing and grouping
- Add alert-notifier service for Telegram/ntfy notifications
- Enable cadvisor scraping in prometheus config

Disk Monitoring:
- Add check-disk-space.sh for hourly disk monitoring
- Alert on 80% (warning) and 90% (critical) thresholds
- Auto-cleanup Docker when disk is critical
- Add com.manacore.disk-check.plist for LaunchD

Weekly Reports:
- Add weekly-report.sh for system health summary
- Includes: backup status, disk usage, container health, database stats, error log summary
- Runs every Sunday at 10 AM via LaunchD

Health Check Updates:
- Add checks for vmalert, alertmanager, alert-notifier

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-14 19:41:09 +02:00 · 2026-02-12 13:46:57 +01:00 · 2026-02-12 13:46:57 +01:00 · acc8de36ee
commit acc8de36ee
parent 02a5172c7c
11 changed files with 996 additions and 10 deletions
--- a/docker/prometheus/prometheus.yml
+++ b/docker/prometheus/prometheus.yml
@@ -9,11 +9,11 @@ global:
 rule_files:
  - /etc/prometheus/alerts.yml

-# Alertmanager configuration (optional, for future use)
-# alerting:
-#   alertmanagers:
-#     - static_configs:
-#         - targets: []
+# Alertmanager configuration
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']

 scrape_configs:
  # Prometheus self-monitoring
@@ -30,10 +30,10 @@ scrape_configs:
        target_label: instance
        replacement: 'mac-mini'

-  # Docker container metrics via cAdvisor (disabled - container not deployed)
-  # - job_name: 'cadvisor'
-  #   static_configs:
-  #     - targets: ['cadvisor:8080']
+  # Docker container metrics via cAdvisor
+  - job_name: 'cadvisor'
+    static_configs:
+      - targets: ['cadvisor:8080']

  # PostgreSQL metrics
  - job_name: 'postgres'