feat(monitoring): add alerting stack and maintenance scripts

Medium priority stability improvements:

Alerting:
- Add vmalert for evaluating Prometheus alert rules
- Add alertmanager for alert routing and grouping
- Add alert-notifier service for Telegram/ntfy notifications
- Enable cadvisor scraping in prometheus config

Disk Monitoring:
- Add check-disk-space.sh for hourly disk monitoring
- Alert on 80% (warning) and 90% (critical) thresholds
- Auto-cleanup Docker when disk is critical
- Add com.manacore.disk-check.plist for LaunchD

Weekly Reports:
- Add weekly-report.sh for system health summary
- Includes: backup status, disk usage, container health,
  database stats, error log summary
- Runs every Sunday at 10 AM via LaunchD

Health Check Updates:
- Add checks for vmalert, alertmanager, alert-notifier

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Till-JS 2026-02-12 13:46:57 +01:00
parent 02a5172c7c
commit acc8de36ee
11 changed files with 996 additions and 10 deletions

View file

@ -1431,6 +1431,11 @@ services:
- /dev/disk/:/dev/disk:ro
ports:
- "9110:8080"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"]
interval: 30s
timeout: 10s
retries: 3
postgres-exporter:
image: prometheuscommunity/postgres-exporter:v0.15.0
@ -1481,6 +1486,80 @@ services:
timeout: 10s
retries: 3
# ============================================
# Alerting Stack (Ports 9093-9095)
# ============================================
vmalert:
image: victoriametrics/vmalert:v1.99.0
container_name: mana-mon-vmalert
restart: always
depends_on:
victoriametrics:
condition: service_healthy
alertmanager:
condition: service_healthy
command:
- '-datasource.url=http://victoriametrics:9090'
- '-notifier.url=http://alertmanager:9093'
- '-remoteWrite.url=http://victoriametrics:9090'
- '-remoteRead.url=http://victoriametrics:9090'
- '-rule=/etc/alerts/*.yml'
- '-evaluationInterval=30s'
- '-httpListenAddr=:8880'
volumes:
- ./docker/prometheus/alerts.yml:/etc/alerts/alerts.yml:ro
ports:
- "8880:8880"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8880/health"]
interval: 30s
timeout: 10s
retries: 3
alertmanager:
image: prom/alertmanager:v0.27.0
container_name: mana-mon-alertmanager
restart: always
depends_on:
alert-notifier:
condition: service_healthy
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.listen-address=:9093'
volumes:
- ./docker/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
- alertmanager_data:/alertmanager
ports:
- "9093:9093"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9093/-/healthy"]
interval: 30s
timeout: 10s
retries: 3
alert-notifier:
build:
context: ./docker/alert-notifier
dockerfile: Dockerfile
image: alert-notifier:local
container_name: mana-mon-alert-notifier
restart: always
environment:
PORT: 8080
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID:-}
NTFY_TOPIC: ${NTFY_TOPIC:-}
ports:
- "9095:8080"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/health"]
interval: 30s
timeout: 5s
retries: 3
start_period: 5s
# ============================================
# Auto-Update (Watchtower)
# ============================================
@ -1508,6 +1587,8 @@ volumes:
name: mana-redis-data
victoriametrics_data:
name: mana-victoria-data
alertmanager_data:
name: mana-alertmanager-data
grafana_data:
name: mana-grafana-data
analytics_data: