mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 20:21:09 +02:00
feat(monitoring): add alerting stack and maintenance scripts
Medium-priority stability improvements:

Alerting:
- Add vmalert for evaluating Prometheus alert rules
- Add alertmanager for alert routing and grouping
- Add alert-notifier service for Telegram/ntfy notifications
- Enable cadvisor scraping in prometheus config

Disk Monitoring:
- Add check-disk-space.sh for hourly disk monitoring
- Alert on 80% (warning) and 90% (critical) thresholds
- Auto-cleanup Docker when disk is critical
- Add com.manacore.disk-check.plist for launchd

Weekly Reports:
- Add weekly-report.sh for system health summary
- Includes: backup status, disk usage, container health, database stats, error log summary
- Runs every Sunday at 10 AM via launchd

Health Check Updates:
- Add checks for vmalert, alertmanager, alert-notifier

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
02a5172c7c
commit
acc8de36ee
11 changed files with 996 additions and 10 deletions
|
|
@ -1431,6 +1431,11 @@ services:
|
|||
- /dev/disk/:/dev/disk:ro
|
||||
ports:
|
||||
- "9110:8080"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
postgres-exporter:
|
||||
image: prometheuscommunity/postgres-exporter:v0.15.0
|
||||
|
|
@ -1481,6 +1486,80 @@ services:
|
|||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# ============================================
|
||||
# Alerting Stack (Ports 8880, 9093, 9095)
|
||||
# ============================================
|
||||
|
||||
vmalert:
  # Rule evaluator: loads the alert rules mounted under /etc/alerts, queries
  # the victoriametrics service and sends notifications to alertmanager.
  container_name: mana-mon-vmalert
  image: victoriametrics/vmalert:v1.99.0
  restart: always
  # Both the datasource and the notifier target must report healthy first.
  depends_on:
    victoriametrics:
      condition: service_healthy
    alertmanager:
      condition: service_healthy
  ports:
    - "8880:8880"
  volumes:
    # Rules file is mounted read-only; it is matched by the -rule glob below.
    - ./docker/prometheus/alerts.yml:/etc/alerts/alerts.yml:ro
  # Arguments handed to the vmalert binary, in their original order.
  command:
    - "-datasource.url=http://victoriametrics:9090"
    - "-notifier.url=http://alertmanager:9093"
    - "-remoteWrite.url=http://victoriametrics:9090"
    - "-remoteRead.url=http://victoriametrics:9090"
    - "-rule=/etc/alerts/*.yml"
    - "-evaluationInterval=30s"
    - "-httpListenAddr=:8880"
  healthcheck:
    # Probes the listen address configured via -httpListenAddr above.
    test:
      - "CMD"
      - "wget"
      - "--no-verbose"
      - "--tries=1"
      - "--spider"
      - "http://127.0.0.1:8880/health"
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
alertmanager:
  # Alert routing and grouping; started only after alert-notifier is healthy.
  container_name: mana-mon-alertmanager
  image: prom/alertmanager:v0.27.0
  restart: always
  depends_on:
    alert-notifier:
      condition: service_healthy
  ports:
    - "9093:9093"
  volumes:
    # Configuration is mounted read-only at the path given to --config.file.
    - ./docker/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    # Named volume backing the directory given to --storage.path.
    - alertmanager_data:/alertmanager
  command:
    - "--config.file=/etc/alertmanager/alertmanager.yml"
    - "--storage.path=/alertmanager"
    - "--web.listen-address=:9093"
  healthcheck:
    # Probes the address configured via --web.listen-address above.
    test:
      - "CMD"
      - "wget"
      - "--no-verbose"
      - "--tries=1"
      - "--spider"
      - "http://127.0.0.1:9093/-/healthy"
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
alert-notifier:
  # Locally built notification service (Telegram/ntfy, per the commit
  # message). alertmanager waits on this service's healthcheck before starting.
  build:
    context: ./docker/alert-notifier
    dockerfile: Dockerfile
  image: alert-notifier:local
  container_name: mana-mon-alert-notifier
  restart: always
  environment:
    # Quoted so the value is passed as a string, like every other env value.
    PORT: "8080"
    # Taken from the host environment / .env file; each falls back to an
    # empty string when unset (the ":-" default).
    TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
    TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID:-}
    NTFY_TOPIC: ${NTFY_TOPIC:-}
  ports:
    # Host port 9095 -> container port 8080 (matches PORT above).
    - "9095:8080"
  healthcheck:
    # NOTE(review): assumes the image built from ./docker/alert-notifier ships
    # wget and serves /health on PORT; confirm against that Dockerfile.
    test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/health"]
    interval: 30s
    timeout: 5s
    retries: 3
    start_period: 5s
|
||||
|
||||
# ============================================
|
||||
# Auto-Update (Watchtower)
|
||||
# ============================================
|
||||
|
|
@ -1508,6 +1587,8 @@ volumes:
|
|||
name: mana-redis-data
|
||||
victoriametrics_data:
|
||||
name: mana-victoria-data
|
||||
alertmanager_data:
|
||||
name: mana-alertmanager-data
|
||||
grafana_data:
|
||||
name: mana-grafana-data
|
||||
analytics_data:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue