mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 19:01:08 +02:00
feat(monitoring): add alerting stack and maintenance scripts
Medium priority stability improvements: Alerting: - Add vmalert for evaluating Prometheus alert rules - Add alertmanager for alert routing and grouping - Add alert-notifier service for Telegram/ntfy notifications - Enable cadvisor scraping in prometheus config Disk Monitoring: - Add check-disk-space.sh for hourly disk monitoring - Alert on 80% (warning) and 90% (critical) thresholds - Auto-cleanup Docker when disk is critical - Add com.manacore.disk-check.plist for LaunchD Weekly Reports: - Add weekly-report.sh for system health summary - Includes: backup status, disk usage, container health, database stats, error log summary - Runs every Sunday at 10 AM via LaunchD Health Check Updates: - Add checks for vmalert, alertmanager, alert-notifier Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
02a5172c7c
commit
acc8de36ee
11 changed files with 996 additions and 10 deletions
62
docker/alertmanager/alertmanager.yml
Normal file
62
docker/alertmanager/alertmanager.yml
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
# Alertmanager Configuration for ManaCore
|
||||
# Sends alerts via webhook to custom notification handler
|
||||
|
||||
global:
  # Time after which an alert is declared resolved when no further update
  # arrives for it (applies to alerts that were sent without an end time).
  resolve_timeout: 5m
|
||||
|
||||
route:
  # Root route: every alert enters here. Alerts that match none of the child
  # routes below are delivered with these settings.
  receiver: 'webhook'
  # Alerts sharing the same alert name, severity and job are batched into a
  # single notification
  group_by: ['alertname', 'severity', 'job']
  # Delay before the first notification of a new group, so alerts that start
  # firing together arrive in one message
  group_wait: 30s
  # Delay before notifying about new alerts added to a group that has
  # already been notified
  group_interval: 5m
  # Delay before re-sending a notification for alerts that are still firing
  repeat_interval: 4h

  # Child routes are evaluated in order; the first match wins. All of them
  # deliver to the same receiver and differ only in timing.
  # `matchers` replaces the deprecated `match` field (same semantics).
  routes:
    # Critical alerts - short batching delay, hourly reminders
    - matchers:
        - 'severity="critical"'
      receiver: 'webhook'
      group_wait: 10s
      repeat_interval: 1h

    # Warning alerts - less frequent
    - matchers:
        - 'severity="warning"'
      receiver: 'webhook'
      group_wait: 1m
      repeat_interval: 6h

    # Info alerts - batched for 5 minutes, reminded at most once a day.
    # NOTE: no time_intervals / active_time_intervals are configured in this
    # file, so info alerts are delivered around the clock, not only during
    # business hours.
    - matchers:
        - 'severity="info"'
      receiver: 'webhook'
      group_wait: 5m
      repeat_interval: 24h
|
||||
|
||||
# Notification targets. The single 'webhook' receiver posts alert groups to
# the alert-notifier service.
receivers:
  - name: 'webhook'
    webhook_configs:
      - url: 'http://alert-notifier:8080/webhook'
        # At most 10 alerts are included per webhook message; any beyond
        # that are truncated from the payload
        max_alerts: 10
        # Also send a notification when alerts resolve
        send_resolved: true
|
||||
|
||||
# Inhibition rules - prevent redundant alerts.
# A firing alert matching the source matchers mutes alerts matching the
# target matchers when both carry equal values for every label in `equal`.
# `source_matchers` / `target_matchers` replace the deprecated
# `source_match` / `target_match` / `target_match_re` fields (same semantics).
inhibit_rules:
  # Don't alert on warnings if critical is firing for same service.
  # NOTE(review): this only takes effect when the warning and the critical
  # alert share the same alertname and job - confirm the alert rules are
  # named that way.
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['alertname', 'job']

  # Don't alert on service-specific issues if PostgreSQL is down.
  # The empty `equal` list requires no shared labels, so one firing
  # PostgreSQLDown alert mutes every alert whose name contains "Backend" or
  # "Service".
  - source_matchers:
      - 'alertname="PostgreSQLDown"'
    target_matchers:
      - 'alertname=~".*(Backend|Service).*"'
    equal: []
|
||||
Loading…
Add table
Add a link
Reference in a new issue