feat(monitoring): add alerting stack and maintenance scripts

Medium-priority stability improvements:

Alerting:
- Add vmalert for evaluating Prometheus alert rules
- Add alertmanager for alert routing and grouping
- Add alert-notifier service for Telegram/ntfy notifications
- Enable cadvisor scraping in prometheus config

Disk Monitoring:
- Add check-disk-space.sh for hourly disk monitoring
- Alert on 80% (warning) and 90% (critical) thresholds
- Auto-cleanup Docker when disk is critical
- Add com.manacore.disk-check.plist for LaunchD

Weekly Reports:
- Add weekly-report.sh for system health summary
- Includes: backup status, disk usage, container health,
  database stats, error log summary
- Runs every Sunday at 10 AM via LaunchD

Health Check Updates:
- Add checks for vmalert, alertmanager, alert-notifier

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Till-JS 2026-02-12 13:46:57 +01:00
parent 02a5172c7c
commit acc8de36ee
11 changed files with 996 additions and 10 deletions

View file

@ -0,0 +1,233 @@
#!/bin/bash
#
# ManaCore disk space monitor.
#
# Checks disk usage on the system and data volumes and alerts via
# Telegram/ntfy when a threshold is exceeded.
#
# Thresholds:
#   warning  - 80%
#   critical - 90%
#
# Intended to be run hourly via LaunchD.

set -e

# Put the Homebrew locations in front so the required tools are found.
PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
export PATH

SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PROJECT_ROOT=$(cd "${SCRIPT_DIR}/../.." && pwd)
LOG_FILE=/tmp/manacore-disk-check.log

# Usage percentages at which check_disk raises an alert.
WARNING_THRESHOLD=80
CRITICAL_THRESHOLD=90

# Optional notification settings (read by send_notification).
if [[ -f "${PROJECT_ROOT}/.env.notifications" ]]; then
  . "${PROJECT_ROOT}/.env.notifications"
fi
# Write a timestamped message to stdout and append the same line to $LOG_FILE.
#
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# NOTE(review): the LaunchD plist for this job appears to redirect stdout to
# the same path as LOG_FILE, so each line may end up in the file twice when
# run under LaunchD - confirm and pick one sink.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}
# Send an alert through every configured channel (best effort).
#
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID, NTFY_TOPIC (read)
# Arguments: $1 - message text (may contain Telegram HTML markup)
#            $2 - priority: default | high | critical (default: "default")
# Returns:   always 0. Delivery failures are deliberately ignored so that a
#            notification outage can never abort the disk check itself.
send_notification() {
  local message="$1"
  local priority="${2:-default}"

  # Telegram
  if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
    # The text contains '%', newlines and possibly '&' or '+'. A plain -d
    # would send it un-encoded in a form body and corrupt or reject the
    # message, so let curl URL-encode it. --max-time keeps a hung request
    # from stalling the scheduled job.
    curl -s --max-time 15 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
      --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
      --data-urlencode "text=${message}" \
      -d "parse_mode=HTML" \
      >/dev/null 2>&1 || true
  fi

  # ntfy
  if [ -n "$NTFY_TOPIC" ]; then
    # Map our priority names onto ntfy's.
    local ntfy_priority
    case "$priority" in
      high) ntfy_priority="high" ;;
      critical) ntfy_priority="urgent" ;;
      *) ntfy_priority="default" ;;
    esac
    curl -s --max-time 15 -d "$message" \
      -H "Title: ManaCore Disk Alert" \
      -H "Priority: $ntfy_priority" \
      -H "Tags: warning" \
      "https://ntfy.sh/$NTFY_TOPIC" >/dev/null 2>&1 || true
  fi

  return 0
}
# Check the usage of one mounted volume against the alert thresholds.
#
# Globals:   WARNING_THRESHOLD, CRITICAL_THRESHOLD (read)
# Arguments: $1 - mount point to inspect
#            $2 - human-readable name used in logs and notifications
# Returns:   0 - usage below the warning threshold
#            1 - warning threshold reached, or the usage could not be
#                determined (missing mount point / unparsable df output)
#            2 - critical threshold reached
check_disk() {
  local mount_point="$1"
  local name="$2"

  # Check if mount point exists
  if [ ! -d "$mount_point" ]; then
    log "WARNING: Mount point $mount_point does not exist"
    return 1
  fi

  # Read capacity (column 5) and free space (column 4) from a single df call
  # (macOS compatible).
  local usage available
  read -r usage available <<< \
    "$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {sub(/%/, "", $5); print $5, $4}')"

  # Anything that is not a plain integer would make the numeric comparisons
  # below fail and be mistaken for "OK", so treat it as an error instead.
  case "$usage" in
    '' | *[!0-9]*)
      log "ERROR: Could not get disk usage for $mount_point"
      return 1
      ;;
  esac

  log "$name: ${usage}% used (${available} free)"

  # Check thresholds
  local nl=$'\n'
  if [ "$usage" -ge "$CRITICAL_THRESHOLD" ]; then
    log "CRITICAL: $name at ${usage}%!"
    send_notification "🚨 <b>CRITICAL: Disk Space</b>${nl}<b>$name</b> is at <b>${usage}%</b>${nl}Available: ${available}${nl}Immediate action required!" "critical"
    return 2
  fi

  if [ "$usage" -ge "$WARNING_THRESHOLD" ]; then
    log "WARNING: $name at ${usage}%"
    send_notification "⚠️ <b>WARNING: Disk Space</b>${nl}<b>$name</b> is at <b>${usage}%</b>${nl}Available: ${available}${nl}Consider cleaning up old files." "high"
    return 1
  fi

  return 0
}
# Log Docker's disk usage and reclaim space when the system disk is critical.
#
# Cleanup only runs when there are more than 10 dangling images or more than
# 5 dangling volumes AND the system disk has reached CRITICAL_THRESHOLD.
#
# Globals:   CRITICAL_THRESHOLD (read)
#            DOCKER_PRUNE_VOLUMES (read, optional) - set to "1" to also pass
#            --volumes to the prune. Off by default: this runs unattended, and
#            pruning volumes can destroy the data of containers that merely
#            happen to be stopped at that moment (e.g. after crashing on a
#            full disk).
# Returns:   always 0; Docker problems are logged, never fatal.
check_docker_disk() {
  if ! command -v docker >/dev/null 2>&1; then
    log "Docker not found in PATH"
    return 0
  fi

  if ! docker info >/dev/null 2>&1; then
    log "Docker is not running"
    return 0
  fi

  # Size from the first row of `docker system df`.
  local docker_usage
  docker_usage=$(docker system df --format '{{.Size}}' 2>/dev/null | head -1)
  log "Docker disk usage: $docker_usage"

  # Count dangling images and unused volumes
  local dangling_images unused_volumes
  dangling_images=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')
  unused_volumes=$(docker volume ls -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')

  if [ "$dangling_images" -gt 10 ] || [ "$unused_volumes" -gt 5 ]; then
    log "Docker cleanup recommended: $dangling_images dangling images, $unused_volumes unused volumes"

    # Auto-cleanup only if the system disk is critical
    local system_usage
    system_usage=$(df -h / 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}')
    case "$system_usage" in
      '' | *[!0-9]*)
        log "WARNING: Could not determine system disk usage; skipping Docker cleanup"
        return 0
        ;;
    esac

    if [ "$system_usage" -ge "$CRITICAL_THRESHOLD" ]; then
      local -a prune_args
      prune_args=(system prune -f)
      if [ "${DOCKER_PRUNE_VOLUMES:-0}" = "1" ]; then
        prune_args+=(--volumes)
      fi

      log "Running docker system prune due to critical disk usage..."
      if docker "${prune_args[@]}" 2>/dev/null; then
        log "Docker cleanup completed"
      else
        log "WARNING: docker system prune failed"
      fi
    fi
  fi

  return 0
}
# Log the size of the PostgreSQL backup directory and note how many daily
# dumps are older than 30 days. Does nothing when the directory is absent.
check_postgres_backups() {
  local backup_dir="/Volumes/ManaData/backups/postgres"

  [ -d "$backup_dir" ] || return 0

  # Total size of the backup tree
  local dir_size
  dir_size=$(du -sh "$backup_dir" 2>/dev/null | cut -f1)
  log "PostgreSQL backups: $dir_size"

  # Daily dumps past the 30-day mark are only reported, never deleted here
  local stale_count
  stale_count=$(find "$backup_dir/daily" -name "*.sql.gz" -mtime +30 2>/dev/null | wc -l | tr -d ' ')
  if [ "$stale_count" -gt 0 ]; then
    log "Note: $stale_count old daily backups could be cleaned up"
  fi
}
# Log every running container whose log file is larger than 100 MB, followed
# by a summary line with the number of such containers.
# Returns early (status 0) when `docker info` fails.
# NOTE(review): the main section visible in this file does not call this
# function - confirm whether it is meant to be wired in.
check_docker_logs() {
# Check for large Docker log files
# NOTE(review): docker_logs_dir is assigned but not referenced again below.
local docker_logs_dir="/var/lib/docker/containers"
# On macOS with Docker Desktop, logs are in the VM
# We can check via docker inspect instead
if ! docker info >/dev/null 2>&1; then
return 0
fi
# Get containers with largest log sizes
local large_logs=0
for container in $(docker ps -q 2>/dev/null); do
local log_size
# Resolve the container's LogPath, then stat it from inside a throwaway
# alpine container that mounts /var/lib/docker read-only. If the pipeline
# fails, "0" is used as the size.
log_size=$(docker inspect "$container" --format '{{.LogPath}}' 2>/dev/null | xargs -I {} docker run --rm -v /var/lib/docker:/var/lib/docker:ro alpine stat -c%s {} 2>/dev/null || echo "0")
# 104857600 bytes = 100 MB; a non-numeric size makes the test fail quietly
# (its error output is discarded), so that container is skipped
if [ "$log_size" -gt 104857600 ] 2>/dev/null; then
local container_name
# docker inspect prefixes the name with '/', which is stripped here
container_name=$(docker inspect "$container" --format '{{.Name}}' 2>/dev/null | tr -d '/')
# Size is reported in whole megabytes (integer division)
log "Large log file: $container_name ($(($log_size / 1048576))MB)"
large_logs=$((large_logs + 1))
fi
# stderr of the whole loop is discarded and a non-zero loop status is ignored
done 2>/dev/null || true
if [ "$large_logs" -gt 0 ]; then
log "Found $large_logs containers with large log files"
fi
}
# Main execution
#
# Exit status: 0 = all disks OK, 1 = warning, 2 = critical.
log "=== ManaCore Disk Space Check ==="

# Worst status seen across all checked disks. A later, milder result must
# never overwrite an earlier critical one, so each result is compared first.
ALERT_STATUS=0

# Check system disk
DISK_STATUS=0
check_disk "/" "System Disk" || DISK_STATUS=$?
if [ "$DISK_STATUS" -gt "$ALERT_STATUS" ]; then
  ALERT_STATUS=$DISK_STATUS
fi

# Check ManaData volume (external SSD)
if [ -d "/Volumes/ManaData" ]; then
  DISK_STATUS=0
  check_disk "/Volumes/ManaData" "ManaData SSD" || DISK_STATUS=$?
  if [ "$DISK_STATUS" -gt "$ALERT_STATUS" ]; then
    ALERT_STATUS=$DISK_STATUS
  fi
fi

# Auxiliary checks: a failure here is logged but must not stop the script
# (set -e is active) before the summary and exit status are produced.
check_docker_disk || log "WARNING: Docker disk check failed"
check_postgres_backups || log "WARNING: Backup size check failed"

# Summary
log "=== Check Complete ==="
case "$ALERT_STATUS" in
  2)
    log "Status: CRITICAL - Immediate action required"
    exit 2
    ;;
  1)
    log "Status: WARNING - Attention needed"
    exit 1
    ;;
  *)
    log "Status: OK - All disks within thresholds"
    exit 0
    ;;
esac

View file

@ -275,6 +275,12 @@ check_service "Grafana" "http://localhost:8000/api/health"
check_service "Umami" "http://localhost:8010/api/heartbeat"
check_service "VictoriaMetrics" "http://localhost:9090/health"
echo ""
echo "Alerting:"
check_service "vmalert" "http://localhost:8880/health"
check_service "Alertmanager" "http://localhost:9093/-/healthy"
check_service "Alert Notifier" "http://localhost:9095/health"
echo ""
echo "Cloudflare Tunnel:"
if pgrep -x "cloudflared" >/dev/null; then

View file

@ -17,8 +17,11 @@ for f in *.plist; do launchctl load ~/Library/LaunchAgents/$f; done
| Service | Description | Interval |
|---------|-------------|----------|
| `docker-startup` | Starts Docker containers on boot | At login |
| `ensure-containers` | Detects and restarts stuck containers | Every 5 min |
| `ensure-containers` | Detects and restarts stuck/crash-looping containers | Every 5 min |
| `health-check` | Checks all services and sends alerts | Every 5 min |
| `backup-databases` | PostgreSQL backup with daily/weekly rotation | Daily 3 AM |
| `disk-check` | Monitors disk space, alerts on thresholds | Hourly |
| `weekly-report` | Generates system health summary | Sunday 10 AM |
| `ssd-check` | Monitors SSD health | Periodic |
| `mana-stt` | Speech-to-text service (Whisper) | At login |
| `mana-tts` | Text-to-speech service (Kokoro) | At login |

View file

@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Label</key>
<string>com.manacore.disk-check</string>
<key>ProgramArguments</key>
<array>
<string>/bin/bash</string>
<string>/Users/mana/projects/manacore-monorepo/scripts/mac-mini/check-disk-space.sh</string>
</array>
<!-- Run hourly -->
<key>StartInterval</key>
<integer>3600</integer>
<!-- Also run once whenever the job is loaded (login or launchctl load) -->
<key>RunAtLoad</key>
<true/>
<key>StandardOutPath</key>
<string>/tmp/manacore-disk-check.log</string>
<key>StandardErrorPath</key>
<string>/tmp/manacore-disk-check.error.log</string>
<key>EnvironmentVariables</key>
<dict>
<key>PATH</key>
<string>/usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin</string>
</dict>
</dict>
</plist>

View file

@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Label</key>
<string>com.manacore.weekly-report</string>
<key>ProgramArguments</key>
<array>
<string>/bin/bash</string>
<string>/Users/mana/projects/manacore-monorepo/scripts/mac-mini/weekly-report.sh</string>
</array>
<!-- Run every Sunday at 10:00 AM -->
<key>StartCalendarInterval</key>
<dict>
<key>Weekday</key>
<integer>0</integer>
<key>Hour</key>
<integer>10</integer>
<key>Minute</key>
<integer>0</integer>
</dict>
<key>StandardOutPath</key>
<string>/tmp/manacore-weekly-report.log</string>
<key>StandardErrorPath</key>
<string>/tmp/manacore-weekly-report.error.log</string>
<key>EnvironmentVariables</key>
<dict>
<key>PATH</key>
<string>/usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin</string>
</dict>
</dict>
</plist>

309
scripts/mac-mini/weekly-report.sh Executable file
View file

@ -0,0 +1,309 @@
#!/bin/bash
#
# ManaCore weekly maintenance report.
#
# Builds a system health summary covering:
#   - backup status
#   - disk usage
#   - container health and restart counts
#   - database statistics
#   - error log summary
#
# Intended to be run via LaunchD every Sunday at 10:00 AM.

set -e

# Put the Homebrew locations in front so the required tools are found.
PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
export PATH

SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PROJECT_ROOT=$(cd "${SCRIPT_DIR}/../.." && pwd)
LOG_FILE=/tmp/manacore-weekly-report.log
REPORT_FILE=/tmp/manacore-weekly-report.txt

# Optional notification settings (read by send_notification).
if [[ -f "${PROJECT_ROOT}/.env.notifications" ]]; then
  . "${PROJECT_ROOT}/.env.notifications"
fi

# Optional machine environment (per the original note: database password).
if [[ -f "${PROJECT_ROOT}/.env.macmini" ]]; then
  . "${PROJECT_ROOT}/.env.macmini"
fi
# Write a timestamped message to stdout and append the same line to $LOG_FILE.
#
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}
# Send a message to the configured Telegram chat (best effort).
#
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text (may contain Telegram HTML markup)
# Returns:   always 0. Nothing is sent when Telegram is not configured, and
#            delivery failures are deliberately ignored.
send_notification() {
  local message="$1"

  # Telegram
  if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
    # The report is multi-line and contains '%', '|', '&' and emoji. A plain
    # -d would send it un-encoded in a form body, so let curl URL-encode it.
    # --max-time keeps a hung request from stalling the scheduled job.
    curl -s --max-time 15 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
      --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
      --data-urlencode "text=${message}" \
      -d "parse_mode=HTML" \
      >/dev/null 2>&1 || true
  fi

  return 0
}
# Start a fresh report: overwrite $REPORT_FILE with the title, the current
# date/time and a separator line.
init_report() {
  {
    printf '%s\n' "📊 <b>ManaCore Weekly Report</b>"
    printf '%s\n' "$(date '+%Y-%m-%d %H:%M')"
    printf '%s\n' "━━━━━━━━━━━━━━━━━━━━━━"
  } > "$REPORT_FILE"
}
# Append a blank line followed by a bold section heading to $REPORT_FILE.
#
# Arguments: $1 - heading text
add_section() {
  local heading="$1"
  printf '\n<b>%s</b>\n' "$heading" >> "$REPORT_FILE"
}
# Append the backup status section to the report.
#
# Reports the number of daily dumps from the last 7 days, the number of weekly
# backup directories, the timestamp of the newest daily dump, the total size,
# and a warning when daily dumps look empty.
#
# Arguments: $1 - backup root containing daily/ and weekly/
#                 (optional; default /Volumes/ManaData/backups/postgres)
# Returns:   0
check_backups() {
  local backup_dir="${1:-/Volumes/ManaData/backups/postgres}"
  # A gzip stream of empty input is about 20 bytes, so a failed dump piped
  # through gzip is never a 0-byte file. Anything below this many bytes is
  # treated as an empty backup.
  local min_backup_bytes=100

  add_section "💾 Backup Status"

  if [ ! -d "$backup_dir" ]; then
    echo "⚠️ Backup directory not found" >> "$REPORT_FILE"
    return 0
  fi

  # Count recent backups
  local daily_count weekly_count
  daily_count=$(find "$backup_dir/daily" -name "*.sql.gz" -mtime -7 2>/dev/null | wc -l | tr -d ' ')
  weekly_count=$(find "$backup_dir/weekly" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')

  # Newest daily dump, found by comparing modification times directly
  # instead of parsing `ls` output.
  local latest_backup="" candidate
  for candidate in "$backup_dir/daily"/*.sql.gz; do
    [ -e "$candidate" ] || continue
    if [ -z "$latest_backup" ] || [ "$candidate" -nt "$latest_backup" ]; then
      latest_backup=$candidate
    fi
  done

  local latest_date=""
  if [ -n "$latest_backup" ]; then
    # Try GNU stat first: BSD stat rejects -c cleanly, whereas GNU stat
    # given the BSD flags prints unrelated output before failing.
    if latest_date=$(stat -c '%y' "$latest_backup" 2>/dev/null); then
      latest_date=${latest_date%%.*}
    else
      latest_date=$(stat -f "%Sm" -t "%Y-%m-%d %H:%M" "$latest_backup" 2>/dev/null) || latest_date=""
    fi
  fi

  # Get total backup size
  local total_size
  total_size=$(du -sh "$backup_dir" 2>/dev/null | awk '{print $1}')

  {
    echo "Daily backups (7 days): $daily_count"
    echo "Weekly backups: $weekly_count"
    echo "Latest: $latest_date"
    echo "Total size: $total_size"
  } >> "$REPORT_FILE"

  # Verify backup integrity (size check only; archives are not decompressed)
  local empty_backups
  empty_backups=$(find "$backup_dir/daily" -type f -name "*.sql.gz" -size "-${min_backup_bytes}c" 2>/dev/null | wc -l | tr -d ' ')
  if [ "$empty_backups" -gt 0 ]; then
    echo "⚠️ $empty_backups empty backup files found!" >> "$REPORT_FILE"
  fi

  return 0
}
# Append the disk usage section to the report: system disk, the ManaData
# volume when /Volumes/ManaData exists, and the first three rows of
# `docker system df` when Docker responds.
#
# Returns: 0
check_disk_usage() {
  add_section "💿 Disk Usage"

  # System disk
  local system_usage
  system_usage=$(df -h / 2>/dev/null | awk 'NR==2 {print $5 " used (" $4 " free)"}')
  echo "System: $system_usage" >> "$REPORT_FILE"

  # ManaData SSD
  if [ -d "/Volumes/ManaData" ]; then
    local data_usage
    data_usage=$(df -h "/Volumes/ManaData" 2>/dev/null | awk 'NR==2 {print $5 " used (" $4 " free)"}')
    echo "ManaData: $data_usage" >> "$REPORT_FILE"
  fi

  # Docker disk usage
  if docker info >/dev/null 2>&1; then
    # Join the rows with ", ". tr maps single characters only, so it cannot
    # insert the two-character separator; awk can.
    local docker_summary
    docker_summary=$(docker system df --format '{{.Type}}: {{.Size}}' 2>/dev/null |
      awk 'NR <= 3 {printf "%s%s", sep, $0; sep = ", "}')
    echo "Docker: $docker_summary" >> "$REPORT_FILE"
  fi

  return 0
}
# Append the container health section to the report: running/total counts,
# healthy/unhealthy counts, and the restart counts of mana- containers.
#
# The restart list and the "None" line come from the same set of containers
# (names matching "mana-"), so the section never ends up empty, and the list
# is sorted with the most-restarted container first.
# NOTE(review): RestartCount is presumably cumulative rather than limited to
# the past week - confirm before relying on the "stable week" wording.
#
# Returns: 0
check_containers() {
  add_section "🐳 Container Health"

  if ! docker info >/dev/null 2>&1; then
    echo "⚠️ Docker not running" >> "$REPORT_FILE"
    return 0
  fi

  # Count containers by status
  local running total healthy unhealthy
  running=$(docker ps -q 2>/dev/null | wc -l | tr -d ' ')
  total=$(docker ps -aq 2>/dev/null | wc -l | tr -d ' ')
  healthy=$(docker ps --filter "health=healthy" -q 2>/dev/null | wc -l | tr -d ' ')
  unhealthy=$(docker ps --filter "health=unhealthy" -q 2>/dev/null | wc -l | tr -d ' ')

  echo "Running: $running / $total" >> "$REPORT_FILE"
  echo "Healthy: $healthy" >> "$REPORT_FILE"

  if [ "$unhealthy" -gt 0 ]; then
    echo "⚠️ Unhealthy: $unhealthy" >> "$REPORT_FILE"
    # List unhealthy containers
    docker ps --filter "health=unhealthy" --format " - {{.Names}}" 2>/dev/null >> "$REPORT_FILE" || true
  fi

  echo "" >> "$REPORT_FILE"
  echo "Top restarts:" >> "$REPORT_FILE"

  # One inspect per container; emit "<count> <name>", sort by count
  # (descending), then format for the report.
  local restart_lines name count
  restart_lines=$(
    docker ps -a --format '{{.Names}}' 2>/dev/null | grep -E "mana-" |
      while IFS= read -r name; do
        count=$(docker inspect "$name" --format '{{.RestartCount}}' 2>/dev/null) || count=0
        case "$count" in
          '' | *[!0-9]*) count=0 ;;
        esac
        if [ "$count" -gt 0 ]; then
          printf '%s %s\n' "$count" "$name"
        fi
      done | sort -rn | awk '{print " " $2 ": " $1}'
  ) || restart_lines=""

  if [ -n "$restart_lines" ]; then
    printf '%s\n' "$restart_lines" >> "$REPORT_FILE"
  else
    echo " None (stable week!)" >> "$REPORT_FILE"
  fi

  return 0
}
# Append the database section to the report: the five largest databases and
# the number of active connections, queried through psql inside the
# mana-infra-postgres container.
#
# Returns: 0 (query failures are reported as "Could not fetch" / "?")
check_database() {
  add_section "🗄️ Database"

  # Exact-line match: a substring match would also accept names such as
  # "mana-infra-postgres-exporter" while the database itself is down.
  if ! docker ps --format '{{.Names}}' 2>/dev/null | grep -qx "mana-infra-postgres"; then
    echo "⚠️ PostgreSQL not running" >> "$REPORT_FILE"
    return 0
  fi

  # Get database sizes
  local sizes_sql="
SELECT datname, pg_size_pretty(pg_database_size(datname))
FROM pg_database
WHERE datistemplate = false AND datname != 'postgres'
ORDER BY pg_database_size(datname) DESC
LIMIT 5;
"
  local db_sizes
  # grep exits non-zero when no non-blank line is left (e.g. psql failed),
  # which triggers the fallback text.
  db_sizes=$(docker exec mana-infra-postgres psql -U postgres -t -c "$sizes_sql" 2>/dev/null |
    grep -v '^[[:space:]]*$' || echo "Could not fetch")

  echo "Database sizes:" >> "$REPORT_FILE"
  local line
  # `read` with the default IFS trims the padding psql puts around each row.
  # An explicit `if` keeps the loop status at 0 under set -e.
  while read -r line; do
    if [ -n "$line" ]; then
      echo " $line" >> "$REPORT_FILE"
    fi
  done <<< "$db_sizes"

  # Get total connection count
  local connections_sql="
SELECT count(*) FROM pg_stat_activity WHERE state = 'active';
"
  local connections
  # The pipeline's status is that of tr, which always succeeds, so a failed
  # query shows up as empty output rather than as an error status.
  connections=$(docker exec mana-infra-postgres psql -U postgres -t -c "$connections_sql" 2>/dev/null |
    tr -d '[:space:]') || connections=""
  echo "Active connections: ${connections:-?}" >> "$REPORT_FILE"

  return 0
}
# Append the error summary section to the report.
#
# Scans the last 168 hours (7 days) of logs of every running container whose
# name starts with "mana-" and lists those with more than 10 lines matching
# error, exception or fatal (case-insensitive).
#
# Returns: 0
check_errors() {
  add_section "⚠️ Recent Errors"

  local error_count=0
  local containers_with_errors=""
  local container errors

  # Check each mana container for errors in the last week
  while IFS= read -r container; do
    [ -n "$container" ] || continue

    # grep -c already prints "0" when nothing matches and exits 1. Only the
    # exit status is neutralised here; echoing a second "0" would produce a
    # two-line value that breaks the numeric test below.
    errors=$(docker logs "$container" --since 168h 2>&1 | grep -ciE 'error|exception|fatal') || true
    case "$errors" in
      '' | *[!0-9]*) errors=0 ;;
    esac

    if [ "$errors" -gt 10 ]; then
      containers_with_errors="${containers_with_errors}"$'\n'" $container: $errors errors"
      error_count=$((error_count + errors))
    fi
  done < <(docker ps --format '{{.Names}}' 2>/dev/null | grep "^mana-")

  if [ -z "$containers_with_errors" ]; then
    echo "No significant errors in the last week" >> "$REPORT_FILE"
  else
    echo "Total errors: $error_count" >> "$REPORT_FILE"
    printf '%s\n' "$containers_with_errors" >> "$REPORT_FILE"
  fi

  return 0
}
# Append the system section to the report: uptime, load averages and, when
# vm_stat is available, an estimate of free memory.
#
# Returns: 0
check_system() {
  add_section "🖥️ System"

  # Run uptime once and derive both values from the same output.
  local uptime_output
  uptime_output=$(uptime 2>/dev/null) || uptime_output=""

  # System uptime: text between "up " and the first comma
  local uptime_str
  uptime_str=$(printf '%s\n' "$uptime_output" | sed 's/.*up //' | sed 's/,.*//')
  echo "Uptime: $uptime_str" >> "$REPORT_FILE"

  # Load average: accepts both "load averages:" and "load average:" labels
  # and drops any commas between the three values.
  local load
  load=$(printf '%s\n' "$uptime_output" | sed -E 's/.*load averages?: //' | tr -d ',' |
    awk '{print $1 " " $2 " " $3}')
  echo "Load: $load" >> "$REPORT_FILE"

  # Memory (vm_stat based)
  if command -v vm_stat >/dev/null 2>&1; then
    local pages_free page_size
    pages_free=$(vm_stat | grep "Pages free" | awk '{print $3}' | tr -d '.')
    # Falls back to 16384 when the pagesize command is unavailable
    page_size=$(pagesize 2>/dev/null || echo "16384")

    # awk instead of bc: bc prints ".5" for values below 1 and truncates at
    # every division; printf gives a rounded value with a leading zero.
    local mem_free_gb="?"
    case "${pages_free}:${page_size}" in
      *[!0-9:]* | :* | *:) ;; # missing or non-numeric input: keep "?"
      *)
        mem_free_gb=$(awk -v p="$pages_free" -v s="$page_size" \
          'BEGIN {printf "%.1f", p * s / 1024 / 1024 / 1024}')
        ;;
    esac
    echo "Memory free: ~${mem_free_gb}GB" >> "$REPORT_FILE"
  fi

  return 0
}
# Close the report: append a blank line, a separator and the footer line.
generate_summary() {
  {
    printf '\n'
    printf '%s\n' "━━━━━━━━━━━━━━━━━━━━━━"
    printf '%s\n' "<i>Generated by ManaCore</i>"
  } >> "$REPORT_FILE"
}
# Main execution
log "=== Generating Weekly Report ==="
init_report

# Build the report section by section. set -e is active, so without the
# guard a single failing section would abort the script and no report would
# be sent at all; a partial report is more useful than none.
for section in \
  check_backups \
  check_disk_usage \
  check_containers \
  check_database \
  check_errors \
  check_system; do
  "$section" || log "WARNING: $section failed; continuing with remaining sections"
done

generate_summary
log "Report generated at $REPORT_FILE"

# Send report via Telegram.
# Only claim the report was sent when Telegram is actually configured.
# Delivery itself is best effort and not verified.
REPORT_CONTENT=$(cat "$REPORT_FILE")
if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
  send_notification "$REPORT_CONTENT"
  log "Report sent via Telegram"
else
  log "Telegram is not configured; report was not sent"
fi
log "=== Weekly Report Complete ==="