From acc8de36ee1414b01bc5b1b674cc19a6ffa8ab14 Mon Sep 17 00:00:00 2001 From: Till-JS <101404291+Till-JS@users.noreply.github.com> Date: Thu, 12 Feb 2026 13:46:57 +0100 Subject: [PATCH] feat(monitoring): add alerting stack and maintenance scripts Medium priority stability improvements: Alerting: - Add vmalert for evaluating Prometheus alert rules - Add alertmanager for alert routing and grouping - Add alert-notifier service for Telegram/ntfy notifications - Enable cadvisor scraping in prometheus config Disk Monitoring: - Add check-disk-space.sh for hourly disk monitoring - Alert on 80% (warning) and 90% (critical) thresholds - Auto-cleanup Docker when disk is critical - Add com.manacore.disk-check.plist for LaunchD Weekly Reports: - Add weekly-report.sh for system health summary - Includes: backup status, disk usage, container health, database stats, error log summary - Runs every Sunday at 10 AM via LaunchD Health Check Updates: - Add checks for vmalert, alertmanager, alert-notifier Co-Authored-By: Claude Opus 4.5 --- docker-compose.macmini.yml | 81 +++++ docker/alert-notifier/Dockerfile | 17 + docker/alert-notifier/main.py | 204 ++++++++++++ docker/alertmanager/alertmanager.yml | 62 ++++ docker/prometheus/prometheus.yml | 18 +- scripts/mac-mini/check-disk-space.sh | 233 +++++++++++++ scripts/mac-mini/health-check.sh | 6 + scripts/mac-mini/launchd/README.md | 5 +- .../launchd/com.manacore.disk-check.plist | 34 ++ .../launchd/com.manacore.weekly-report.plist | 37 +++ scripts/mac-mini/weekly-report.sh | 309 ++++++++++++++++++ 11 files changed, 996 insertions(+), 10 deletions(-) create mode 100644 docker/alert-notifier/Dockerfile create mode 100644 docker/alert-notifier/main.py create mode 100644 docker/alertmanager/alertmanager.yml create mode 100755 scripts/mac-mini/check-disk-space.sh create mode 100644 scripts/mac-mini/launchd/com.manacore.disk-check.plist create mode 100644 scripts/mac-mini/launchd/com.manacore.weekly-report.plist create mode 100755 
scripts/mac-mini/weekly-report.sh diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml index 0a4044e11..87afdb919 100644 --- a/docker-compose.macmini.yml +++ b/docker-compose.macmini.yml @@ -1431,6 +1431,11 @@ services: - /dev/disk/:/dev/disk:ro ports: - "9110:8080" + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"] + interval: 30s + timeout: 10s + retries: 3 postgres-exporter: image: prometheuscommunity/postgres-exporter:v0.15.0 @@ -1481,6 +1486,80 @@ services: timeout: 10s retries: 3 + # ============================================ + # Alerting Stack (Ports 9093-9095) + # ============================================ + + vmalert: + image: victoriametrics/vmalert:v1.99.0 + container_name: mana-mon-vmalert + restart: always + depends_on: + victoriametrics: + condition: service_healthy + alertmanager: + condition: service_healthy + command: + - '-datasource.url=http://victoriametrics:9090' + - '-notifier.url=http://alertmanager:9093' + - '-remoteWrite.url=http://victoriametrics:9090' + - '-remoteRead.url=http://victoriametrics:9090' + - '-rule=/etc/alerts/*.yml' + - '-evaluationInterval=30s' + - '-httpListenAddr=:8880' + volumes: + - ./docker/prometheus/alerts.yml:/etc/alerts/alerts.yml:ro + ports: + - "8880:8880" + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8880/health"] + interval: 30s + timeout: 10s + retries: 3 + + alertmanager: + image: prom/alertmanager:v0.27.0 + container_name: mana-mon-alertmanager + restart: always + depends_on: + alert-notifier: + condition: service_healthy + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + - '--web.listen-address=:9093' + volumes: + - ./docker/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + - alertmanager_data:/alertmanager + ports: + - "9093:9093" + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", 
"--spider", "http://127.0.0.1:9093/-/healthy"] + interval: 30s + timeout: 10s + retries: 3 + + alert-notifier: + build: + context: ./docker/alert-notifier + dockerfile: Dockerfile + image: alert-notifier:local + container_name: mana-mon-alert-notifier + restart: always + environment: + PORT: 8080 + TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN:-} + TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID:-} + NTFY_TOPIC: ${NTFY_TOPIC:-} + ports: + - "9095:8080" + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/health"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 5s + # ============================================ # Auto-Update (Watchtower) # ============================================ @@ -1508,6 +1587,8 @@ volumes: name: mana-redis-data victoriametrics_data: name: mana-victoria-data + alertmanager_data: + name: mana-alertmanager-data grafana_data: name: mana-grafana-data analytics_data: diff --git a/docker/alert-notifier/Dockerfile b/docker/alert-notifier/Dockerfile new file mode 100644 index 000000000..cb045d43b --- /dev/null +++ b/docker/alert-notifier/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.12-alpine + +WORKDIR /app + +COPY main.py . 
#!/usr/bin/env python3
"""
Alert Notifier - Webhook receiver for Alertmanager.

Accepts Alertmanager webhook POSTs on /webhook and forwards a formatted
summary to Telegram and/or ntfy.sh. Standard library only.

Environment Variables:
    TELEGRAM_BOT_TOKEN - Telegram bot token
    TELEGRAM_CHAT_ID   - Telegram chat ID
    NTFY_TOPIC         - ntfy.sh topic name (optional)
"""

import os
import json
import logging
from http.server import HTTPServer, BaseHTTPRequestHandler
import urllib.request
import urllib.parse

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Read once at import time; an empty string means "not configured".
TELEGRAM_BOT_TOKEN = os.environ.get('TELEGRAM_BOT_TOKEN', '')
TELEGRAM_CHAT_ID = os.environ.get('TELEGRAM_CHAT_ID', '')
NTFY_TOPIC = os.environ.get('NTFY_TOPIC', '')

SEVERITY_EMOJI = {
    'critical': '🚨',
    'warning': '⚠ī¸',
    'info': 'ℹī¸',
}


def format_alert_telegram(alert: dict, status: str) -> str:
    """Format one Alertmanager alert for Telegram.

    Args:
        alert: A single entry of the webhook payload's ``alerts`` list.
        status: ``'firing'`` or ``'resolved'`` (group status fallback).

    Returns:
        A short multi-line message string (trailing newline included).
    """
    labels = alert.get('labels', {})
    annotations = alert.get('annotations', {})

    severity = labels.get('severity', 'unknown')
    emoji = SEVERITY_EMOJI.get(severity, '🔔')

    # Resolved alerts always get a green check, regardless of severity.
    if status == 'resolved':
        emoji = '✅'

    alertname = labels.get('alertname', 'Unknown')
    job = labels.get('job', '')
    summary = annotations.get('summary', alertname)
    description = annotations.get('description', '')

    msg = f"{emoji} {status.upper()}: {summary}\n"
    if job:
        msg += f"Service: {job}\n"
    if description:
        msg += f"{description}\n"

    return msg


def send_telegram(message: str) -> bool:
    """Send *message* to the configured Telegram chat.

    Returns:
        True on HTTP 200; False when unconfigured or on any error
        (best-effort delivery — never raises).
    """
    if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
        logger.warning("Telegram not configured")
        return False

    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    data = {
        'chat_id': TELEGRAM_CHAT_ID,
        'text': message,
        'parse_mode': 'HTML',
        'disable_web_page_preview': True
    }

    try:
        req = urllib.request.Request(
            url,
            data=urllib.parse.urlencode(data).encode(),
            headers={'Content-Type': 'application/x-www-form-urlencoded'}
        )
        with urllib.request.urlopen(req, timeout=10) as resp:
            return resp.status == 200
    except Exception as e:
        logger.error(f"Telegram send failed: {e}")
        return False


def send_ntfy(title: str, message: str, priority: str = 'default') -> bool:
    """Send *message* to the configured ntfy.sh topic.

    Args:
        title: Notification title header.
        message: Plain-text body.
        priority: Alert severity ('critical'/'warning'/'info'), mapped to
            ntfy priorities ('urgent'/'high'/'low').

    Returns:
        True on HTTP 200; False when unconfigured or on any error.
    """
    if not NTFY_TOPIC:
        return False

    url = f"https://ntfy.sh/{NTFY_TOPIC}"

    priority_map = {
        'critical': 'urgent',
        'warning': 'high',
        'info': 'low'
    }
    ntfy_priority = priority_map.get(priority, 'default')

    try:
        req = urllib.request.Request(
            url,
            data=message.encode('utf-8'),
            headers={
                'Title': title,
                'Priority': ntfy_priority,
                'Tags': 'warning' if priority == 'critical' else 'loudspeaker'
            }
        )
        with urllib.request.urlopen(req, timeout=10) as resp:
            return resp.status == 200
    except Exception as e:
        logger.error(f"ntfy send failed: {e}")
        return False


class AlertHandler(BaseHTTPRequestHandler):
    """HTTP handler: POST /webhook for Alertmanager, GET /health for probes."""

    def do_POST(self):
        """Accept an Alertmanager webhook payload and fan out notifications."""
        if self.path != '/webhook':
            self.send_response(404)
            self.end_headers()
            return

        content_length = int(self.headers.get('Content-Length', 0))
        body = self.rfile.read(content_length)

        try:
            payload = json.loads(body)
            self.process_alerts(payload)
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b'OK')
        except Exception as e:
            # Service boundary: report the failure to the caller and log it.
            logger.error(f"Error processing webhook: {e}")
            self.send_response(500)
            self.end_headers()
            self.wfile.write(str(e).encode())

    def do_GET(self):
        """Liveness endpoint used by the Docker healthcheck."""
        if self.path == '/health':
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b'OK')
        else:
            self.send_response(404)
            self.end_headers()

    def process_alerts(self, payload: dict):
        """Process an Alertmanager webhook payload and send notifications."""
        status = payload.get('status', 'unknown')
        alerts = payload.get('alerts', [])

        if not alerts:
            return

        logger.info(f"Received {len(alerts)} alerts with status: {status}")

        # Build one combined message; track the worst severity for ntfy.
        messages = []
        highest_severity = 'info'

        for alert in alerts:
            msg = format_alert_telegram(alert, alert.get('status', status))
            messages.append(msg)

            severity = alert.get('labels', {}).get('severity', 'info')
            if severity == 'critical':
                highest_severity = 'critical'
            elif severity == 'warning' and highest_severity != 'critical':
                highest_severity = 'warning'

        combined_message = '\n'.join(messages)

        # Send notifications
        if TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID:
            success = send_telegram(combined_message)
            logger.info(f"Telegram: {'sent' if success else 'failed'}")

        if NTFY_TOPIC:
            title = f"ManaCore Alert ({len(alerts)} alerts)"
            # Strip Telegram HTML markup for ntfy (plain text only).
            # FIX: the previous .replace('', '') calls were no-ops (the tag
            # arguments had been stripped); remove the actual tags instead.
            plain_message = combined_message
            for tag in ('<b>', '</b>', '<i>', '</i>'):
                plain_message = plain_message.replace(tag, '')
            success = send_ntfy(title, plain_message, highest_severity)
            logger.info(f"ntfy: {'sent' if success else 'failed'}")

    def log_message(self, format, *args):
        """Route BaseHTTPRequestHandler access logs through our logger."""
        logger.info(f"{self.client_address[0]} - {format % args}")


def main():
    """Start the notifier HTTP server (blocking until interrupted)."""
    port = int(os.environ.get('PORT', 8080))

    logger.info(f"Starting Alert Notifier on port {port}")
    logger.info(f"Telegram configured: {bool(TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID)}")
    logger.info(f"ntfy configured: {bool(NTFY_TOPIC)}")

    server = HTTPServer(('0.0.0.0', port), AlertHandler)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        logger.info("Shutting down")
        server.shutdown()


if __name__ == '__main__':
    main()
b/docker/alertmanager/alertmanager.yml new file mode 100644 index 000000000..98d0cfd19 --- /dev/null +++ b/docker/alertmanager/alertmanager.yml @@ -0,0 +1,62 @@ +# Alertmanager Configuration for ManaCore +# Sends alerts via webhook to custom notification handler + +global: + resolve_timeout: 5m + +route: + # Default receiver for all alerts + receiver: 'webhook' + # Group alerts by severity and service + group_by: ['alertname', 'severity', 'job'] + # Wait before sending first notification + group_wait: 30s + # Wait before sending follow-up notifications for same group + group_interval: 5m + # Wait before re-sending resolved alerts + repeat_interval: 4h + + routes: + # Critical alerts - immediate notification + - match: + severity: critical + receiver: 'webhook' + group_wait: 10s + repeat_interval: 1h + + # Warning alerts - less frequent + - match: + severity: warning + receiver: 'webhook' + group_wait: 1m + repeat_interval: 6h + + # Info alerts - only during business hours, batch together + - match: + severity: info + receiver: 'webhook' + group_wait: 5m + repeat_interval: 24h + +receivers: + - name: 'webhook' + webhook_configs: + - url: 'http://alert-notifier:8080/webhook' + send_resolved: true + max_alerts: 10 + +# Inhibition rules - prevent redundant alerts +inhibit_rules: + # Don't alert on warnings if critical is firing for same service + - source_match: + severity: 'critical' + target_match: + severity: 'warning' + equal: ['alertname', 'job'] + + # Don't alert on service-specific issues if PostgreSQL is down + - source_match: + alertname: 'PostgreSQLDown' + target_match_re: + alertname: '.*(Backend|Service).*' + equal: [] diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index 4dcf8f5b6..a53e9d4d9 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -9,11 +9,11 @@ global: rule_files: - /etc/prometheus/alerts.yml -# Alertmanager configuration (optional, for future use) -# alerting: -# 
#!/bin/bash
# ManaCore Disk Space Monitor
# Checks disk usage on the system disk and data volumes and alerts via
# Telegram/ntfy when thresholds are exceeded.
#
# Thresholds:
#   - Warning:  80%
#   - Critical: 90%
#
# Run via LaunchD hourly.
# Exit codes: 0 = OK, 1 = warning threshold hit, 2 = critical threshold hit.

set -e

# LaunchD starts with a minimal PATH; make sure docker/curl are found.
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
LOG_FILE="/tmp/manacore-disk-check.log"

# Thresholds (percent used)
WARNING_THRESHOLD=80
CRITICAL_THRESHOLD=90

# Load notification config if it exists (TELEGRAM_*, NTFY_TOPIC)
if [ -f "$PROJECT_ROOT/.env.notifications" ]; then
    source "$PROJECT_ROOT/.env.notifications"
fi

# log MESSAGE - timestamped line to stdout and $LOG_FILE
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# send_notification MESSAGE [PRIORITY] - best-effort push; never fails the script
send_notification() {
    local message="$1"
    local priority="${2:-default}"

    # Telegram. FIX: use --data-urlencode for the text field - the message
    # contains newlines and emoji that a plain -d would pass unencoded.
    if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
        curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
            -d "chat_id=${TELEGRAM_CHAT_ID}" \
            --data-urlencode "text=${message}" \
            -d "parse_mode=HTML" \
            >/dev/null 2>&1 || true
    fi

    # ntfy
    if [ -n "$NTFY_TOPIC" ]; then
        local ntfy_priority="default"
        [ "$priority" = "high" ] && ntfy_priority="high"
        [ "$priority" = "critical" ] && ntfy_priority="urgent"

        curl -s -d "$message" \
            -H "Title: ManaCore Disk Alert" \
            -H "Priority: $ntfy_priority" \
            -H "Tags: warning" \
            "https://ntfy.sh/$NTFY_TOPIC" >/dev/null 2>&1 || true
    fi
}

# check_disk MOUNT_POINT NAME
# Logs usage; returns 0 = OK, 1 = warning, 2 = critical.
check_disk() {
    local mount_point="$1"
    local name="$2"

    if [ ! -d "$mount_point" ]; then
        log "WARNING: Mount point $mount_point does not exist"
        return 1
    fi

    # Usage percentage; strip the trailing '%' (macOS/BSD df compatible)
    local usage
    usage=$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}')

    if [ -z "$usage" ]; then
        log "ERROR: Could not get disk usage for $mount_point"
        return 1
    fi

    local available
    available=$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {print $4}')

    log "$name: ${usage}% used (${available} free)"

    if [ "$usage" -ge "$CRITICAL_THRESHOLD" ]; then
        log "CRITICAL: $name at ${usage}%!"
        send_notification "🚨 CRITICAL: Disk Space

$name is at ${usage}%
Available: ${available}

Immediate action required!" "critical"
        return 2
    elif [ "$usage" -ge "$WARNING_THRESHOLD" ]; then
        log "WARNING: $name at ${usage}%"
        send_notification "⚠ī¸ WARNING: Disk Space

$name is at ${usage}%
Available: ${available}

Consider cleaning up old files." "high"
        return 1
    fi

    return 0
}

# check_docker_disk - log Docker usage; auto-prune when system disk is critical
check_docker_disk() {
    if ! command -v docker &> /dev/null; then
        log "Docker not found in PATH"
        return 0
    fi

    if ! docker info >/dev/null 2>&1; then
        log "Docker is not running"
        return 0
    fi

    local docker_usage
    docker_usage=$(docker system df --format '{{.Size}}' 2>/dev/null | head -1)

    log "Docker disk usage: $docker_usage"

    # Check for dangling images and unused volumes
    local dangling_images
    dangling_images=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')

    local unused_volumes
    unused_volumes=$(docker volume ls -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')

    if [ "$dangling_images" -gt 10 ] || [ "$unused_volumes" -gt 5 ]; then
        log "Docker cleanup recommended: $dangling_images dangling images, $unused_volumes unused volumes"

        # Only prune automatically when the system disk itself is critical.
        local system_usage
        system_usage=$(df -h / 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}')

        if [ "$system_usage" -ge "$CRITICAL_THRESHOLD" ]; then
            log "Running docker system prune due to critical disk usage..."
            docker system prune -f --volumes 2>/dev/null || true
            log "Docker cleanup completed"
        fi
    fi
}

# check_postgres_backups - log backup size; note stale daily dumps
check_postgres_backups() {
    local backup_dir="/Volumes/ManaData/backups/postgres"

    if [ ! -d "$backup_dir" ]; then
        return 0
    fi

    local backup_size
    backup_size=$(du -sh "$backup_dir" 2>/dev/null | awk '{print $1}')

    log "PostgreSQL backups: $backup_size"

    # Daily dumps older than 30 days are candidates for cleanup
    local old_backups
    old_backups=$(find "$backup_dir/daily" -name "*.sql.gz" -mtime +30 2>/dev/null | wc -l | tr -d ' ')

    if [ "$old_backups" -gt 0 ]; then
        log "Note: $old_backups old daily backups could be cleaned up"
    fi
}

# check_docker_logs - flag containers whose JSON log file exceeds 100MB.
# NOTE(review): on macOS Docker Desktop the LogPath lives inside the VM, so
# the stat runs via a helper container; presumably best-effort only - confirm.
check_docker_logs() {
    if ! docker info >/dev/null 2>&1; then
        return 0
    fi

    local large_logs=0
    for container in $(docker ps -q 2>/dev/null); do
        local log_size
        log_size=$(docker inspect "$container" --format '{{.LogPath}}' 2>/dev/null | xargs -I {} docker run --rm -v /var/lib/docker:/var/lib/docker:ro alpine stat -c%s {} 2>/dev/null || echo "0")

        # Flag anything over 100MB; guard the test against non-numeric output
        if [ "$log_size" -gt 104857600 ] 2>/dev/null; then
            local container_name
            container_name=$(docker inspect "$container" --format '{{.Name}}' 2>/dev/null | tr -d '/')
            log "Large log file: $container_name ($(($log_size / 1048576))MB)"
            large_logs=$((large_logs + 1))
        fi
    done 2>/dev/null || true

    if [ "$large_logs" -gt 0 ]; then
        log "Found $large_logs containers with large log files"
    fi
}

# Main execution
log "=== ManaCore Disk Space Check ==="

ALERT_STATUS=0

# Check system disk
check_disk "/" "System Disk" || ALERT_STATUS=$?

# Check ManaData volume (external SSD)
if [ -d "/Volumes/ManaData" ]; then
    check_disk "/Volumes/ManaData" "ManaData SSD" || ALERT_STATUS=$?
fi

# Check Docker disk usage
check_docker_disk

# FIX: check_docker_logs was defined but never invoked - call it here.
check_docker_logs || true

# Check backup sizes
check_postgres_backups

# Summary
log "=== Check Complete ==="

if [ "$ALERT_STATUS" -eq 2 ]; then
    log "Status: CRITICAL - Immediate action required"
    exit 2
elif [ "$ALERT_STATUS" -eq 1 ]; then
    log "Status: WARNING - Attention needed"
    exit 1
else
    log "Status: OK - All disks within thresholds"
    exit 0
fi
Monitors SSD health | Periodic | | `mana-stt` | Speech-to-text service (Whisper) | At login | | `mana-tts` | Text-to-speech service (Kokoro) | At login | diff --git a/scripts/mac-mini/launchd/com.manacore.disk-check.plist b/scripts/mac-mini/launchd/com.manacore.disk-check.plist new file mode 100644 index 000000000..35f7d31a9 --- /dev/null +++ b/scripts/mac-mini/launchd/com.manacore.disk-check.plist @@ -0,0 +1,34 @@ + + + + + Label + com.manacore.disk-check + + ProgramArguments + + /bin/bash + /Users/mana/projects/manacore-monorepo/scripts/mac-mini/check-disk-space.sh + + + + StartInterval + 3600 + + + RunAtLoad + + + StandardOutPath + /tmp/manacore-disk-check.log + + StandardErrorPath + /tmp/manacore-disk-check.error.log + + EnvironmentVariables + + PATH + /usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin + + + diff --git a/scripts/mac-mini/launchd/com.manacore.weekly-report.plist b/scripts/mac-mini/launchd/com.manacore.weekly-report.plist new file mode 100644 index 000000000..c1abf0e1a --- /dev/null +++ b/scripts/mac-mini/launchd/com.manacore.weekly-report.plist @@ -0,0 +1,37 @@ + + + + + Label + com.manacore.weekly-report + + ProgramArguments + + /bin/bash + /Users/mana/projects/manacore-monorepo/scripts/mac-mini/weekly-report.sh + + + + StartCalendarInterval + + Weekday + 0 + Hour + 10 + Minute + 0 + + + StandardOutPath + /tmp/manacore-weekly-report.log + + StandardErrorPath + /tmp/manacore-weekly-report.error.log + + EnvironmentVariables + + PATH + /usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin + + + diff --git a/scripts/mac-mini/weekly-report.sh b/scripts/mac-mini/weekly-report.sh new file mode 100755 index 000000000..b2dcf3ae9 --- /dev/null +++ b/scripts/mac-mini/weekly-report.sh @@ -0,0 +1,309 @@ +#!/bin/bash +# ManaCore Weekly Maintenance Report +# Generates a comprehensive system health summary +# +# Includes: +# - Backup status +# - Disk usage +# - Container health & restart counts +# - Database statistics +# - Error log summary +# +# Run via LaunchD every 
# Sunday at 10:00 AM

set -e

# LaunchD starts with a minimal PATH; make sure docker/curl are found.
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
LOG_FILE="/tmp/manacore-weekly-report.log"
REPORT_FILE="/tmp/manacore-weekly-report.txt"

# Load notification config if it exists (TELEGRAM_*)
if [ -f "$PROJECT_ROOT/.env.notifications" ]; then
    source "$PROJECT_ROOT/.env.notifications"
fi

# Load env for database password
if [ -f "$PROJECT_ROOT/.env.macmini" ]; then
    source "$PROJECT_ROOT/.env.macmini"
fi

# log MESSAGE - timestamped line to stdout and $LOG_FILE
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# send_notification MESSAGE - best-effort Telegram push; never fails the script
send_notification() {
    local message="$1"

    # Telegram caps messages at 4096 chars; truncate defensively so a long
    # report does not make the whole API call fail.
    message="${message:0:4000}"

    # FIX: use --data-urlencode for the text field - the report contains
    # newlines and emoji that a plain -d would pass unencoded.
    if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
        curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
            -d "chat_id=${TELEGRAM_CHAT_ID}" \
            --data-urlencode "text=${message}" \
            -d "parse_mode=HTML" \
            >/dev/null 2>&1 || true
    fi
}

# init_report - start a fresh report file with the header
init_report() {
    cat > "$REPORT_FILE" << EOF
📊 ManaCore Weekly Report
$(date '+%Y-%m-%d %H:%M')
━━━━━━━━━━━━━━━━━━━━━━

EOF
}

# add_section TITLE - blank line + section heading
add_section() {
    local title="$1"
    echo "" >> "$REPORT_FILE"
    echo "$title" >> "$REPORT_FILE"
}

# check_backups - summarize PostgreSQL backup counts, latest dump, total size
check_backups() {
    add_section "💾 Backup Status"

    local backup_dir="/Volumes/ManaData/backups/postgres"

    if [ ! -d "$backup_dir" ]; then
        echo "⚠ī¸ Backup directory not found" >> "$REPORT_FILE"
        return
    fi

    # Count recent backups
    local daily_count
    daily_count=$(find "$backup_dir/daily" -name "*.sql.gz" -mtime -7 2>/dev/null | wc -l | tr -d ' ')

    local weekly_count
    weekly_count=$(find "$backup_dir/weekly" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')

    # Newest daily dump; BSD stat first, GNU stat as fallback
    local latest_backup
    latest_backup=$(ls -t "$backup_dir/daily"/*.sql.gz 2>/dev/null | head -1)
    local latest_date=""
    if [ -n "$latest_backup" ]; then
        latest_date=$(stat -f "%Sm" -t "%Y-%m-%d %H:%M" "$latest_backup" 2>/dev/null || stat -c "%y" "$latest_backup" 2>/dev/null | cut -d'.' -f1)
    fi

    local total_size
    total_size=$(du -sh "$backup_dir" 2>/dev/null | awk '{print $1}')

    echo "Daily backups (7 days): $daily_count" >> "$REPORT_FILE"
    echo "Weekly backups: $weekly_count" >> "$REPORT_FILE"
    echo "Latest: $latest_date" >> "$REPORT_FILE"
    echo "Total size: $total_size" >> "$REPORT_FILE"

    # Zero-byte dumps indicate a failed backup run
    local empty_backups
    empty_backups=$(find "$backup_dir/daily" -name "*.sql.gz" -size 0 2>/dev/null | wc -l | tr -d ' ')
    if [ "$empty_backups" -gt 0 ]; then
        echo "⚠ī¸ $empty_backups empty backup files found!" >> "$REPORT_FILE"
    fi
}

# check_disk_usage - system disk, data SSD, and Docker usage
check_disk_usage() {
    add_section "đŸ’ŋ Disk Usage"

    local system_usage
    system_usage=$(df -h / 2>/dev/null | awk 'NR==2 {print $5 " used (" $4 " free)"}')
    echo "System: $system_usage" >> "$REPORT_FILE"

    if [ -d "/Volumes/ManaData" ]; then
        local data_usage
        data_usage=$(df -h "/Volumes/ManaData" 2>/dev/null | awk 'NR==2 {print $5 " used (" $4 " free)"}')
        echo "ManaData: $data_usage" >> "$REPORT_FILE"
    fi

    if docker info >/dev/null 2>&1; then
        local docker_images
        docker_images=$(docker system df --format '{{.Type}}: {{.Size}}' 2>/dev/null | head -3 | tr '\n' ',' | sed 's/,$//')
        echo "Docker: $docker_images" >> "$REPORT_FILE"
    fi
}

# check_containers - running/healthy counts and restart totals
check_containers() {
    add_section "đŸŗ Container Health"

    if ! docker info >/dev/null 2>&1; then
        echo "⚠ī¸ Docker not running" >> "$REPORT_FILE"
        return
    fi

    # Count containers by status
    local running
    running=$(docker ps -q 2>/dev/null | wc -l | tr -d ' ')

    local total
    total=$(docker ps -aq 2>/dev/null | wc -l | tr -d ' ')

    local healthy
    healthy=$(docker ps --filter "health=healthy" -q 2>/dev/null | wc -l | tr -d ' ')

    local unhealthy
    unhealthy=$(docker ps --filter "health=unhealthy" -q 2>/dev/null | wc -l | tr -d ' ')

    echo "Running: $running / $total" >> "$REPORT_FILE"
    echo "Healthy: $healthy" >> "$REPORT_FILE"

    if [ "$unhealthy" -gt 0 ]; then
        echo "⚠ī¸ Unhealthy: $unhealthy" >> "$REPORT_FILE"
        docker ps --filter "health=unhealthy" --format "  - {{.Names}}" 2>/dev/null >> "$REPORT_FILE"
    fi

    # NOTE(review): RestartCount is since container creation, not per-week.
    echo "" >> "$REPORT_FILE"
    echo "Top restarts:" >> "$REPORT_FILE"

    docker ps -a --format '{{.Names}} {{.Status}}' 2>/dev/null | \
        grep -E "mana-" | \
        while read name status; do
            local restarts
            restarts=$(docker inspect "$name" --format '{{.RestartCount}}' 2>/dev/null || echo "0")
            if [ "$restarts" -gt 0 ]; then
                echo "  $name: $restarts" >> "$REPORT_FILE"
            fi
        done

    # Sum restart counts across all containers
    local recent_restarts=0
    for container in $(docker ps -aq 2>/dev/null); do
        local restart_count
        restart_count=$(docker inspect "$container" --format '{{.RestartCount}}' 2>/dev/null || echo "0")
        if [ "$restart_count" -gt 0 ]; then
            recent_restarts=$((recent_restarts + restart_count))
        fi
    done

    if [ "$recent_restarts" -eq 0 ]; then
        echo "  None (stable week!)" >> "$REPORT_FILE"
    fi
}

# check_database - database sizes and active connection count via psql
check_database() {
    add_section "đŸ—„ī¸ Database"

    if ! docker ps --format '{{.Names}}' 2>/dev/null | grep -q "mana-infra-postgres"; then
        echo "⚠ī¸ PostgreSQL not running" >> "$REPORT_FILE"
        return
    fi

    # Five largest application databases
    local db_sizes
    db_sizes=$(docker exec mana-infra-postgres psql -U postgres -t -c "
        SELECT datname, pg_size_pretty(pg_database_size(datname))
        FROM pg_database
        WHERE datistemplate = false AND datname != 'postgres'
        ORDER BY pg_database_size(datname) DESC
        LIMIT 5;
    " 2>/dev/null | grep -v "^$" || echo "Could not fetch")

    echo "Database sizes:" >> "$REPORT_FILE"
    echo "$db_sizes" | while read line; do
        [ -n "$line" ] && echo "  $line" >> "$REPORT_FILE"
    done

    local connections
    connections=$(docker exec mana-infra-postgres psql -U postgres -t -c "
        SELECT count(*) FROM pg_stat_activity WHERE state = 'active';
    " 2>/dev/null | tr -d ' ' || echo "?")

    echo "Active connections: $connections" >> "$REPORT_FILE"
}

# check_errors - count error-ish log lines per container over the last 7 days
check_errors() {
    add_section "⚠ī¸ Recent Errors"

    local error_count=0
    local containers_with_errors=""

    # Scan each mana container's logs for the last 7 days (168h)
    for container in $(docker ps --format '{{.Names}}' 2>/dev/null | grep "^mana-"); do
        local errors
        # FIX: grep -c prints "0" itself on no match but exits 1, so the old
        # `|| echo "0"` produced two lines ("0\n0") and broke the -gt test.
        errors=$(docker logs "$container" --since 168h 2>&1 | grep -ci "error\|exception\|fatal" || true)
        errors=${errors:-0}

        if [ "$errors" -gt 10 ]; then
            containers_with_errors="$containers_with_errors\n  $container: $errors errors"
            error_count=$((error_count + errors))
        fi
    done

    if [ -z "$containers_with_errors" ]; then
        echo "No significant errors in the last week" >> "$REPORT_FILE"
    else
        echo "Total errors: $error_count" >> "$REPORT_FILE"
        echo -e "$containers_with_errors" >> "$REPORT_FILE"
    fi
}

# check_system - uptime, load, rough free memory (macOS)
check_system() {
    add_section "đŸ–Ĩī¸ System"

    local uptime_str
    uptime_str=$(uptime | sed 's/.*up //' | sed 's/,.*//')
    echo "Uptime: $uptime_str" >> "$REPORT_FILE"

    local load
    load=$(uptime | sed 's/.*load averages: //' | awk '{print $1 " " $2 " " $3}')
    echo "Load: $load" >> "$REPORT_FILE"

    # Rough free-memory estimate from vm_stat; page size differs between
    # Apple Silicon (16384) and Intel (4096), so ask pagesize.
    if command -v vm_stat &> /dev/null; then
        local pages_free
        pages_free=$(vm_stat | grep "Pages free" | awk '{print $3}' | tr -d '.')
        local page_size
        page_size=$(pagesize 2>/dev/null || echo "16384")
        local mem_free_gb
        mem_free_gb=$(echo "scale=1; $pages_free * $page_size / 1024 / 1024 / 1024" | bc 2>/dev/null || echo "?")
        echo "Memory free: ~${mem_free_gb}GB" >> "$REPORT_FILE"
    fi
}

# generate_summary - report footer
generate_summary() {
    echo "" >> "$REPORT_FILE"
    echo "━━━━━━━━━━━━━━━━━━━━━━" >> "$REPORT_FILE"
    echo "Generated by ManaCore" >> "$REPORT_FILE"
}

# Main execution
log "=== Generating Weekly Report ==="

init_report
check_backups
check_disk_usage
check_containers
check_database
check_errors
check_system
generate_summary

log "Report generated at $REPORT_FILE"

# Send report via Telegram
REPORT_CONTENT=$(cat "$REPORT_FILE")
send_notification "$REPORT_CONTENT"

log "Report sent via Telegram"
log "=== Weekly Report Complete ==="