feat(monitoring): add alerting stack and maintenance scripts

Medium priority stability improvements:

Alerting:
- Add vmalert for evaluating Prometheus alert rules
- Add alertmanager for alert routing and grouping
- Add alert-notifier service for Telegram/ntfy notifications
- Enable cadvisor scraping in prometheus config

Disk Monitoring:
- Add check-disk-space.sh for hourly disk monitoring
- Alert on 80% (warning) and 90% (critical) thresholds
- Auto-cleanup Docker when disk is critical
- Add com.manacore.disk-check.plist for LaunchD

Weekly Reports:
- Add weekly-report.sh for system health summary
- Includes: backup status, disk usage, container health,
  database stats, error log summary
- Runs every Sunday at 10 AM via LaunchD

Health Check Updates:
- Add checks for vmalert, alertmanager, alert-notifier

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Till-JS 2026-02-12 13:46:57 +01:00
parent 02a5172c7c
commit acc8de36ee
11 changed files with 996 additions and 10 deletions

View file

@ -1431,6 +1431,11 @@ services:
- /dev/disk/:/dev/disk:ro - /dev/disk/:/dev/disk:ro
ports: ports:
- "9110:8080" - "9110:8080"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"]
interval: 30s
timeout: 10s
retries: 3
postgres-exporter: postgres-exporter:
image: prometheuscommunity/postgres-exporter:v0.15.0 image: prometheuscommunity/postgres-exporter:v0.15.0
@ -1481,6 +1486,80 @@ services:
timeout: 10s timeout: 10s
retries: 3 retries: 3
# ============================================
  # Alerting Stack (vmalert :8880, Alertmanager :9093, notifier :9095)
  # ============================================

  # Evaluates Prometheus-style alert rules against VictoriaMetrics and
  # forwards firing alerts to Alertmanager.
  vmalert:
    image: victoriametrics/vmalert:v1.99.0
    container_name: mana-mon-vmalert
    restart: always
    depends_on:
      victoriametrics:
        condition: service_healthy
      alertmanager:
        condition: service_healthy
    command:
      - '-datasource.url=http://victoriametrics:9090'
      - '-notifier.url=http://alertmanager:9093'
      - '-remoteWrite.url=http://victoriametrics:9090'
      - '-remoteRead.url=http://victoriametrics:9090'
      - '-rule=/etc/alerts/*.yml'
      - '-evaluationInterval=30s'
      - '-httpListenAddr=:8880'
    volumes:
      # Single rules file mounted into the -rule glob directory above.
      - ./docker/prometheus/alerts.yml:/etc/alerts/alerts.yml:ro
    ports:
      - "8880:8880"
    healthcheck:
      # NOTE(review): confirm the vmalert image ships wget/busybox —
      # VictoriaMetrics images are minimal and this check would fail
      # permanently if the binary is absent.
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8880/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Routes, groups and de-duplicates alerts; delivers them to the
  # alert-notifier webhook (see docker/alertmanager/alertmanager.yml).
  alertmanager:
    image: prom/alertmanager:v0.27.0
    container_name: mana-mon-alertmanager
    restart: always
    depends_on:
      alert-notifier:
        condition: service_healthy
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.listen-address=:9093'
    volumes:
      - ./docker/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    ports:
      - "9093:9093"
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9093/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Stdlib-only Python webhook receiver that fans alerts out to
  # Telegram / ntfy (see docker/alert-notifier/main.py).
  alert-notifier:
    build:
      context: ./docker/alert-notifier
      dockerfile: Dockerfile
    image: alert-notifier:local
    container_name: mana-mon-alert-notifier
    restart: always
    environment:
      PORT: 8080
      # Credentials are optional; the notifier no-ops per channel when unset.
      TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
      TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID:-}
      NTFY_TOPIC: ${NTFY_TOPIC:-}
    ports:
      - "9095:8080"
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/health"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 5s
# ============================================ # ============================================
# Auto-Update (Watchtower) # Auto-Update (Watchtower)
# ============================================ # ============================================
@ -1508,6 +1587,8 @@ volumes:
name: mana-redis-data name: mana-redis-data
victoriametrics_data: victoriametrics_data:
name: mana-victoria-data name: mana-victoria-data
alertmanager_data:
name: mana-alertmanager-data
grafana_data: grafana_data:
name: mana-grafana-data name: mana-grafana-data
analytics_data: analytics_data:

View file

@ -0,0 +1,17 @@
# Alert-notifier image: tiny stdlib-only webhook receiver (see main.py).
FROM python:3.12-alpine

WORKDIR /app

COPY main.py .

# No dependencies needed - uses only stdlib
RUN chmod +x main.py

EXPOSE 8080

# Default listen port; can be overridden at runtime via the PORT env var.
ENV PORT=8080

# Container-level liveness probe (busybox wget is present in alpine).
# NOTE(review): probe targets 8080 even if PORT is overridden — confirm
# that is acceptable.  The compose file also defines an equivalent
# healthcheck, which takes precedence when both are present.
HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
    CMD wget --no-verbose --tries=1 --spider http://127.0.0.1:8080/health || exit 1

CMD ["python", "main.py"]

View file

@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
Alert Notifier - Webhook receiver for Alertmanager
Forwards alerts to Telegram and ntfy
Environment Variables:
TELEGRAM_BOT_TOKEN - Telegram bot token
TELEGRAM_CHAT_ID - Telegram chat ID
NTFY_TOPIC - ntfy.sh topic name (optional)
"""
import os
import json
import logging
from http.server import HTTPServer, BaseHTTPRequestHandler
import urllib.request
import urllib.parse
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
TELEGRAM_BOT_TOKEN = os.environ.get('TELEGRAM_BOT_TOKEN', '')
TELEGRAM_CHAT_ID = os.environ.get('TELEGRAM_CHAT_ID', '')
NTFY_TOPIC = os.environ.get('NTFY_TOPIC', '')
SEVERITY_EMOJI = {
'critical': '🚨',
'warning': '⚠️',
'info': '',
}
def format_alert_telegram(alert: dict, status: str) -> str:
    """Format a single Alertmanager alert as a Telegram HTML message.

    Args:
        alert: One entry of the webhook payload's ``alerts`` list.
        status: Alert status, e.g. ``"firing"`` or ``"resolved"``.

    Returns:
        HTML-formatted text suitable for Telegram's ``parse_mode=HTML``.
    """
    import html  # local import: stdlib escaping for parse_mode=HTML

    labels = alert.get('labels', {})
    annotations = alert.get('annotations', {})

    severity = labels.get('severity', 'unknown')
    emoji = SEVERITY_EMOJI.get(severity, '🔔')
    if status == 'resolved':
        emoji = ''

    # Escape user-controlled label/annotation text: a raw '<', '>' or '&'
    # in a summary/description would make Telegram reject the whole
    # message as malformed HTML and the notification would be lost.
    alertname = html.escape(labels.get('alertname', 'Unknown'))
    job = html.escape(labels.get('job', ''))
    summary = html.escape(annotations.get('summary', labels.get('alertname', 'Unknown')))
    description = html.escape(annotations.get('description', ''))

    msg = f"{emoji} <b>{status.upper()}: {summary}</b>\n"
    if job:
        msg += f"Service: <code>{job}</code>\n"
    if description:
        msg += f"{description}\n"
    return msg
def send_telegram(message: str) -> bool:
    """Deliver ``message`` to the configured Telegram chat.

    Returns:
        True on HTTP 200; False when credentials are missing or the
        request fails for any reason (the error is logged, not raised).
    """
    if not (TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID):
        logger.warning("Telegram not configured")
        return False
    try:
        endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
        form = urllib.parse.urlencode({
            'chat_id': TELEGRAM_CHAT_ID,
            'text': message,
            'parse_mode': 'HTML',
            'disable_web_page_preview': True,
        }).encode()
        request = urllib.request.Request(
            endpoint,
            data=form,
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            return response.status == 200
    except Exception as exc:
        logger.error(f"Telegram send failed: {exc}")
        return False
def send_ntfy(title: str, message: str, priority: str = 'default') -> bool:
    """Publish ``message`` to the configured ntfy.sh topic.

    Returns:
        True on HTTP 200; False when no topic is configured or the
        request fails (the error is logged, not raised).
    """
    if not NTFY_TOPIC:
        return False
    # Map Alertmanager severities onto ntfy priority levels.
    ntfy_priority = {
        'critical': 'urgent',
        'warning': 'high',
        'info': 'low',
    }.get(priority, 'default')
    tags = 'warning' if priority == 'critical' else 'loudspeaker'
    try:
        request = urllib.request.Request(
            f"https://ntfy.sh/{NTFY_TOPIC}",
            data=message.encode('utf-8'),
            headers={
                'Title': title,
                'Priority': ntfy_priority,
                'Tags': tags,
            },
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            return response.status == 200
    except Exception as exc:
        logger.error(f"ntfy send failed: {exc}")
        return False
class AlertHandler(BaseHTTPRequestHandler):
    """Minimal HTTP endpoint for Alertmanager webhooks.

    POST /webhook  - accepts an Alertmanager webhook payload (JSON) and
                     fans the alerts out to Telegram and/or ntfy.
    GET  /health   - liveness probe used by the container healthcheck.
    """

    def do_POST(self):
        # Only the Alertmanager webhook path is accepted.
        if self.path != '/webhook':
            self.send_response(404)
            self.end_headers()
            return
        # Read exactly the declared body size (0 if the header is absent).
        content_length = int(self.headers.get('Content-Length', 0))
        body = self.rfile.read(content_length)
        try:
            payload = json.loads(body)
            self.process_alerts(payload)
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b'OK')
        except Exception as e:
            # Any parse/dispatch failure -> 500 with the error text, so
            # Alertmanager treats the delivery as failed and retries.
            logger.error(f"Error processing webhook: {e}")
            self.send_response(500)
            self.end_headers()
            self.wfile.write(str(e).encode())

    def do_GET(self):
        # Liveness endpoint; any other path is a 404.
        if self.path == '/health':
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b'OK')
        else:
            self.send_response(404)
            self.end_headers()

    def process_alerts(self, payload: dict):
        """Format all alerts in the payload, determine the highest
        severity, and send one combined message per configured channel."""
        status = payload.get('status', 'unknown')
        alerts = payload.get('alerts', [])
        if not alerts:
            return
        logger.info(f"Received {len(alerts)} alerts with status: {status}")

        # Build one message per alert; track the worst severity seen so the
        # ntfy priority reflects the most urgent alert in the batch.
        messages = []
        highest_severity = 'info'
        for alert in alerts:
            # Per-alert status (firing/resolved) wins over the group status.
            msg = format_alert_telegram(alert, alert.get('status', status))
            messages.append(msg)
            severity = alert.get('labels', {}).get('severity', 'info')
            if severity == 'critical':
                highest_severity = 'critical'
            elif severity == 'warning' and highest_severity != 'critical':
                highest_severity = 'warning'
        combined_message = '\n'.join(messages)

        # Send notifications (each channel is independent and best-effort).
        if TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID:
            success = send_telegram(combined_message)
            logger.info(f"Telegram: {'sent' if success else 'failed'}")
        if NTFY_TOPIC:
            title = f"ManaCore Alert ({len(alerts)} alerts)"
            # Strip HTML for ntfy (plain-text channel).
            plain_message = combined_message.replace('<b>', '').replace('</b>', '')
            plain_message = plain_message.replace('<code>', '').replace('</code>', '')
            success = send_ntfy(title, plain_message, highest_severity)
            logger.info(f"ntfy: {'sent' if success else 'failed'}")

    def log_message(self, format, *args):
        # Route BaseHTTPRequestHandler's access log through our logger.
        logger.info(f"{self.client_address[0]} - {format % args}")
def main():
    """Entry point: log configuration state, bind the server, serve forever."""
    port = int(os.environ.get('PORT', 8080))
    telegram_ready = bool(TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID)
    ntfy_ready = bool(NTFY_TOPIC)
    logger.info(f"Starting Alert Notifier on port {port}")
    logger.info(f"Telegram configured: {telegram_ready}")
    logger.info(f"ntfy configured: {ntfy_ready}")
    httpd = HTTPServer(('0.0.0.0', port), AlertHandler)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        logger.info("Shutting down")
        httpd.shutdown()


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,62 @@
# Alertmanager Configuration for ManaCore
# Sends alerts via webhook to custom notification handler
#
# NOTE(review): 'match'/'source_match' are deprecated since Alertmanager
# v0.22 in favour of 'matchers'; v0.27 still accepts them, but plan the
# migration.

global:
  resolve_timeout: 5m

route:
  # Default receiver for all alerts
  receiver: 'webhook'
  # Group alerts by severity and service
  group_by: ['alertname', 'severity', 'job']
  # Wait before sending first notification
  group_wait: 30s
  # Wait before sending follow-up notifications for same group
  group_interval: 5m
  # Wait before re-sending resolved alerts
  repeat_interval: 4h

  routes:
    # Critical alerts - immediate notification
    - match:
        severity: critical
      receiver: 'webhook'
      group_wait: 10s
      repeat_interval: 1h

    # Warning alerts - less frequent
    - match:
        severity: warning
      receiver: 'webhook'
      group_wait: 1m
      repeat_interval: 6h

    # Info alerts - long batching window, re-notified at most daily.
    # (No time-based muting is configured here; any "business hours"
    # behaviour would require time_intervals.)
    - match:
        severity: info
      receiver: 'webhook'
      group_wait: 5m
      repeat_interval: 24h

receivers:
  - name: 'webhook'
    webhook_configs:
      - url: 'http://alert-notifier:8080/webhook'
        send_resolved: true
        # Cap payload size per notification; extra alerts are dropped
        # from the webhook body.
        max_alerts: 10

# Inhibition rules - prevent redundant alerts
inhibit_rules:
  # Don't alert on warnings if critical is firing if same service
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'job']

  # Don't alert on service-specific issues if PostgreSQL is down.
  # equal: [] means the inhibition applies regardless of shared labels.
  - source_match:
      alertname: 'PostgreSQLDown'
    target_match_re:
      alertname: '.*(Backend|Service).*'
    equal: []

View file

@ -9,11 +9,11 @@ global:
rule_files: rule_files:
- /etc/prometheus/alerts.yml - /etc/prometheus/alerts.yml
# Alertmanager configuration (optional, for future use) # Alertmanager configuration
# alerting: alerting:
# alertmanagers: alertmanagers:
# - static_configs: - static_configs:
# - targets: [] - targets: ['alertmanager:9093']
scrape_configs: scrape_configs:
# Prometheus self-monitoring # Prometheus self-monitoring
@ -30,10 +30,10 @@ scrape_configs:
target_label: instance target_label: instance
replacement: 'mac-mini' replacement: 'mac-mini'
# Docker container metrics via cAdvisor (disabled - container not deployed) # Docker container metrics via cAdvisor
# - job_name: 'cadvisor' - job_name: 'cadvisor'
# static_configs: static_configs:
# - targets: ['cadvisor:8080'] - targets: ['cadvisor:8080']
# PostgreSQL metrics # PostgreSQL metrics
- job_name: 'postgres' - job_name: 'postgres'

View file

@ -0,0 +1,233 @@
#!/bin/bash
# ManaCore Disk Space Monitor
# Checks disk usage on system and data volumes
# Alerts via Telegram/ntfy when thresholds are exceeded
#
# Thresholds:
# - Warning: 80%
# - Critical: 90%
#
# Run via LaunchD hourly

# Abort on the first unhandled failure; calls that may legitimately fail
# are guarded below with "|| ...".
set -e

# Ensure PATH includes required tools
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# NOTE(review): /tmp log path is predictable/world-writable — fine for a
# single-user Mac mini, but confirm that is acceptable.
LOG_FILE="/tmp/manacore-disk-check.log"

# Thresholds
WARNING_THRESHOLD=80
CRITICAL_THRESHOLD=90

# Load notification config if exists
# (presumably provides TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID, NTFY_TOPIC —
# verify against .env.notifications)
if [ -f "$PROJECT_ROOT/.env.notifications" ]; then
    source "$PROJECT_ROOT/.env.notifications"
fi

# Append a timestamped line to stdout and the log file.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}
send_notification() {
    # Best-effort fan-out of an alert message to Telegram and ntfy.
    # $1 = message text, $2 = priority (default/high/critical).
    # Each channel is skipped silently when unconfigured; curl failures
    # are deliberately swallowed so monitoring never aborts the check.
    local msg="$1"
    local prio="${2:-default}"

    if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
        curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
            -d "chat_id=${TELEGRAM_CHAT_ID}" \
            -d "text=${msg}" \
            -d "parse_mode=HTML" \
            >/dev/null 2>&1 || true
    fi

    if [ -n "$NTFY_TOPIC" ]; then
        local ntfy_prio
        case "$prio" in
            high)     ntfy_prio="high" ;;
            critical) ntfy_prio="urgent" ;;
            *)        ntfy_prio="default" ;;
        esac
        curl -s -d "$msg" \
            -H "Title: ManaCore Disk Alert" \
            -H "Priority: $ntfy_prio" \
            -H "Tags: warning" \
            "https://ntfy.sh/$NTFY_TOPIC" >/dev/null 2>&1 || true
    fi
}
check_disk() {
    # Check one mount point against the warning/critical thresholds.
    # $1 = mount point, $2 = human-readable name.
    # Returns 0 = OK, 1 = warning (or unreadable), 2 = critical.
    local mp="$1"
    local label="$2"

    if [ ! -d "$mp" ]; then
        log "WARNING: Mount point $mp does not exist"
        return 1
    fi

    # Usage percentage without the '%' sign (macOS-compatible df parsing).
    local pct
    pct=$(df -h "$mp" 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}')
    if [ -z "$pct" ]; then
        log "ERROR: Could not get disk usage for $mp"
        return 1
    fi

    local free
    free=$(df -h "$mp" 2>/dev/null | awk 'NR==2 {print $4}')
    log "$label: ${pct}% used (${free} free)"

    if [ "$pct" -ge "$CRITICAL_THRESHOLD" ]; then
        log "CRITICAL: $label at ${pct}%!"
        send_notification "🚨 <b>CRITICAL: Disk Space</b>
<b>$label</b> is at <b>${pct}%</b>
Available: ${free}
Immediate action required!" "critical"
        return 2
    fi

    if [ "$pct" -ge "$WARNING_THRESHOLD" ]; then
        log "WARNING: $label at ${pct}%"
        send_notification "⚠️ <b>WARNING: Disk Space</b>
<b>$label</b> is at <b>${pct}%</b>
Available: ${free}
Consider cleaning up old files." "high"
        return 1
    fi

    return 0
}
check_docker_disk() {
    # Report Docker disk consumption and prune automatically when the
    # system disk is critically full.  A silent no-op when Docker is
    # unavailable so the hourly run never fails on a stopped daemon.
    if ! command -v docker &> /dev/null; then
        log "Docker not found in PATH"
        return 0
    fi
    if ! docker info >/dev/null 2>&1; then
        log "Docker is not running"
        return 0
    fi

    # Get Docker disk usage (first line of `docker system df` = images total)
    local docker_usage
    docker_usage=$(docker system df --format '{{.Size}}' 2>/dev/null | head -1)
    log "Docker disk usage: $docker_usage"

    # Check for dangling images and unused volumes
    local dangling_images
    dangling_images=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')
    local unused_volumes
    unused_volumes=$(docker volume ls -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')

    if [ "$dangling_images" -gt 10 ] || [ "$unused_volumes" -gt 5 ]; then
        log "Docker cleanup recommended: $dangling_images dangling images, $unused_volumes unused volumes"

        # Auto-cleanup if the system disk is critical
        local system_usage
        system_usage=$(df -h / 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}')
        if [ "$system_usage" -ge "$CRITICAL_THRESHOLD" ]; then
            log "Running docker system prune due to critical disk usage..."
            # Deliberately NOT passing --volumes: an unattended volume
            # prune deletes any named volume whose container happens to be
            # stopped at that moment (e.g. a database mid-restart).
            # Volumes are flagged above for manual cleanup instead.
            docker system prune -f 2>/dev/null || true
            log "Docker cleanup completed"
        fi
    fi
}
check_postgres_backups() {
    # Report backup directory size and flag stale daily dumps.
    # No-op when the backup volume is not mounted.
    local dir="/Volumes/ManaData/backups/postgres"
    [ -d "$dir" ] || return 0

    local size
    size=$(du -sh "$dir" 2>/dev/null | awk '{print $1}')
    log "PostgreSQL backups: $size"

    # Daily dumps older than 30 days are cleanup candidates.
    local stale
    stale=$(find "$dir/daily" -name "*.sql.gz" -mtime +30 2>/dev/null | wc -l | tr -d ' ')
    if [ "$stale" -gt 0 ]; then
        log "Note: $stale old daily backups could be cleaned up"
    fi
}
# NOTE(review): this helper is defined but never invoked by the main
# section below — either wire it into the main flow or delete it.
check_docker_logs() {
    # Check for large Docker log files
    local docker_logs_dir="/var/lib/docker/containers"

    # On macOS with Docker Desktop, logs are in the VM
    # We can check via docker inspect instead
    if ! docker info >/dev/null 2>&1; then
        return 0
    fi

    # Get containers with largest log sizes
    local large_logs=0
    for container in $(docker ps -q 2>/dev/null); do
        local log_size
        # NOTE(review): this spawns one alpine container per running
        # container just to stat the log file, and stat may return an
        # empty/non-numeric value — the "-gt" below relies on 2>/dev/null
        # to swallow the resulting comparison error.
        log_size=$(docker inspect "$container" --format '{{.LogPath}}' 2>/dev/null | xargs -I {} docker run --rm -v /var/lib/docker:/var/lib/docker:ro alpine stat -c%s {} 2>/dev/null || echo "0")
        # Convert to MB (if size > 100MB, flag it)
        if [ "$log_size" -gt 104857600 ] 2>/dev/null; then
            local container_name
            container_name=$(docker inspect "$container" --format '{{.Name}}' 2>/dev/null | tr -d '/')
            log "Large log file: $container_name ($(($log_size / 1048576))MB)"
            large_logs=$((large_logs + 1))
        fi
    done 2>/dev/null || true

    if [ "$large_logs" -gt 0 ]; then
        log "Found $large_logs containers with large log files"
    fi
}
# Main execution
log "=== ManaCore Disk Space Check ==="

ALERT_STATUS=0

# Record the worst (highest) status seen so far.  The previous
# "ALERT_STATUS=$?" overwrite let a later WARNING (1) silently downgrade
# an earlier CRITICAL (2).
note_status() {
    local rv="$1"
    if [ "$rv" -gt "$ALERT_STATUS" ]; then
        ALERT_STATUS=$rv
    fi
}

# Check system disk
check_disk "/" "System Disk" || note_status $?

# Check ManaData volume (external SSD)
if [ -d "/Volumes/ManaData" ]; then
    check_disk "/Volumes/ManaData" "ManaData SSD" || note_status $?
fi

# Check Docker disk usage
check_docker_disk

# Check backup sizes
check_postgres_backups

# Summary — exit code doubles as the machine-readable result for LaunchD.
log "=== Check Complete ==="
if [ "$ALERT_STATUS" -eq 2 ]; then
    log "Status: CRITICAL - Immediate action required"
    exit 2
elif [ "$ALERT_STATUS" -eq 1 ]; then
    log "Status: WARNING - Attention needed"
    exit 1
else
    log "Status: OK - All disks within thresholds"
    exit 0
fi

View file

@ -275,6 +275,12 @@ check_service "Grafana" "http://localhost:8000/api/health"
check_service "Umami" "http://localhost:8010/api/heartbeat" check_service "Umami" "http://localhost:8010/api/heartbeat"
check_service "VictoriaMetrics" "http://localhost:9090/health" check_service "VictoriaMetrics" "http://localhost:9090/health"
echo ""
echo "Alerting:"
check_service "vmalert" "http://localhost:8880/health"
check_service "Alertmanager" "http://localhost:9093/-/healthy"
check_service "Alert Notifier" "http://localhost:9095/health"
echo "" echo ""
echo "Cloudflare Tunnel:" echo "Cloudflare Tunnel:"
if pgrep -x "cloudflared" >/dev/null; then if pgrep -x "cloudflared" >/dev/null; then

View file

@ -17,8 +17,11 @@ for f in *.plist; do launchctl load ~/Library/LaunchAgents/$f; done
| Service | Description | Interval | | Service | Description | Interval |
|---------|-------------|----------| |---------|-------------|----------|
| `docker-startup` | Starts Docker containers on boot | At login | | `docker-startup` | Starts Docker containers on boot | At login |
| `ensure-containers` | Detects and restarts stuck containers | Every 5 min | | `ensure-containers` | Detects and restarts stuck/crash-looping containers | Every 5 min |
| `health-check` | Checks all services and sends alerts | Every 5 min | | `health-check` | Checks all services and sends alerts | Every 5 min |
| `backup-databases` | PostgreSQL backup with daily/weekly rotation | Daily 3 AM |
| `disk-check` | Monitors disk space, alerts on thresholds | Hourly |
| `weekly-report` | Generates system health summary | Sunday 10 AM |
| `ssd-check` | Monitors SSD health | Periodic | | `ssd-check` | Monitors SSD health | Periodic |
| `mana-stt` | Speech-to-text service (Whisper) | At login | | `mana-stt` | Speech-to-text service (Whisper) | At login |
| `mana-tts` | Text-to-speech service (Kokoro) | At login | | `mana-tts` | Text-to-speech service (Kokoro) | At login |

View file

@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- LaunchAgent: hourly disk-space check (runs check-disk-space.sh). -->
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>com.manacore.disk-check</string>
    <key>ProgramArguments</key>
    <array>
        <string>/bin/bash</string>
        <!-- NOTE(review): absolute path is machine-specific; this agent
             breaks if the repo is checked out elsewhere. -->
        <string>/Users/mana/projects/manacore-monorepo/scripts/mac-mini/check-disk-space.sh</string>
    </array>
    <!-- Run hourly -->
    <key>StartInterval</key>
    <integer>3600</integer>
    <!-- Also run at startup -->
    <key>RunAtLoad</key>
    <true/>
    <!-- Same path the script itself tees into (LOG_FILE). -->
    <key>StandardOutPath</key>
    <string>/tmp/manacore-disk-check.log</string>
    <key>StandardErrorPath</key>
    <string>/tmp/manacore-disk-check.error.log</string>
    <key>EnvironmentVariables</key>
    <dict>
        <key>PATH</key>
        <string>/usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin</string>
    </dict>
</dict>
</plist>

View file

@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- LaunchAgent: weekly health summary (runs weekly-report.sh). -->
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>com.manacore.weekly-report</string>
    <key>ProgramArguments</key>
    <array>
        <string>/bin/bash</string>
        <!-- NOTE(review): absolute path is machine-specific; this agent
             breaks if the repo is checked out elsewhere. -->
        <string>/Users/mana/projects/manacore-monorepo/scripts/mac-mini/weekly-report.sh</string>
    </array>
    <!-- Run every Sunday at 10:00 AM (launchd Weekday: 0 and 7 = Sunday) -->
    <key>StartCalendarInterval</key>
    <dict>
        <key>Weekday</key>
        <integer>0</integer>
        <key>Hour</key>
        <integer>10</integer>
        <key>Minute</key>
        <integer>0</integer>
    </dict>
    <key>StandardOutPath</key>
    <string>/tmp/manacore-weekly-report.log</string>
    <key>StandardErrorPath</key>
    <string>/tmp/manacore-weekly-report.error.log</string>
    <key>EnvironmentVariables</key>
    <dict>
        <key>PATH</key>
        <string>/usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin</string>
    </dict>
</dict>
</plist>

309
scripts/mac-mini/weekly-report.sh Executable file
View file

@ -0,0 +1,309 @@
#!/bin/bash
# ManaCore Weekly Maintenance Report
# Generates a comprehensive system health summary
#
# Includes:
# - Backup status
# - Disk usage
# - Container health & restart counts
# - Database statistics
# - Error log summary
#
# Run via LaunchD every Sunday at 10:00 AM

# Abort on the first unhandled failure; best-effort calls are guarded
# with "|| ..." below.
set -e

# Ensure PATH includes required tools
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
LOG_FILE="/tmp/manacore-weekly-report.log"
# The rendered report is kept on disk as well as being sent via Telegram.
REPORT_FILE="/tmp/manacore-weekly-report.txt"

# Load notification config if exists
if [ -f "$PROJECT_ROOT/.env.notifications" ]; then
    source "$PROJECT_ROOT/.env.notifications"
fi

# Load env for database password
# NOTE(review): sourcing the whole .env.macmini imports every variable,
# not just the DB password — confirm nothing sensitive leaks into child
# process environments.
if [ -f "$PROJECT_ROOT/.env.macmini" ]; then
    source "$PROJECT_ROOT/.env.macmini"
fi

# Append a timestamped line to stdout and the log file.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}
send_notification() {
    # Best-effort Telegram delivery of $1; a silent no-op unless both
    # credentials are configured.  curl failures are swallowed so report
    # generation never aborts on a network hiccup.
    local text="$1"
    if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
        curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
            -d "chat_id=${TELEGRAM_CHAT_ID}" \
            -d "text=${text}" \
            -d "parse_mode=HTML" \
            >/dev/null 2>&1 || true
    fi
}
# Initialize report
init_report() {
    # Start a fresh report file: title, timestamp, separator (3 lines).
    {
        printf '📊 <b>ManaCore Weekly Report</b>\n'
        printf '%s\n' "$(date '+%Y-%m-%d %H:%M')"
        printf '━━━━━━━━━━━━━━━━━━━━━━\n'
    } > "$REPORT_FILE"
}
# Add section to report
add_section() {
    # Append a blank line and a bold section title.
    printf '\n<b>%s</b>\n' "$1" >> "$REPORT_FILE"
}
# Check backup status
check_backups() {
    # Summarise PostgreSQL backup freshness, counts and total size.
    add_section "💾 Backup Status"

    local dir="/Volumes/ManaData/backups/postgres"
    if [ ! -d "$dir" ]; then
        echo "⚠️ Backup directory not found" >> "$REPORT_FILE"
        return
    fi

    # Dumps created in the last 7 days / retained weekly sets.
    local daily_n
    daily_n=$(find "$dir/daily" -name "*.sql.gz" -mtime -7 2>/dev/null | wc -l | tr -d ' ')
    local weekly_n
    weekly_n=$(find "$dir/weekly" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')

    # Most recent daily dump and its mtime (BSD stat first, GNU fallback).
    local newest
    newest=$(ls -t "$dir/daily"/*.sql.gz 2>/dev/null | head -1)
    local newest_date=""
    if [ -n "$newest" ]; then
        newest_date=$(stat -f "%Sm" -t "%Y-%m-%d %H:%M" "$newest" 2>/dev/null || stat -c "%y" "$newest" 2>/dev/null | cut -d'.' -f1)
    fi

    local total
    total=$(du -sh "$dir" 2>/dev/null | awk '{print $1}')

    {
        echo "Daily backups (7 days): $daily_n"
        echo "Weekly backups: $weekly_n"
        echo "Latest: $newest_date"
        echo "Total size: $total"
    } >> "$REPORT_FILE"

    # Zero-byte dumps indicate a broken backup job.
    local empty_n
    empty_n=$(find "$dir/daily" -name "*.sql.gz" -size 0 2>/dev/null | wc -l | tr -d ' ')
    if [ "$empty_n" -gt 0 ]; then
        echo "⚠️ $empty_n empty backup files found!" >> "$REPORT_FILE"
    fi
}
# Check disk usage
check_disk_usage() {
    # Report system / data-volume / Docker disk consumption.
    add_section "💿 Disk Usage"

    local root_line
    root_line=$(df -h / 2>/dev/null | awk 'NR==2 {print $5 " used (" $4 " free)"}')
    echo "System: $root_line" >> "$REPORT_FILE"

    # External SSD, when mounted.
    if [ -d "/Volumes/ManaData" ]; then
        local data_line
        data_line=$(df -h "/Volumes/ManaData" 2>/dev/null | awk 'NR==2 {print $5 " used (" $4 " free)"}')
        echo "ManaData: $data_line" >> "$REPORT_FILE"
    fi

    # Docker usage summary (first three type rows), when the daemon is up.
    if docker info >/dev/null 2>&1; then
        local docker_line
        docker_line=$(docker system df --format '{{.Type}}: {{.Size}}' 2>/dev/null | head -3 | tr '\n' ', ' | sed 's/,$//')
        echo "Docker: $docker_line" >> "$REPORT_FILE"
    fi
}
# Check container health: counts, health states and restart totals.
check_containers() {
    add_section "🐳 Container Health"

    if ! docker info >/dev/null 2>&1; then
        echo "⚠️ Docker not running" >> "$REPORT_FILE"
        return
    fi

    # Count containers by status
    local running
    running=$(docker ps -q 2>/dev/null | wc -l | tr -d ' ')
    local total
    total=$(docker ps -aq 2>/dev/null | wc -l | tr -d ' ')
    local healthy
    healthy=$(docker ps --filter "health=healthy" -q 2>/dev/null | wc -l | tr -d ' ')
    local unhealthy
    unhealthy=$(docker ps --filter "health=unhealthy" -q 2>/dev/null | wc -l | tr -d ' ')

    echo "Running: $running / $total" >> "$REPORT_FILE"
    echo "Healthy: $healthy" >> "$REPORT_FILE"

    if [ "$unhealthy" -gt 0 ]; then
        echo "⚠️ Unhealthy: $unhealthy" >> "$REPORT_FILE"
        # List unhealthy containers
        docker ps --filter "health=unhealthy" --format " - {{.Names}}" 2>/dev/null >> "$REPORT_FILE"
    fi

    # mana-* containers with a non-zero RestartCount.
    # NOTE(review): RestartCount is cumulative since container creation,
    # not "this week" — it only resets when a container is recreated.
    echo "" >> "$REPORT_FILE"
    echo "Top restarts:" >> "$REPORT_FILE"
    # The while-loop runs in a pipeline subshell; that is fine here
    # because it appends to the report file rather than setting variables.
    docker ps -a --format '{{.Names}} {{.Status}}' 2>/dev/null | \
    grep -E "mana-" | \
    while read name status; do
        local restarts
        restarts=$(docker inspect "$name" --format '{{.RestartCount}}' 2>/dev/null || echo "0")
        if [ "$restarts" -gt 0 ]; then
            echo " $name: $restarts" >> "$REPORT_FILE"
        fi
    done

    # Second pass over ALL containers (not just mana-*) to decide whether
    # the period was restart-free.  Same cumulative-count caveat as above.
    local recent_restarts=0
    for container in $(docker ps -aq 2>/dev/null); do
        local restart_count
        restart_count=$(docker inspect "$container" --format '{{.RestartCount}}' 2>/dev/null || echo "0")
        if [ "$restart_count" -gt 0 ]; then
            recent_restarts=$((recent_restarts + restart_count))
        fi
    done
    if [ "$recent_restarts" -eq 0 ]; then
        echo " None (stable week!)" >> "$REPORT_FILE"
    fi
}
# Check database health: per-database sizes and active connection count,
# gathered via docker exec into the postgres container.
check_database() {
    add_section "🗄️ Database"

    if ! docker ps --format '{{.Names}}' 2>/dev/null | grep -q "mana-infra-postgres"; then
        echo "⚠️ PostgreSQL not running" >> "$REPORT_FILE"
        return
    fi

    # Five largest non-template databases ('postgres' itself excluded).
    local db_sizes
    db_sizes=$(docker exec mana-infra-postgres psql -U postgres -t -c "
        SELECT datname, pg_size_pretty(pg_database_size(datname))
        FROM pg_database
        WHERE datistemplate = false AND datname != 'postgres'
        ORDER BY pg_database_size(datname) DESC
        LIMIT 5;
    " 2>/dev/null | grep -v "^$" || echo "Could not fetch")

    echo "Database sizes:" >> "$REPORT_FILE"
    echo "$db_sizes" | while read line; do
        [ -n "$line" ] && echo " $line" >> "$REPORT_FILE"
    done

    # Currently active (not idle) backends.
    local connections
    connections=$(docker exec mana-infra-postgres psql -U postgres -t -c "
        SELECT count(*) FROM pg_stat_activity WHERE state = 'active';
    " 2>/dev/null | tr -d ' ' || echo "?")
    echo "Active connections: $connections" >> "$REPORT_FILE"
}
# Check for errors in logs: scan running mana-* container logs over the
# report window and list the noisy containers.
check_errors() {
    add_section "⚠️ Recent Errors"

    local error_count=0
    local containers_with_errors=""

    # Check each mana container for errors in the last 7 days (168h).
    for container in $(docker ps --format '{{.Names}}' 2>/dev/null | grep "^mana-"); do
        local errors
        # grep -c already prints "0" when nothing matches (while exiting
        # non-zero), so guard only the exit status with "|| true".  The
        # previous "|| echo 0" appended a second line, producing a
        # two-line "0\n0" value that broke the -gt comparison below.
        errors=$(docker logs "$container" --since 168h 2>&1 | grep -ci "error\|exception\|fatal") || true
        errors=${errors:-0}
        if [ "$errors" -gt 10 ]; then
            containers_with_errors="$containers_with_errors\n $container: $errors errors"
            error_count=$((error_count + errors))
        fi
    done

    if [ -z "$containers_with_errors" ]; then
        echo "No significant errors in the last week" >> "$REPORT_FILE"
    else
        echo "Total errors: $error_count" >> "$REPORT_FILE"
        echo -e "$containers_with_errors" >> "$REPORT_FILE"
    fi
}
# Check uptime and system resources
check_system() {
    # Uptime, load average and (macOS only) a rough free-memory figure.
    add_section "🖥️ System"

    local up
    up=$(uptime | sed 's/.*up //' | sed 's/,.*//')
    echo "Uptime: $up" >> "$REPORT_FILE"

    local loadavg
    loadavg=$(uptime | sed 's/.*load averages: //' | awk '{print $1 " " $2 " " $3}')
    echo "Load: $loadavg" >> "$REPORT_FILE"

    # vm_stat is macOS-specific; skip silently elsewhere.
    if command -v vm_stat &> /dev/null; then
        local free_pages
        free_pages=$(vm_stat | grep "Pages free" | awk '{print $3}' | tr -d '.')
        local active_pages
        active_pages=$(vm_stat | grep "Pages active" | awk '{print $3}' | tr -d '.')
        # Page size is 16384 on Apple Silicon, 4096 on Intel.
        local psize
        psize=$(pagesize 2>/dev/null || echo "16384")
        local free_gb
        free_gb=$(echo "scale=1; $free_pages * $psize / 1024 / 1024 / 1024" | bc 2>/dev/null || echo "?")
        echo "Memory free: ~${free_gb}GB" >> "$REPORT_FILE"
    fi
}
# Generate summary
generate_summary() {
    # Footer: separator line plus attribution.
    {
        echo ""
        echo "━━━━━━━━━━━━━━━━━━━━━━"
        echo "<i>Generated by ManaCore</i>"
    } >> "$REPORT_FILE"
}
# Main execution
log "=== Generating Weekly Report ==="

init_report
check_backups
check_disk_usage
check_containers
check_database
check_errors
check_system
generate_summary

log "Report generated at $REPORT_FILE"

# Send report via Telegram.
# Telegram's sendMessage rejects text over 4096 characters, so a long
# report would previously be dropped silently (curl errors are swallowed
# inside send_notification).  Truncate defensively; the full report stays
# on disk at $REPORT_FILE.  NOTE: truncation could still split an HTML
# tag, which Telegram may reject — best-effort, but far less likely.
REPORT_CONTENT=$(cat "$REPORT_FILE")
if [ "${#REPORT_CONTENT}" -gt 4000 ]; then
    REPORT_CONTENT="${REPORT_CONTENT:0:4000}
…(truncated — full report: $REPORT_FILE)"
fi
send_notification "$REPORT_CONTENT"
log "Report sent via Telegram"

log "=== Weekly Report Complete ==="