managarten/scripts/mac-mini/check-disk-space.sh

#!/bin/bash
# ManaCore Disk Space Monitor
# Checks disk usage on system and data volumes
# Alerts via Telegram/ntfy when thresholds are exceeded
#
# Thresholds:
#   - Warning: 80%
#   - Critical: 90%
#
# Run via LaunchD hourly

# Stop at the first command failure that is not explicitly handled.
set -o errexit

# Prepend the /usr/local and Homebrew bin directories so required tools resolve.
PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
export PATH

# Alert levels, expressed as percent of capacity in use.
WARNING_THRESHOLD=80
CRITICAL_THRESHOLD=90

# Every log() line is appended to this file.
LOG_FILE=/tmp/manacore-disk-check.log

# Directory holding this script, and the project root two levels above it.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PROJECT_ROOT=$(cd "$SCRIPT_DIR/../.." && pwd)

# Optional notification settings (read later by send_notification).
# Sourced after the defaults above, so the file is able to override them.
if [[ -f "$PROJECT_ROOT/.env.notifications" ]]; then
    . "$PROJECT_ROOT/.env.notifications"
fi

# Writes a timestamped line to stdout and appends the same line to LOG_FILE.
#
# Globals (read): LOG_FILE
# Arguments:      $1 - message text
# Returns:        exit status of tee
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

# Sends an alert through every configured channel. Delivery is best effort:
# curl failures are ignored so a notification outage never fails the check.
#
# Globals (read):
#   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID - Telegram is used when both are non-empty
#   NTFY_TOPIC                           - ntfy.sh is used when non-empty
# Arguments:
#   $1 - message text (sent to Telegram with parse_mode=HTML)
#   $2 - priority: "default" (the default), "high" or "critical"
# Returns:
#   0 always
send_notification() {
    local message="$1"
    local priority="${2:-default}"

    # Telegram
    if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${TELEGRAM_CHAT_ID:-}" ]; then
        # --data-urlencode (not -d): the form body must be percent-encoded,
        # otherwise characters such as '%', '&' or '+' in the message
        # (e.g. "85%") corrupt or truncate the text field.
        # --max-time keeps a stalled connection from hanging the whole run.
        curl -s --max-time 15 -X POST \
            "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
            --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
            --data-urlencode "text=${message}" \
            --data-urlencode "parse_mode=HTML" \
            >/dev/null 2>&1 || true
    fi

    # ntfy
    if [ -n "${NTFY_TOPIC:-}" ]; then
        # Map the script's priority names onto ntfy's priority names.
        local ntfy_priority
        case "$priority" in
            high)     ntfy_priority="high" ;;
            critical) ntfy_priority="urgent" ;;
            *)        ntfy_priority="default" ;;
        esac

        # ntfy takes the raw request body as the message, so no encoding here.
        curl -s --max-time 15 -d "$message" \
            -H "Title: ManaCore Disk Alert" \
            -H "Priority: $ntfy_priority" \
            -H "Tags: warning" \
            "https://ntfy.sh/$NTFY_TOPIC" >/dev/null 2>&1 || true
    fi
}

# Checks one mounted volume against the warning/critical thresholds.
#
# Globals (read): WARNING_THRESHOLD, CRITICAL_THRESHOLD
# Arguments:
#   $1 - mount point to inspect (e.g. "/")
#   $2 - human-readable name used in log lines and alerts
# Outputs:
#   Log lines via log(); an alert via send_notification() when a threshold
#   is reached.
# Returns:
#   0 - usage is below WARNING_THRESHOLD
#   1 - usage reached WARNING_THRESHOLD, or the volume could not be inspected
#   2 - usage reached CRITICAL_THRESHOLD
check_disk() {
    local mount_point="$1"
    local name="$2"

    # Check if mount point exists
    if [ ! -d "$mount_point" ]; then
        log "WARNING: Mount point $mount_point does not exist"
        return 1
    fi

    # A single df call supplies both columns, so the logged free space and
    # the percentage always describe the same snapshot.
    # Columns 4 and 5 of `df -h` are the available space and the capacity.
    local df_fields
    df_fields=$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {print $4, $5}') || true

    local available="${df_fields%% *}"
    local usage="${df_fields##* }"
    usage="${usage%\%}"

    # Reject empty AND non-numeric values (e.g. "-"). Without this check the
    # integer comparisons below would error out and the volume would be
    # reported as healthy.
    if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
        log "ERROR: Could not get disk usage for $mount_point"
        return 1
    fi

    log "$name: ${usage}% used (${available} free)"

    # Check thresholds, most severe first
    if [ "$usage" -ge "$CRITICAL_THRESHOLD" ]; then
        log "CRITICAL: $name at ${usage}%!"
        send_notification "🚨 <b>CRITICAL: Disk Space</b>

<b>$name</b> is at <b>${usage}%</b>
Available: ${available}

Immediate action required!" "critical"
        return 2
    elif [ "$usage" -ge "$WARNING_THRESHOLD" ]; then
        log "WARNING: $name at ${usage}%"
        send_notification "⚠️ <b>WARNING: Disk Space</b>

<b>$name</b> is at <b>${usage}%</b>
Available: ${available}

Consider cleaning up old files." "high"
        return 1
    fi

    return 0
}

# Logs Docker's disk footprint and, when many dangling images or volumes have
# piled up AND the root filesystem is at the critical threshold, runs
# `docker system prune -f --volumes`.
#
# Globals (read): CRITICAL_THRESHOLD
# Outputs:        log lines via log()
# Returns:        0 when Docker is unavailable or no cleanup is needed
check_docker_disk() {
    # Nothing to inspect unless the docker CLI exists and the daemon answers.
    command -v docker >/dev/null 2>&1 || {
        log "Docker not found in PATH"
        return 0
    }
    docker info >/dev/null 2>&1 || {
        log "Docker is not running"
        return 0
    }

    # First line of `docker system df` sizes
    local first_size
    first_size=$(docker system df --format '{{.Size}}' 2>/dev/null | head -1)
    log "Docker disk usage: $first_size"

    # Count dangling images and dangling volumes
    local stale_images stale_volumes
    stale_images=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')
    stale_volumes=$(docker volume ls -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')

    # Below both limits: no recommendation, no cleanup.
    if ! { [ "$stale_images" -gt 10 ] || [ "$stale_volumes" -gt 5 ]; }; then
        return 0
    fi

    log "Docker cleanup recommended: $stale_images dangling images, $stale_volumes unused volumes"

    # Only prune automatically when the root filesystem is critically full.
    local root_pct
    root_pct=$(df -h / 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}')

    if [ "$root_pct" -ge "$CRITICAL_THRESHOLD" ]; then
        log "Running docker system prune due to critical disk usage..."
        docker system prune -f --volumes 2>/dev/null || true
        log "Docker cleanup completed"
    fi
}

# Logs the size of the PostgreSQL backup directory and notes how many daily
# backups are older than the age limit. Purely informational: nothing is
# deleted.
#
# Arguments:
#   $1 - backup directory (optional; default /Volumes/ManaData/backups/postgres)
#   $2 - age limit in days for "old" daily backups (optional; default 30)
# Outputs:
#   Log lines via log()
# Returns:
#   0 when the backup directory does not exist (nothing to report)
check_postgres_backups() {
    local backup_dir="${1:-/Volumes/ManaData/backups/postgres}"
    local max_age_days="${2:-30}"

    if [ ! -d "$backup_dir" ]; then
        return 0
    fi

    # Get backup directory size
    local backup_size
    backup_size=$(du -sh "$backup_dir" 2>/dev/null | awk '{print $1}')

    log "PostgreSQL backups: $backup_size"

    # Count *.sql.gz files under daily/ modified more than max_age_days ago.
    # find errors (e.g. a missing daily/ folder) are hidden, giving a count of 0.
    local old_backups
    old_backups=$(find "$backup_dir/daily" -name "*.sql.gz" -mtime "+${max_age_days}" 2>/dev/null | wc -l | tr -d ' ')

    if [ "$old_backups" -gt 0 ]; then
        log "Note: $old_backups old daily backups could be cleaned up"
    fi
}

# Reports running containers whose log file is larger than 100 MiB.
#
# The log path comes from `docker inspect`; its size is read by running
# `stat` inside a throw-away alpine container that mounts /var/lib/docker
# read-only (the original author notes that with Docker Desktop on macOS the
# logs live inside the VM rather than on the host).
#
# NOTE: this function is not called from the main section of this script.
#
# Outputs:
#   Log lines via log()
# Returns:
#   0 when Docker is unavailable or the scan completes
check_docker_logs() {
    if ! docker info >/dev/null 2>&1; then
        return 0
    fi

    local -r threshold_bytes=104857600 # 100 MiB
    local large_logs=0
    local container log_path log_size container_name

    # Container IDs contain no whitespace, so word-splitting the list is safe.
    for container in $(docker ps -q 2>/dev/null); do
        # The container may have vanished since `docker ps`; just skip it.
        log_path=$(docker inspect "$container" --format '{{.LogPath}}' 2>/dev/null) || continue
        [ -n "$log_path" ] || continue

        log_size=$(docker run --rm -v /var/lib/docker:/var/lib/docker:ro alpine \
            stat -c%s "$log_path" 2>/dev/null || echo "0")

        # Validate explicitly instead of hiding comparison errors: anything
        # that is not a plain byte count is treated as "size unknown".
        if [[ "$log_size" =~ ^[0-9]+$ ]] && [ "$log_size" -gt "$threshold_bytes" ]; then
            container_name=$(docker inspect "$container" --format '{{.Name}}' 2>/dev/null | tr -d '/')
            log "Large log file: $container_name ($((log_size / 1048576))MB)"
            large_logs=$((large_logs + 1))
        fi
    done

    if [ "$large_logs" -gt 0 ]; then
        log "Found $large_logs containers with large log files"
    fi
}

# Main execution
log "=== ManaCore Disk Space Check ==="

# Highest severity seen across all volumes (0 = OK, 1 = warning, 2 = critical).
ALERT_STATUS=0

# Runs check_disk with the given arguments and raises ALERT_STATUS when the
# result is more severe than anything recorded so far. A later, milder result
# (e.g. a WARNING after a CRITICAL) must never lower the final exit code.
# Always returns 0 so `set -e` does not abort the run on an alert.
record_disk_check() {
    local status=0
    check_disk "$@" || status=$?
    if [ "$status" -gt "$ALERT_STATUS" ]; then
        ALERT_STATUS=$status
    fi
}

# Check system disk
# NOTE(review): on APFS-based macOS releases "/" may be the read-only system
# volume, whose capacity figure may not include user data - confirm whether
# /System/Volumes/Data should be checked instead.
record_disk_check "/" "System Disk"

# Check ManaData volume (external SSD)
if [ -d "/Volumes/ManaData" ]; then
    record_disk_check "/Volumes/ManaData" "ManaData SSD"
fi

# Check Docker disk usage
check_docker_disk

# Check backup sizes
check_postgres_backups

# Summary
log "=== Check Complete ==="

# The exit code mirrors the worst status: 2 = critical, 1 = warning, 0 = OK.
if [ "$ALERT_STATUS" -eq 2 ]; then
    log "Status: CRITICAL - Immediate action required"
    exit 2
elif [ "$ALERT_STATUS" -eq 1 ]; then
    log "Status: WARNING - Attention needed"
    exit 1
else
    log "Status: OK - All disks within thresholds"
    exit 0
fi