From 4e370911e80f24971e1b3ccccc9e92f48487d1d2 Mon Sep 17 00:00:00 2001 From: Till JS Date: Mon, 30 Mar 2026 20:03:33 +0200 Subject: [PATCH] feat(monitoring): disk metrics via Pushgateway, Loki in Master Overview, Colima move script - check-disk-space.sh now pushes mac_disk_used_percent + mac_colima_disk_used_gb to Pushgateway every hour so vmalert can alert on real macOS disk usage - alerts.yml: replace broken node-exporter disk alerts with Pushgateway-based ones - master-overview.json: add "Recent Errors (Loki)" section with live error log stream, error rate timeseries and top error sources barchart - move-colima-to-external-ssd.sh: guided script to move 200GB Colima VM datadisk from internal SSD to /Volumes/ManaData (3.6TB external SSD) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../grafana/dashboards/master-overview.json | 85 ++++++++++++++ scripts/mac-mini/check-disk-space.sh | 58 +++++++++- .../mac-mini/move-colima-to-external-ssd.sh | 105 ++++++++++++++++++ 3 files changed, 244 insertions(+), 4 deletions(-) create mode 100644 scripts/mac-mini/move-colima-to-external-ssd.sh diff --git a/docker/grafana/dashboards/master-overview.json b/docker/grafana/dashboards/master-overview.json index 82a9d9d05..d45c1c5b9 100644 --- a/docker/grafana/dashboards/master-overview.json +++ b/docker/grafana/dashboards/master-overview.json @@ -1277,6 +1277,91 @@ ], "title": "New This Month", "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 99 }, + "id": 900, + "panels": [], + "title": "Recent Errors (Loki)", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 100 }, + "id": 901, + "title": "Errors across all services (last 30 min)", + "type": "logs", + "options": { + "showTime": true, + "showLabels": false, + "showCommonLabels": false, + "wrapLogMessage": false, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": 
# Base URL of the Prometheus Pushgateway; can be overridden from the environment.
PUSHGATEWAY_URL="${PUSHGATEWAY_URL:-http://localhost:9091}"

#######################################
# Escape a string so it can be used as a label value in the Prometheus
# text exposition format (backslash, double quote and newline).
# Arguments: $1 - raw label value
# Outputs:   escaped value on stdout (no trailing newline)
#######################################
_prom_escape_label() {
  local value=$1
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  printf '%s' "$value"
}

#######################################
# Send Prometheus text-format metrics read from stdin to the Pushgateway.
# Best effort: a missing or unreachable gateway must never fail the disk
# check itself, so the result of curl is deliberately ignored.
# Globals:   PUSHGATEWAY_URL (read)
# Arguments: $1 - grouping-key path below /metrics/ (e.g. "job/x/disk/y")
# Returns:   always 0
#######################################
_push_to_gateway() {
  local group_path=$1
  # --max-time keeps a hung gateway from blocking the periodic check.
  curl -s --max-time 10 --data-binary @- \
    "${PUSHGATEWAY_URL%/}/metrics/${group_path}" >/dev/null 2>&1 || true
}

#######################################
# Push usage metrics for one disk to the Pushgateway.
# Globals:   PUSHGATEWAY_URL (read, via _push_to_gateway)
# Arguments: $1 - disk label (used as "disk" label and grouping key)
#            $2 - mount point
#            $3 - usage in percent (numeric)
#            $4 - human-readable free space (label only)
#            $5 - free space in bytes (default 0)
#            $6 - total size in bytes (default 0)
# Returns:   always 0; nothing is pushed when $3 is not numeric
#######################################
push_disk_metrics() {
  local disk_label="$1"
  local mount_point="$2"
  local usage_pct="$3"
  local avail_human="$4"
  local avail_bytes="${5:-0}"
  local size_bytes="${6:-0}"

  # An empty or non-numeric sample makes the whole payload invalid and the
  # gateway would reject it without anyone noticing; skip it explicitly.
  case "$usage_pct" in
    '' | *[!0-9.]*)
      log "WARN: not pushing disk metrics for ${disk_label}: invalid usage '${usage_pct}'"
      return 0
      ;;
  esac

  local lbl_disk lbl_mount lbl_avail
  lbl_disk=$(_prom_escape_label "$disk_label")
  lbl_mount=$(_prom_escape_label "$mount_point")
  lbl_avail=$(_prom_escape_label "$avail_human")

  # The disk label is part of the grouping key so that pushes for different
  # disks do not replace each other's series.
  # NOTE(review): the original pipeline between `cat` and `/dev/null` was lost
  # in this copy of the patch; the job name "mac_disk" is an assumption and
  # must match whatever alerts.yml selects on.
  _push_to_gateway "job/mac_disk/disk/${disk_label}" <<PROMEOF
# HELP mac_disk_used_percent Disk usage percent on macOS host
# TYPE mac_disk_used_percent gauge
mac_disk_used_percent{disk="${lbl_disk}",mountpoint="${lbl_mount}",avail_human="${lbl_avail}"} ${usage_pct}
# HELP mac_disk_avail_bytes Disk available bytes on macOS host
# TYPE mac_disk_avail_bytes gauge
mac_disk_avail_bytes{disk="${lbl_disk}",mountpoint="${lbl_mount}"} ${avail_bytes}
# HELP mac_disk_size_bytes Total disk size bytes on macOS host
# TYPE mac_disk_size_bytes gauge
mac_disk_size_bytes{disk="${lbl_disk}",mountpoint="${lbl_mount}"} ${size_bytes}
PROMEOF
}

#######################################
# Push the on-disk size of the Colima VM datadisk to the Pushgateway.
# Does nothing when the datadisk file does not exist.
# Globals:   COLIMA_DATADISK (read, optional override of the datadisk path)
#            PUSHGATEWAY_URL (read, via _push_to_gateway)
# Outputs:   one log line with the measured size
# Returns:   always 0
#######################################
push_colima_metrics() {
  local colima_disk="${COLIMA_DATADISK:-/Users/mana/.colima/_lima/_disks/colima/datadisk}"
  [ -f "$colima_disk" ] || return 0

  # du -k reports allocated blocks, i.e. the real footprint of the file
  # rather than its apparent size.
  local used_kb
  used_kb=$(du -sk "$colima_disk" 2>/dev/null | awk '{print $1}')
  case "$used_kb" in
    '' | *[!0-9]*)
      # Pushing 0 here would look like an empty disk; report nothing instead.
      log "WARN: could not measure Colima VM disk at ${colima_disk}"
      return 0
      ;;
  esac

  # Pass the value with -v instead of splicing it into the awk program text.
  local used_gb
  used_gb=$(LC_ALL=C awk -v kb="$used_kb" 'BEGIN { printf "%.1f", kb / 1048576 }')

  # NOTE(review): job name "mac_colima" is an assumption (see push_disk_metrics).
  _push_to_gateway "job/mac_colima" <<PROMEOF
# HELP mac_colima_disk_used_gb Colima VM datadisk actual on-disk usage in GB
# TYPE mac_colima_disk_used_gb gauge
mac_colima_disk_used_gb ${used_gb}
PROMEOF
  log "Colima VM disk: ${used_gb}GB on disk"
}
#!/bin/bash
# Move the Colima VM disk directory from the internal SSD to the external
# ManaData SSD and leave a symlink at the original location.
#
# The Colima VM datadisk lives at ~/.colima/_lima/_disks/colima/datadisk
# and can grow to 200GB (sparse). Moving it to the 3.6TB external SSD
# prevents the internal SSD from filling up and crashing the server.
#
# Steps: stop Colima -> copy disk directory -> verify copy -> rename the
# original to a backup and symlink it -> start Colima -> start containers.
# The original data is kept as a backup; nothing is deleted by this script.
#
# Usage: bash scripts/mac-mini/move-colima-to-external-ssd.sh
# Run on the Mac Mini server, NOT via Claude Code.

# -e aborts on the first failing step. pipefail is essential here: almost
# every step is `cmd 2>&1 | tee -a "$LOG"`, and without it the pipeline would
# report tee's status, so a failed `colima stop` or `rsync` would go unnoticed
# and the script would carry on with a running VM or an incomplete copy.
set -eo pipefail
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

EXTERNAL_VOLUME="/Volumes/ManaData"
COLIMA_DISK_SRC="/Users/mana/.colima/_lima/_disks/colima"
COLIMA_DISK_DST="${EXTERNAL_VOLUME}/colima-disk"
COMPOSE_DIR="$HOME/projects/manacore-monorepo"
COMPOSE_FILE="docker-compose.macmini.yml"
LOG="/tmp/move-colima.log"
# Computed once so the rename and the closing message always agree, and with
# a time component so a second run on the same day cannot collide.
BACKUP_PATH="${COLIMA_DISK_SRC}.backup-$(date +%Y%m%d-%H%M%S)"

# Set to 1 once Colima has been stopped; drives the recovery hint below.
colima_stopped=0

# Print a timestamped message to stdout and append it to $LOG.
log() { printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$1" | tee -a "$LOG"; }

# Print an error message and abort with status 1.
die() {
  echo "ERROR: $*"
  exit 1
}

# EXIT handler: if the script fails after Colima was stopped, tell the
# operator which state the disk directory is in and how to get back.
print_recovery_hint() {
  local status=$?
  if [ "$status" -eq 0 ] || [ "$colima_stopped" -ne 1 ]; then
    return 0
  fi
  echo ""
  echo "FAILED with exit status ${status}. Colima may still be stopped."
  if [ -L "$COLIMA_DISK_SRC" ] && [ -d "$BACKUP_PATH" ]; then
    echo "To roll back:"
    echo "  rm \"$COLIMA_DISK_SRC\" && mv \"$BACKUP_PATH\" \"$COLIMA_DISK_SRC\" && colima start"
  elif [ -d "$BACKUP_PATH" ]; then
    echo "To roll back:"
    echo "  mv \"$BACKUP_PATH\" \"$COLIMA_DISK_SRC\" && colima start"
  else
    echo "The original disk at $COLIMA_DISK_SRC was not modified."
    echo "To resume service: colima start"
  fi
  echo "Log: $LOG"
}
trap print_recovery_hint EXIT

# ---------------------------------------------------------------------------
# Pre-flight checks
# ---------------------------------------------------------------------------
if [ ! -d "$EXTERNAL_VOLUME" ]; then
  echo "ERROR: /Volumes/ManaData not mounted. Plug in the external SSD first."
  exit 1
fi

# A directory under /Volumes can exist without a volume being mounted on it;
# copying into it would fill the internal SSD. Confirm it is a mount point.
# awk consumes all of df's output, so pipefail cannot trip on SIGPIPE.
if ! df -P "$EXTERNAL_VOLUME" 2>/dev/null \
  | awk -v mp="$EXTERNAL_VOLUME" 'NR == 2 && $NF == mp { ok = 1 } END { exit !ok }'; then
  die "$EXTERNAL_VOLUME exists but is not a mounted volume. Plug in the external SSD first."
fi

if [ -L "$COLIMA_DISK_SRC" ]; then
  echo "Colima disk is already a symlink — already moved. Current target:"
  readlink "$COLIMA_DISK_SRC"
  exit 0
fi

if [ ! -d "$COLIMA_DISK_SRC" ]; then
  die "Colima disk directory not found: $COLIMA_DISK_SRC"
fi

# `|| true`: a failing df/du must lead to the explicit space error below,
# not to a silent exit caused by set -e.
AVAIL_GB=$(df -g "$EXTERNAL_VOLUME" 2>/dev/null | awk 'NR==2 {print $4}') || true
DISK_GB=$(du -sg "$COLIMA_DISK_SRC" 2>/dev/null | awk '{print $1}') || true
log "Colima disk size: ~${DISK_GB}GB | ManaData free: ${AVAIL_GB}GB"

if [ "${DISK_GB:-0}" -ge "${AVAIL_GB:-0}" ]; then
  echo "ERROR: Not enough space on ManaData (need ${DISK_GB}GB, have ${AVAIL_GB}GB)"
  exit 1
fi

echo ""
echo "This will:"
echo "  1. Stop Colima (all Docker containers will stop)"
echo "  2. Copy ${COLIMA_DISK_SRC} → ${COLIMA_DISK_DST}"
echo "  3. Replace original with symlink"
echo "  4. Restart Colima"
echo "  5. Restart all Docker containers"
echo ""
echo "Estimated copy time: ~5-15 minutes depending on actual disk usage."
echo ""
confirm=""
read -r -p "Continue? (yes/no): " confirm || true
[ "$confirm" = "yes" ] || {
  echo "Aborted."
  exit 0
}

# ---------------------------------------------------------------------------
# Step 1: Stop Colima
# ---------------------------------------------------------------------------
log "Step 1: Stopping Colima..."
colima_stopped=1
colima stop 2>&1 | tee -a "$LOG"
log "Colima stopped."

# ---------------------------------------------------------------------------
# Step 2: Copy disk directory to external SSD
# ---------------------------------------------------------------------------
log "Step 2: Copying disk to /Volumes/ManaData/colima-disk ..."
mkdir -p "$COLIMA_DISK_DST"
# Try cp -c first, fall back to rsync if it fails.
# NOTE(review): source and destination are on different volumes, so an APFS
# clone cannot actually happen here; per cp(1) -c then degrades to a regular
# copy. The log line therefore no longer claims a clone took place.
if cp -c -r "$COLIMA_DISK_SRC/." "$COLIMA_DISK_DST/" 2>/dev/null; then
  log "Copied via cp -c."
else
  log "cp -c failed, using rsync..."
  rsync -ah --progress "$COLIMA_DISK_SRC/" "$COLIMA_DISK_DST/" 2>&1 | tee -a "$LOG"
fi

# Never swap in the symlink unless the datadisk arrived complete.
if [ -f "${COLIMA_DISK_SRC}/datadisk" ]; then
  src_size=$(stat -f %z "${COLIMA_DISK_SRC}/datadisk")
  dst_size=$(stat -f %z "${COLIMA_DISK_DST}/datadisk" 2>/dev/null || echo "missing")
  if [ "$src_size" != "$dst_size" ]; then
    die "Copy verification failed (datadisk: source ${src_size} bytes, destination ${dst_size})"
  fi
fi
log "Copy complete."

# ---------------------------------------------------------------------------
# Step 3: Replace with symlink
# ---------------------------------------------------------------------------
log "Step 3: Replacing original with symlink..."
# If the backup path already existed, mv would move the source INTO it.
if [ -e "$BACKUP_PATH" ]; then
  die "Backup path already exists: $BACKUP_PATH"
fi
mv "$COLIMA_DISK_SRC" "$BACKUP_PATH"
ln -s "$COLIMA_DISK_DST" "$COLIMA_DISK_SRC"
log "Symlink created: $COLIMA_DISK_SRC → $COLIMA_DISK_DST"

# ---------------------------------------------------------------------------
# Step 4: Start Colima
# ---------------------------------------------------------------------------
log "Step 4: Starting Colima..."
colima start 2>&1 | tee -a "$LOG"
log "Colima started."

# ---------------------------------------------------------------------------
# Step 5: Wait for Docker, then start containers
# ---------------------------------------------------------------------------
log "Step 5: Starting Docker containers..."
docker_ready=0
for attempt in {1..12}; do
  if docker info >/dev/null 2>&1; then
    docker_ready=1
    break
  fi
  log "Waiting for Docker... ($attempt/12)"
  sleep 5
done
# Do not fall through to `docker compose` against a daemon that never came up.
if [ "$docker_ready" -ne 1 ]; then
  die "Docker did not become ready within 60 seconds"
fi

cd "$COMPOSE_DIR" || die "Cannot cd to $COMPOSE_DIR"
docker compose -f "$COMPOSE_FILE" up -d 2>&1 | tail -5 | tee -a "$LOG"
log "Containers started."

# ---------------------------------------------------------------------------
# Step 6: Show the resulting state
# ---------------------------------------------------------------------------
log "Step 6: Verifying symlink..."
ls -lah "$COLIMA_DISK_SRC"
df -h "$EXTERNAL_VOLUME" | tail -1
df -h / | tail -1

echo ""
echo "Done. Old disk backed up at: ${BACKUP_PATH}"
echo "Once you've verified everything works, delete the backup:"
echo "  rm -rf ${COLIMA_DISK_SRC}.backup-*"
echo ""
echo "Log: $LOG"