mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-28 09:37:43 +02:00
feat(monitoring): disk metrics via Pushgateway, Loki in Master Overview, Colima move script
- check-disk-space.sh now pushes mac_disk_used_percent + mac_colima_disk_used_gb to Pushgateway every hour so vmalert can alert on real macOS disk usage
- alerts.yml: replace broken node-exporter disk alerts with Pushgateway-based ones
- master-overview.json: add "Recent Errors (Loki)" section with live error log stream, error rate timeseries and top error sources barchart
- move-colima-to-external-ssd.sh: guided script to move the 200GB Colima VM datadisk from the internal SSD to /Volumes/ManaData (3.6TB external SSD)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
be1096ec85
commit
4e370911e8
3 changed files with 244 additions and 4 deletions
|
|
@ -1277,6 +1277,91 @@
|
||||||
],
|
],
|
||||||
"title": "New This Month",
|
"title": "New This Month",
|
||||||
"type": "stat"
|
"type": "stat"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 99 },
|
||||||
|
"id": 900,
|
||||||
|
"panels": [],
|
||||||
|
"title": "Recent Errors (Loki)",
|
||||||
|
"type": "row"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 100 },
|
||||||
|
"id": 901,
|
||||||
|
"title": "Errors across all services (last 30 min)",
|
||||||
|
"type": "logs",
|
||||||
|
"options": {
|
||||||
|
"showTime": true,
|
||||||
|
"showLabels": false,
|
||||||
|
"showCommonLabels": false,
|
||||||
|
"wrapLogMessage": false,
|
||||||
|
"prettifyLogMessage": false,
|
||||||
|
"enableLogDetails": true,
|
||||||
|
"dedupStrategy": "none",
|
||||||
|
"sortOrder": "Descending"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"expr": "{tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\" | tier != \"other\"",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"links": [
|
||||||
|
{
|
||||||
|
"title": "Open Logs Explorer",
|
||||||
|
"url": "/d/logs-explorer",
|
||||||
|
"targetBlank": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"drawStyle": "bars",
|
||||||
|
"fillOpacity": 70,
|
||||||
|
"stacking": { "group": "A", "mode": "normal" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 12, "x": 0, "y": 108 },
|
||||||
|
"id": 902,
|
||||||
|
"title": "Error Rate by Service (last 1h)",
|
||||||
|
"type": "timeseries",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"expr": "sum by (service) (count_over_time({tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\" [$__interval]))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "displayMode": "gradient", "showValue": "auto" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 12, "x": 12, "y": 108 },
|
||||||
|
"id": 903,
|
||||||
|
"title": "Top Error Sources (last 1h)",
|
||||||
|
"type": "barchart",
|
||||||
|
"options": { "xTickLabelRotation": -45 },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"expr": "topk(8, sum by (service) (count_over_time({tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\" [1h])))",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"refresh": "30s",
|
"refresh": "30s",
|
||||||
|
|
|
||||||
|
|
@ -58,9 +58,48 @@ send_notification() {
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PUSHGATEWAY_URL="${PUSHGATEWAY_URL:-http://localhost:9091}"
|
||||||
|
|
||||||
|
#######################################
# Push usage gauges for one disk to the Prometheus Pushgateway.
#
# The push is best-effort: any curl failure is swallowed so that a missing
# or unreachable Pushgateway never breaks the disk check itself.
#
# Globals:
#   PUSHGATEWAY_URL (read) - base URL; falls back to http://localhost:9091
# Arguments:
#   $1 - disk label: value of the `disk` label and grouping key in the push URL
#   $2 - mount point of the disk
#   $3 - usage in percent (value of mac_disk_used_percent)
#   $4 - human-readable free space, exported as the `avail_human` label
#   $5 - available bytes (optional, default 0)
#   $6 - total size in bytes (optional, default 0)
# Returns:
#   0 always
#######################################
push_disk_metrics() {
  local disk_label="$1"
  local mount_point="$2"
  local usage_pct="$3"
  local avail_human="$4"
  local avail_bytes="${5:-0}"
  local size_bytes="${6:-0}"
  local base_url="${PUSHGATEWAY_URL:-http://localhost:9091}"

  # Label values in the text exposition format must have backslash,
  # double quote and newline escaped; an unescaped quote makes the
  # Pushgateway reject the whole payload. Newlines are flattened to spaces.
  local escape_script='s/\\/\\\\/g; s/"/\\"/g'
  local lbl_disk lbl_mount lbl_avail
  lbl_disk=$(printf '%s' "$disk_label" | tr '\n' ' ' | sed -e "$escape_script")
  lbl_mount=$(printf '%s' "$mount_point" | tr '\n' ' ' | sed -e "$escape_script")
  lbl_avail=$(printf '%s' "$avail_human" | tr '\n' ' ' | sed -e "$escape_script")

  # Timeouts keep a hung Pushgateway from blocking the periodic check;
  # "|| true" keeps the push best-effort.
  cat <<PROMEOF | curl -s --connect-timeout 5 --max-time 10 --data-binary @- "${base_url}/metrics/job/mac_disk/disk/${disk_label}" >/dev/null 2>&1 || true
# HELP mac_disk_used_percent Disk usage percent on macOS host
# TYPE mac_disk_used_percent gauge
mac_disk_used_percent{disk="${lbl_disk}",mountpoint="${lbl_mount}",avail_human="${lbl_avail}"} ${usage_pct}
# HELP mac_disk_avail_bytes Disk available bytes on macOS host
# TYPE mac_disk_avail_bytes gauge
mac_disk_avail_bytes{disk="${lbl_disk}",mountpoint="${lbl_mount}"} ${avail_bytes}
# HELP mac_disk_size_bytes Total disk size bytes on macOS host
# TYPE mac_disk_size_bytes gauge
mac_disk_size_bytes{disk="${lbl_disk}",mountpoint="${lbl_mount}"} ${size_bytes}
PROMEOF
}
|
||||||
|
|
||||||
|
#######################################
# Push the on-disk size of the Colima VM datadisk to the Pushgateway.
#
# Size is taken from `du -sk` and converted to GB with one decimal place.
# Does nothing when the datadisk file is absent. When `du` yields no usable
# number the push is skipped, so a measurement failure is not reported as
# "0.0 GB used".
#
# Globals:
#   COLIMA_DATADISK (read) - optional override of the datadisk path
#   PUSHGATEWAY_URL (read) - base URL; falls back to http://localhost:9091
# Outputs:
#   One line via log() describing the measured size (or the skipped push)
# Returns:
#   0 when the datadisk is absent or its size cannot be read; otherwise the
#   status of the final log() call
#######################################
push_colima_metrics() {
  local colima_disk="${COLIMA_DATADISK:-/Users/mana/.colima/_lima/_disks/colima/datadisk}"
  local base_url="${PUSHGATEWAY_URL:-http://localhost:9091}"
  [ -f "$colima_disk" ] || return 0

  local used_kb
  used_kb=$(du -sk "$colima_disk" 2>/dev/null | awk '{print $1}')
  case "$used_kb" in
    '' | *[!0-9]*)
      log "Colima VM disk: could not determine size of ${colima_disk}, skipping metric push"
      return 0
      ;;
  esac

  # Pass the value with -v instead of splicing it into the awk program.
  # LC_ALL=C pins the decimal separator to "." as the exposition format needs.
  local used_gb
  used_gb=$(LC_ALL=C awk -v kb="$used_kb" 'BEGIN { printf "%.1f", kb / 1048576 }')

  # Best-effort push with timeouts so a hung Pushgateway cannot stall the job.
  cat <<PROMEOF | curl -s --connect-timeout 5 --max-time 10 --data-binary @- "${base_url}/metrics/job/mac_disk/disk/colima" >/dev/null 2>&1 || true
# HELP mac_colima_disk_used_gb Colima VM datadisk actual on-disk usage in GB
# TYPE mac_colima_disk_used_gb gauge
mac_colima_disk_used_gb ${used_gb}
PROMEOF
  log "Colima VM disk: ${used_gb}GB on disk"
}
|
||||||
|
|
||||||
check_disk() {
|
check_disk() {
|
||||||
local mount_point="$1"
|
local mount_point="$1"
|
||||||
local name="$2"
|
local name="$2"
|
||||||
|
local disk_label="${3:-}"
|
||||||
|
|
||||||
# Check if mount point exists
|
# Check if mount point exists
|
||||||
if [ ! -d "$mount_point" ]; then
|
if [ ! -d "$mount_point" ]; then
|
||||||
|
|
@ -77,12 +116,20 @@ check_disk() {
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Get available space
|
# Get available and total space
|
||||||
local available
|
local available
|
||||||
available=$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {print $4}')
|
available=$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {print $4}')
|
||||||
|
local avail_bytes size_bytes
|
||||||
|
avail_bytes=$(df "$mount_point" 2>/dev/null | awk 'NR==2 {print $4 * 512}')
|
||||||
|
size_bytes=$(df "$mount_point" 2>/dev/null | awk 'NR==2 {print $2 * 512}')
|
||||||
|
|
||||||
log "$name: ${usage}% used (${available} free)"
|
log "$name: ${usage}% used (${available} free)"
|
||||||
|
|
||||||
|
# Push metrics to Pushgateway → Prometheus → vmalert → Telegram
|
||||||
|
if [ -n "$disk_label" ]; then
|
||||||
|
push_disk_metrics "$disk_label" "$mount_point" "$usage" "$available" "${avail_bytes:-0}" "${size_bytes:-0}"
|
||||||
|
fi
|
||||||
|
|
||||||
# Check thresholds
|
# Check thresholds
|
||||||
if [ "$usage" -ge "$CRITICAL_THRESHOLD" ]; then
|
if [ "$usage" -ge "$CRITICAL_THRESHOLD" ]; then
|
||||||
log "CRITICAL: $name at ${usage}%!"
|
log "CRITICAL: $name at ${usage}%!"
|
||||||
|
|
@ -204,14 +251,17 @@ log "=== ManaCore Disk Space Check ==="
|
||||||
|
|
||||||
ALERT_STATUS=0
|
ALERT_STATUS=0
|
||||||
|
|
||||||
# Check system disk
|
# Check system disk (internal SSD)
|
||||||
check_disk "/" "System Disk" || ALERT_STATUS=$?
|
check_disk "/" "System Disk" "internal" || ALERT_STATUS=$?
|
||||||
|
|
||||||
# Check ManaData volume (external SSD)
|
# Check ManaData volume (external SSD)
|
||||||
if [ -d "/Volumes/ManaData" ]; then
|
if [ -d "/Volumes/ManaData" ]; then
|
||||||
check_disk "/Volumes/ManaData" "ManaData SSD" || ALERT_STATUS=$?
|
check_disk "/Volumes/ManaData" "ManaData SSD" "manaData" || ALERT_STATUS=$?
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Push Colima VM disk metrics
|
||||||
|
push_colima_metrics
|
||||||
|
|
||||||
# Check Docker disk usage
|
# Check Docker disk usage
|
||||||
check_docker_disk
|
check_docker_disk
|
||||||
|
|
||||||
|
|
|
||||||
105
scripts/mac-mini/move-colima-to-external-ssd.sh
Normal file
105
scripts/mac-mini/move-colima-to-external-ssd.sh
Normal file
|
|
@ -0,0 +1,105 @@
|
||||||
|
#!/bin/bash
# Move Colima VM datadisk from internal SSD to external ManaData SSD
#
# The Colima VM datadisk lives at ~/.colima/_lima/_disks/colima/datadisk
# and can grow to 200GB (sparse). Moving it to the 3.6TB external SSD
# prevents the internal SSD from filling up and crashing the server.
#
# Usage: bash scripts/mac-mini/move-colima-to-external-ssd.sh
# Run on the Mac Mini server, NOT via Claude Code.
#
# Flow: pre-flight checks -> stop Colima -> copy disk directory ->
# rename original to a dated backup and symlink it to the copy ->
# start Colima -> start containers -> print verification info.
# The original directory is only renamed after the copy succeeded, and it
# is never deleted by this script.

set -e
# Most steps are piped through tee for logging; without pipefail the exit
# status of colima/rsync/docker would be replaced by tee's (always 0).
set -o pipefail
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

COLIMA_DISK_SRC="/Users/mana/.colima/_lima/_disks/colima"
COLIMA_DISK_DST="/Volumes/ManaData/colima-disk"
LOG="/tmp/move-colima.log"
# Computed once so the rename and the final message agree even if the run
# crosses midnight.
BACKUP="${COLIMA_DISK_SRC}.backup-$(date +%Y%m%d)"

# Print a timestamped message and append it to the log file.
log() { echo "[$(date '+%H:%M:%S')] $1" | tee -a "$LOG"; }

# Log an error to stderr (and the log file) and abort.
die() {
  log "ERROR: $1" >&2
  exit 1
}

# Succeed only if $1 is a non-empty string of decimal digits.
is_uint() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

# --- Pre-flight checks -------------------------------------------------------
if [ ! -d "/Volumes/ManaData" ]; then
  die "/Volumes/ManaData not mounted. Plug in the external SSD first."
fi

if [ -L "$COLIMA_DISK_SRC" ]; then
  echo "Colima disk is already a symlink — already moved. Current target:"
  readlink "$COLIMA_DISK_SRC"
  exit 0
fi

if [ ! -d "$COLIMA_DISK_SRC" ]; then
  die "Colima disk directory not found: $COLIMA_DISK_SRC"
fi

# If the backup already existed, mv would move the source *into* it instead
# of renaming, so refuse before anything is stopped.
if [ -e "$BACKUP" ]; then
  die "Backup path already exists: $BACKUP (remove or rename it first)"
fi

# "|| true": a failing df/du must reach the explicit validation below
# instead of aborting silently under set -e / pipefail.
AVAIL_GB=$(df -g /Volumes/ManaData 2>/dev/null | awk 'NR==2 {print $4}') || true
DISK_GB=$(du -sg "$COLIMA_DISK_SRC" 2>/dev/null | awk '{print $1}') || true
if ! is_uint "$AVAIL_GB" || ! is_uint "$DISK_GB"; then
  die "Could not determine disk sizes (source: '${DISK_GB}', free: '${AVAIL_GB}')"
fi
log "Colima disk size: ~${DISK_GB}GB | ManaData free: ${AVAIL_GB}GB"

# NOTE(review): du reports on-disk blocks of the sparse datadisk; a copy
# that does not preserve sparseness may need up to the full apparent size.
if [ "$DISK_GB" -ge "$AVAIL_GB" ]; then
  die "Not enough space on ManaData (need ${DISK_GB}GB, have ${AVAIL_GB}GB)"
fi

echo ""
echo "This will:"
echo "  1. Stop Colima (all Docker containers will stop)"
echo "  2. Copy ${COLIMA_DISK_SRC} → ${COLIMA_DISK_DST}"
echo "  3. Replace original with symlink"
echo "  4. Restart Colima"
echo "  5. Restart all Docker containers"
echo ""
echo "Estimated copy time: ~5-15 minutes depending on actual disk usage."
echo ""
read -r -p "Continue? (yes/no): " confirm
[ "$confirm" = "yes" ] || {
  echo "Aborted."
  exit 0
}

# --- Step 1: Stop Colima -----------------------------------------------------
log "Step 1: Stopping Colima..."
colima stop 2>&1 | tee -a "$LOG" \
  || log "WARNING: 'colima stop' exited non-zero; checking VM state..."
# Copying the disk of a running VM would produce a corrupt image.
# Assumes `colima status` exits non-zero when the VM is not running.
if colima status >/dev/null 2>&1; then
  die "Colima is still running; refusing to copy a live disk."
fi
log "Colima stopped."

# --- Step 2: Copy disk directory to external SSD -----------------------------
log "Step 2: Copying disk to /Volumes/ManaData/colima-disk ..."
mkdir -p "$COLIMA_DISK_DST"
# Use cp -c for APFS clone-copy (fast) or rsync as fallback
if cp -c -r "$COLIMA_DISK_SRC/." "$COLIMA_DISK_DST/" 2>/dev/null; then
  log "Copied via APFS clone (fast)."
else
  log "APFS clone not available (cross-volume), using rsync..."
  rsync -ah --progress "$COLIMA_DISK_SRC/" "$COLIMA_DISK_DST/" 2>&1 | tee -a "$LOG" \
    || die "rsync failed; original disk left untouched at $COLIMA_DISK_SRC. Run 'colima start' to bring the VM back up."
fi

# Sanity check before touching the original: the datadisk must have arrived.
if [ -e "$COLIMA_DISK_SRC/datadisk" ] && [ ! -e "$COLIMA_DISK_DST/datadisk" ]; then
  die "Copy incomplete: $COLIMA_DISK_DST/datadisk is missing. Original left untouched."
fi
log "Copy complete."

# --- Step 3: Replace with symlink --------------------------------------------
log "Step 3: Replacing original with symlink..."
mv "$COLIMA_DISK_SRC" "$BACKUP"
if ! ln -s "$COLIMA_DISK_DST" "$COLIMA_DISK_SRC"; then
  # Roll back so Colima still finds its disk at the original location.
  mv "$BACKUP" "$COLIMA_DISK_SRC"
  die "Could not create symlink; original directory restored."
fi
log "Symlink created: $COLIMA_DISK_SRC → $COLIMA_DISK_DST"

# --- Step 4: Start Colima ----------------------------------------------------
log "Step 4: Starting Colima..."
colima start 2>&1 | tee -a "$LOG" \
  || die "colima start failed. To roll back: rm \"$COLIMA_DISK_SRC\" && mv \"$BACKUP\" \"$COLIMA_DISK_SRC\""
log "Colima started."

# --- Step 5: Wait for Docker, then start containers --------------------------
log "Step 5: Starting Docker containers..."
docker_ready=0
for ((i = 1; i <= 12; i++)); do
  if docker info >/dev/null 2>&1; then
    docker_ready=1
    break
  fi
  log "Waiting for Docker... ($i/12)"
  sleep 5
done
[ "$docker_ready" -eq 1 ] || die "Docker did not become ready within 60 seconds."

cd ~/projects/manacore-monorepo || die "Cannot cd to ~/projects/manacore-monorepo"
docker compose -f docker-compose.macmini.yml up -d 2>&1 | tail -5 | tee -a "$LOG" \
  || die "docker compose up failed; see $LOG"
log "Containers started."

# --- Step 6: Verification ----------------------------------------------------
log "Step 6: Verifying symlink..."
ls -lah "$COLIMA_DISK_SRC"
df -h /Volumes/ManaData | tail -1
df -h / | tail -1

echo ""
echo "Done. Old disk backed up at: ${BACKUP}"
echo "Once you've verified everything works, delete the backup:"
echo "  rm -rf \"${BACKUP}\""
echo ""
echo "Log: $LOG"
|
||||||
Loading…
Add table
Add a link
Reference in a new issue