diff --git a/docker/grafana/dashboards/system-overview.json b/docker/grafana/dashboards/system-overview.json
index 1492347f5..7f1745865 100644
--- a/docker/grafana/dashboards/system-overview.json
+++ b/docker/grafana/dashboards/system-overview.json
@@ -36,7 +36,7 @@
         },
         "overrides": []
       },
-      "gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 },
+      "gridPos": { "h": 5, "w": 3, "x": 0, "y": 1 },
       "id": 21,
       "options": {
         "orientation": "auto",
@@ -79,7 +79,7 @@
         },
         "overrides": []
       },
-      "gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 },
+      "gridPos": { "h": 5, "w": 3, "x": 3, "y": 1 },
       "id": 22,
       "options": {
         "orientation": "auto",
@@ -122,7 +122,7 @@
         },
         "overrides": []
       },
-      "gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 },
+      "gridPos": { "h": 5, "w": 3, "x": 6, "y": 1 },
       "id": 23,
       "options": {
         "orientation": "auto",
@@ -145,6 +145,49 @@
       "title": "Disk Usage (/)",
       "type": "gauge"
     },
+    {
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "thresholds" },
+          "mappings": [],
+          "max": 100,
+          "min": 0,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 70 },
+              { "color": "red", "value": 85 }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "gridPos": { "h": 5, "w": 3, "x": 9, "y": 1 },
+      "id": 30,
+      "options": {
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "showThresholdLabels": false,
+        "showThresholdMarkers": true
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${datasource}" },
+          "expr": "100 - ((node_filesystem_avail_bytes{mountpoint=~\"/host_mnt/Volumes/ManaData\"} / node_filesystem_size_bytes{mountpoint=~\"/host_mnt/Volumes/ManaData\"}) * 100)",
+          "refId": "A"
+        }
+      ],
+      "title": "Disk Usage (ManaData)",
+      "type": "gauge"
+    },
     {
       "datasource": { "type": "prometheus", "uid": "${datasource}" },
       "fieldConfig": {
diff --git a/docker/prometheus/alerts.yml b/docker/prometheus/alerts.yml
index 18639b03a..0b7266b22 100644
--- a/docker/prometheus/alerts.yml
+++ b/docker/prometheus/alerts.yml
@@ -122,8 +122,8 @@ groups:
       # High Disk Usage (> 80%)
       - alert: HighDiskUsage
         expr: |
-          (1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"}
-          / node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 80
+          (1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"}
+          / node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"})) * 100 > 80
         for: 10m
         labels:
           severity: warning
@@ -134,8 +134,8 @@ groups:
       # Very High Disk Usage (> 90%)
       - alert: VeryHighDiskUsage
         expr: |
-          (1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"}
-          / node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 90
+          (1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"}
+          / node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"})) * 100 > 90
         for: 5m
         labels:
           severity: critical
diff --git a/scripts/mac-mini/health-check.sh b/scripts/mac-mini/health-check.sh
index a4342df73..539640c6a 100755
--- a/scripts/mac-mini/health-check.sh
+++ b/scripts/mac-mini/health-check.sh
@@ -273,6 +273,7 @@ echo ""
 echo "Monitoring:"
 check_service "Grafana" "http://localhost:8000/api/health"
 check_service "Umami" "http://localhost:8010/api/heartbeat"
+check_service "GlitchTip" "http://localhost:8020/_health/"
 check_service "VictoriaMetrics" "http://localhost:9090/health"
 
 echo ""
@@ -281,6 +282,52 @@ check_service "vmalert" "http://localhost:8880/health"
 check_service "Alertmanager" "http://localhost:9093/-/healthy"
 check_service "Alert Notifier" "http://localhost:9095/health"
 
+echo ""
+echo "Disk Space:"
+# check_disk NAME PATH [WARN_PCT] [CRIT_PCT]
+# Prints the usage of the filesystem holding PATH and appends an entry to
+# FAILURES when usage reaches WARN_PCT (default 80) or CRIT_PCT (default 90).
+# A PATH that is not a directory is reported as [SKIP] and is not a failure.
+check_disk() {
+    local name=$1
+    local path=$2
+    local warn_pct=${3:-80}
+    local crit_pct=${4:-90}
+
+    if [ ! -d "$path" ]; then
+        echo -e " ${YELLOW}[SKIP]${NC} $name ($path not found)"
+        return 0
+    fi
+
+    # -P requests the POSIX output format, where column 5 is the capacity;
+    # df's own stderr is dropped because a failure is reported below.
+    local usage_pct=$(df -P "$path" 2>/dev/null | tail -n 1 | awk '{gsub(/%/,""); print $5}')
+    local avail=$(df -h "$path" 2>/dev/null | tail -n 1 | awk '{print $4}')
+
+    # An empty or non-numeric value would make the integer tests below error
+    # out and fall through to the [OK] branch, so report it explicitly.
+    case "$usage_pct" in
+        ''|*[!0-9]*)
+            echo -e " ${YELLOW}[WARN]${NC} $name: unable to read disk usage for $path"
+            FAILURES+=("Disk $name: usage unknown")
+            return 0
+            ;;
+    esac
+
+    if [ "$usage_pct" -ge "$crit_pct" ]; then
+        echo -e " ${RED}[CRIT]${NC} $name: ${usage_pct}% used ($avail free)"
+        FAILURES+=("Disk $name: ${usage_pct}% (critical)")
+    elif [ "$usage_pct" -ge "$warn_pct" ]; then
+        echo -e " ${YELLOW}[WARN]${NC} $name: ${usage_pct}% used ($avail free)"
+        FAILURES+=("Disk $name: ${usage_pct}% (warning)")
+    else
+        echo -e " ${GREEN}[OK]${NC} $name: ${usage_pct}% used ($avail free)"
+    fi
+}
+
+check_disk "System (/)" "/"
+check_disk "ManaData" "/Volumes/ManaData"
+
 echo ""
 echo "Cloudflare Tunnel:"
 if pgrep -x "cloudflared" >/dev/null; then