feat(monitoring): add GlitchTip health check and disk space monitoring

- Add GlitchTip to health-check.sh monitoring endpoints
- Add native disk space checks for / and /Volumes/ManaData with 80%/90% thresholds
- Extend Prometheus disk alerts to include /host_mnt/Volumes/ManaData mountpoint
- Add ManaData disk usage gauge to Grafana system-overview dashboard

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-03-23 09:33:09 +01:00
parent a12ec68fc2
commit c8de944c8d
3 changed files with 81 additions and 7 deletions

View file

@ -36,7 +36,7 @@
},
"overrides": []
},
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 },
"gridPos": { "h": 5, "w": 3, "x": 0, "y": 1 },
"id": 21,
"options": {
"orientation": "auto",
@ -79,7 +79,7 @@
},
"overrides": []
},
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 },
"gridPos": { "h": 5, "w": 3, "x": 3, "y": 1 },
"id": 22,
"options": {
"orientation": "auto",
@ -122,7 +122,7 @@
},
"overrides": []
},
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 },
"gridPos": { "h": 5, "w": 3, "x": 6, "y": 1 },
"id": 23,
"options": {
"orientation": "auto",
@ -145,6 +145,49 @@
"title": "Disk Usage (/)",
"type": "gauge"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": { "h": 5, "w": 3, "x": 9, "y": 1 },
"id": 30,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "10.0.0",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "100 - ((node_filesystem_avail_bytes{mountpoint=~\"/host_mnt/Volumes/ManaData\"} / node_filesystem_size_bytes{mountpoint=~\"/host_mnt/Volumes/ManaData\"}) * 100)",
"refId": "A"
}
],
"title": "Disk Usage (ManaData)",
"type": "gauge"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {

View file

@ -122,8 +122,8 @@ groups:
# High Disk Usage (> 80%)
- alert: HighDiskUsage
expr: |
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"}
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 80
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"}
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"})) * 100 > 80
for: 10m
labels:
severity: warning
@ -134,8 +134,8 @@ groups:
# Very High Disk Usage (> 90%)
- alert: VeryHighDiskUsage
expr: |
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"}
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 90
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"}
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"})) * 100 > 90
for: 5m
labels:
severity: critical

View file

@ -273,6 +273,7 @@ echo ""
echo "Monitoring:"
check_service "Grafana" "http://localhost:8000/api/health"
check_service "Umami" "http://localhost:8010/api/heartbeat"
check_service "GlitchTip" "http://localhost:8020/_health/"
check_service "VictoriaMetrics" "http://localhost:9090/health"
echo ""
@ -281,6 +282,36 @@ check_service "vmalert" "http://localhost:8880/health"
check_service "Alertmanager" "http://localhost:9093/-/healthy"
check_service "Alert Notifier" "http://localhost:9095/health"
echo ""
echo "Disk Space:"
#######################################
# Print a one-line disk usage status for the filesystem that holds a
# directory and record a failure when usage crosses a threshold or cannot
# be determined.
# Globals:
#   RED, YELLOW, GREEN, NC (read)  - colour sequences interpolated into
#                                    the status line printed by echo -e
#   FAILURES (appended)            - array; one entry is added per
#                                    warning, critical or unreadable result
# Arguments:
#   $1 - display name used in the status line
#   $2 - directory whose filesystem is inspected
#   $3 - warning threshold in percent (default 80); assumed to be an integer
#   $4 - critical threshold in percent (default 90); assumed to be an integer
# Outputs:
#   Exactly one line on stdout tagged [SKIP], [OK], [WARN], [CRIT] or [FAIL].
# Returns:
#   0 in every handled case; problems are reported through FAILURES.
#######################################
check_disk() {
    local name=$1
    local path=$2
    local warn_pct=${3:-80}
    local crit_pct=${4:-90}

    # A missing directory (e.g. an unmounted volume) is skipped, not failed.
    if [ ! -d "$path" ]; then
        echo -e "  ${YELLOW}[SKIP]${NC} $name ($path not found)"
        return 0
    fi

    # Declare first, assign second: `local v=$(cmd)` would hide cmd's status.
    local usage_pct avail

    # -P forces the single-line POSIX layout, so a long device name cannot
    # wrap and shift the capacity out of column 5. Only the trailing '%' of
    # that column is stripped.
    usage_pct=$(df -P "$path" | tail -1 | awk '{ sub(/%$/, "", $5); print $5 }') \
        || usage_pct=""
    avail=$(df -h "$path" | tail -1 | awk '{ print $4 }') || avail="?"

    # Without this guard an empty or non-numeric value makes both integer
    # tests below error out and fall through to the [OK] branch.
    case "$usage_pct" in
        '' | *[!0-9]*)
            echo -e "  ${RED}[FAIL]${NC} $name: could not determine disk usage for $path"
            FAILURES+=("Disk $name: usage unknown")
            return 0
            ;;
    esac

    if [ "$usage_pct" -ge "$crit_pct" ]; then
        echo -e "  ${RED}[CRIT]${NC} $name: ${usage_pct}% used ($avail free)"
        FAILURES+=("Disk $name: ${usage_pct}% (critical)")
    elif [ "$usage_pct" -ge "$warn_pct" ]; then
        echo -e "  ${YELLOW}[WARN]${NC} $name: ${usage_pct}% used ($avail free)"
        FAILURES+=("Disk $name: ${usage_pct}% (warning)")
    else
        echo -e "  ${GREEN}[OK]${NC} $name: ${usage_pct}% used ($avail free)"
    fi
}
check_disk "System (/)" "/"
check_disk "ManaData" "/Volumes/ManaData"
echo ""
echo "Cloudflare Tunnel:"
if pgrep -x "cloudflared" >/dev/null; then