mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 21:01:08 +02:00
feat(monitoring): add GlitchTip health check and disk space monitoring
- Add GlitchTip to health-check.sh monitoring endpoints
- Add native disk space checks for / and /Volumes/ManaData with 80%/90% thresholds
- Extend Prometheus disk alerts to include /host_mnt/Volumes/ManaData mountpoint
- Add ManaData disk usage gauge to Grafana system-overview dashboard

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a12ec68fc2
commit
c8de944c8d
3 changed files with 81 additions and 7 deletions
|
|
@ -36,7 +36,7 @@
|
|||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 },
|
||||
"gridPos": { "h": 5, "w": 3, "x": 0, "y": 1 },
|
||||
"id": 21,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
|
|
@ -79,7 +79,7 @@
|
|||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 },
|
||||
"gridPos": { "h": 5, "w": 3, "x": 3, "y": 1 },
|
||||
"id": 22,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
|
|
@ -122,7 +122,7 @@
|
|||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 },
|
||||
"gridPos": { "h": 5, "w": 3, "x": 6, "y": 1 },
|
||||
"id": 23,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
|
|
@ -145,6 +145,49 @@
|
|||
"title": "Disk Usage (/)",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 3, "x": 9, "y": 1 },
|
||||
"id": 30,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{mountpoint=~\"/host_mnt/Volumes/ManaData\"} / node_filesystem_size_bytes{mountpoint=~\"/host_mnt/Volumes/ManaData\"}) * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Disk Usage (ManaData)",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"fieldConfig": {
|
||||
|
|
|
|||
|
|
@ -122,8 +122,8 @@ groups:
|
|||
# High Disk Usage (> 80%)
|
||||
- alert: HighDiskUsage
|
||||
expr: |
|
||||
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"}
|
||||
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 80
|
||||
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"}
|
||||
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"})) * 100 > 80
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -134,8 +134,8 @@ groups:
|
|||
# Very High Disk Usage (> 90%)
|
||||
- alert: VeryHighDiskUsage
|
||||
expr: |
|
||||
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"}
|
||||
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 90
|
||||
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"}
|
||||
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"})) * 100 > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
|
|
@ -273,6 +273,7 @@ echo ""
|
|||
echo "Monitoring:"
|
||||
check_service "Grafana" "http://localhost:8000/api/health"
|
||||
check_service "Umami" "http://localhost:8010/api/heartbeat"
|
||||
check_service "GlitchTip" "http://localhost:8020/_health/"
|
||||
check_service "VictoriaMetrics" "http://localhost:9090/health"
|
||||
|
||||
echo ""
|
||||
|
|
@ -281,6 +282,36 @@ check_service "vmalert" "http://localhost:8880/health"
|
|||
check_service "Alertmanager" "http://localhost:9093/-/healthy"
|
||||
check_service "Alert Notifier" "http://localhost:9095/health"
|
||||
|
||||
echo ""
|
||||
echo "Disk Space:"
|
||||
#######################################
# Report how full the filesystem behind a directory is and record
# threshold breaches.
# Globals:
#   RED, YELLOW, GREEN, NC (read) - colour sequences interpolated into
#                                   the `echo -e` status lines
#   FAILURES (appended)           - array collecting failure summaries
# Arguments:
#   $1 - display name used in the status line
#   $2 - directory whose filesystem is inspected
#   $3 - warning threshold, percent used (default 80)
#   $4 - critical threshold, percent used (default 90)
# Outputs:
#   One status line on stdout: [SKIP], [OK], [WARN] or [CRIT].
# Returns:
#   Always 0; problems are reported through FAILURES, not the exit code.
#######################################
check_disk() {
  local name=$1
  local path=$2
  local warn_pct=${3:-80}
  local crit_pct=${4:-90}

  if [ ! -d "$path" ]; then
    echo -e " ${YELLOW}[SKIP]${NC} $name ($path not found)"
    return 0
  fi

  # Declared separately from the assignments: `local v=$(cmd)` would hide
  # the command's exit status behind that of `local`.
  local usage_pct avail

  # -P keeps each filesystem on a single line, so the capacity column stays
  # in field 5 even when the device name is long. NR > 1 skips the header.
  usage_pct=$(df -P "$path" \
    | awk 'NR > 1 { v = $5 } END { sub(/%/, "", v); print v }')
  # Human-readable free space is cosmetic only; fall back to "?" if missing.
  avail=$(df -h "$path" | awk 'NR > 1 { v = $4 } END { print v }')
  avail=${avail:-?}

  # Without this guard an empty or non-numeric value makes both `-ge` tests
  # error out and fall through to the [OK] branch, i.e. a disk that could not
  # be measured would be reported as healthy.
  case "$usage_pct" in
    '' | *[!0-9]*)
      echo -e " ${YELLOW}[WARN]${NC} $name: unable to determine disk usage for $path"
      FAILURES+=("Disk $name: usage unknown (df failed)")
      return 0
      ;;
  esac

  if [ "$usage_pct" -ge "$crit_pct" ]; then
    echo -e " ${RED}[CRIT]${NC} $name: ${usage_pct}% used ($avail free)"
    FAILURES+=("Disk $name: ${usage_pct}% (critical)")
  elif [ "$usage_pct" -ge "$warn_pct" ]; then
    echo -e " ${YELLOW}[WARN]${NC} $name: ${usage_pct}% used ($avail free)"
    FAILURES+=("Disk $name: ${usage_pct}% (warning)")
  else
    echo -e " ${GREEN}[OK]${NC} $name: ${usage_pct}% used ($avail free)"
  fi
}
|
||||
|
||||
check_disk "System (/)" "/"
|
||||
check_disk "ManaData" "/Volumes/ManaData"
|
||||
|
||||
echo ""
|
||||
echo "Cloudflare Tunnel:"
|
||||
if pgrep -x "cloudflared" >/dev/null; then
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue