mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-22 16:26:43 +02:00
feat(monitoring): add GlitchTip health check and disk space monitoring
- Add GlitchTip to health-check.sh monitoring endpoints
- Add native disk space checks for / and /Volumes/ManaData with 80%/90% thresholds
- Extend Prometheus disk alerts to include /host_mnt/Volumes/ManaData mountpoint
- Add ManaData disk usage gauge to Grafana system-overview dashboard

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a12ec68fc2
commit
c8de944c8d
3 changed files with 81 additions and 7 deletions
|
|
@ -36,7 +36,7 @@
|
||||||
},
|
},
|
||||||
"overrides": []
|
"overrides": []
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 },
|
"gridPos": { "h": 5, "w": 3, "x": 0, "y": 1 },
|
||||||
"id": 21,
|
"id": 21,
|
||||||
"options": {
|
"options": {
|
||||||
"orientation": "auto",
|
"orientation": "auto",
|
||||||
|
|
@ -79,7 +79,7 @@
|
||||||
},
|
},
|
||||||
"overrides": []
|
"overrides": []
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 },
|
"gridPos": { "h": 5, "w": 3, "x": 3, "y": 1 },
|
||||||
"id": 22,
|
"id": 22,
|
||||||
"options": {
|
"options": {
|
||||||
"orientation": "auto",
|
"orientation": "auto",
|
||||||
|
|
@ -122,7 +122,7 @@
|
||||||
},
|
},
|
||||||
"overrides": []
|
"overrides": []
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 },
|
"gridPos": { "h": 5, "w": 3, "x": 6, "y": 1 },
|
||||||
"id": 23,
|
"id": 23,
|
||||||
"options": {
|
"options": {
|
||||||
"orientation": "auto",
|
"orientation": "auto",
|
||||||
|
|
@ -145,6 +145,49 @@
|
||||||
"title": "Disk Usage (/)",
|
"title": "Disk Usage (/)",
|
||||||
"type": "gauge"
|
"type": "gauge"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"mappings": [],
|
||||||
|
"max": 100,
|
||||||
|
"min": 0,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 3, "x": 9, "y": 1 },
|
||||||
|
"id": 30,
|
||||||
|
"options": {
|
||||||
|
"orientation": "auto",
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": ["lastNotNull"],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true
|
||||||
|
},
|
||||||
|
"pluginVersion": "10.0.0",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||||
|
"expr": "100 - ((node_filesystem_avail_bytes{mountpoint=~\"/host_mnt/Volumes/ManaData\"} / node_filesystem_size_bytes{mountpoint=~\"/host_mnt/Volumes/ManaData\"}) * 100)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Disk Usage (ManaData)",
|
||||||
|
"type": "gauge"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
|
|
|
||||||
|
|
@ -122,8 +122,8 @@ groups:
|
||||||
# High Disk Usage (> 80%)
|
# High Disk Usage (> 80%)
|
||||||
- alert: HighDiskUsage
|
- alert: HighDiskUsage
|
||||||
expr: |
|
expr: |
|
||||||
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"}
|
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"}
|
||||||
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 80
|
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"})) * 100 > 80
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -134,8 +134,8 @@ groups:
|
||||||
# Very High Disk Usage (> 90%)
|
# Very High Disk Usage (> 90%)
|
||||||
- alert: VeryHighDiskUsage
|
- alert: VeryHighDiskUsage
|
||||||
expr: |
|
expr: |
|
||||||
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"}
|
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"}
|
||||||
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 90
|
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/|/host_mnt/Volumes/ManaData"})) * 100 > 90
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
|
||||||
|
|
@ -273,6 +273,7 @@ echo ""
|
||||||
echo "Monitoring:"
|
echo "Monitoring:"
|
||||||
check_service "Grafana" "http://localhost:8000/api/health"
|
check_service "Grafana" "http://localhost:8000/api/health"
|
||||||
check_service "Umami" "http://localhost:8010/api/heartbeat"
|
check_service "Umami" "http://localhost:8010/api/heartbeat"
|
||||||
|
check_service "GlitchTip" "http://localhost:8020/_health/"
|
||||||
check_service "VictoriaMetrics" "http://localhost:9090/health"
|
check_service "VictoriaMetrics" "http://localhost:9090/health"
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
|
|
@ -281,6 +282,36 @@ check_service "vmalert" "http://localhost:8880/health"
|
||||||
check_service "Alertmanager" "http://localhost:9093/-/healthy"
|
check_service "Alertmanager" "http://localhost:9093/-/healthy"
|
||||||
check_service "Alert Notifier" "http://localhost:9095/health"
|
check_service "Alert Notifier" "http://localhost:9095/health"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "Disk Space:"
|
||||||
|
#######################################
# Report how full the filesystem holding a path is, and record a failure
# when usage reaches the warning or critical threshold.
# Globals:
#   RED, YELLOW, GREEN, NC - colour escape sequences (read)
#   FAILURES               - array of failure descriptions (appended)
# Arguments:
#   $1 - display name used in the status line
#   $2 - directory whose filesystem is inspected
#   $3 - warning threshold in percent (default 80)
#   $4 - critical threshold in percent (default 90)
# Outputs:
#   One status line on stdout: [SKIP], [WARN], [CRIT] or [OK].
# Returns:
#   0 always; problems are reported through FAILURES.
#######################################
check_disk() {
    local name=$1
    local path=$2
    local warn_pct=${3:-80}
    local crit_pct=${4:-90}

    if [ ! -d "$path" ]; then
        echo -e " ${YELLOW}[SKIP]${NC} $name ($path not found)"
        return 0
    fi

    # Query df once. -P forces the POSIX one-record-per-line layout so a long
    # device name cannot wrap the record; -h keeps the free space readable.
    # The capacity column is located by its "NN%" shape rather than by a fixed
    # position, so device names containing spaces do not shift the result; the
    # field just before it is the available space.
    local df_summary usage_pct avail
    df_summary=$(df -Ph -- "$path" 2>/dev/null | awk '
        NR > 1 {
            for (i = 2; i <= NF; i++) {
                if ($i ~ /^[0-9]+%$/) { pct = $i; free = $(i - 1); break }
            }
        }
        END { sub(/%/, "", pct); print pct, free }
    ') || df_summary=""
    # Explicit IFS so the split still works if the caller changed IFS.
    IFS=' ' read -r usage_pct avail <<< "$df_summary"

    # A failed or unparsable df must not fall through to the [OK] branch.
    case "$usage_pct" in
        '' | *[!0-9]*)
            echo -e " ${YELLOW}[WARN]${NC} $name: unable to determine disk usage for $path"
            FAILURES+=("Disk $name: usage unknown")
            return 0
            ;;
    esac

    if [ "$usage_pct" -ge "$crit_pct" ]; then
        echo -e " ${RED}[CRIT]${NC} $name: ${usage_pct}% used ($avail free)"
        FAILURES+=("Disk $name: ${usage_pct}% (critical)")
    elif [ "$usage_pct" -ge "$warn_pct" ]; then
        echo -e " ${YELLOW}[WARN]${NC} $name: ${usage_pct}% used ($avail free)"
        FAILURES+=("Disk $name: ${usage_pct}% (warning)")
    else
        echo -e " ${GREEN}[OK]${NC} $name: ${usage_pct}% used ($avail free)"
    fi
}
|
||||||
|
|
||||||
|
check_disk "System (/)" "/"
|
||||||
|
check_disk "ManaData" "/Volumes/ManaData"
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Cloudflare Tunnel:"
|
echo "Cloudflare Tunnel:"
|
||||||
if pgrep -x "cloudflared" >/dev/null; then
|
if pgrep -x "cloudflared" >/dev/null; then
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue