feat(monitoring): disk metrics via Pushgateway, Loki in Master Overview, Colima move script

- check-disk-space.sh now pushes mac_disk_used_percent + mac_colima_disk_used_gb
  to Pushgateway every hour so vmalert can alert on real macOS disk usage
- alerts.yml: replace broken node-exporter disk alerts with Pushgateway-based ones
- master-overview.json: add "Recent Errors (Loki)" section with live error log
  stream, error rate timeseries and top error sources barchart
- move-colima-to-external-ssd.sh: guided script to move 200GB Colima VM
  datadisk from internal SSD to /Volumes/ManaData (3.6TB external SSD)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-03-30 20:03:33 +02:00
parent be1096ec85
commit 4e370911e8
3 changed files with 244 additions and 4 deletions

View file

@@ -1277,6 +1277,91 @@
],
"title": "New This Month",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 99 },
"id": 900,
"panels": [],
"title": "Recent Errors (Loki)",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "loki" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 100 },
"id": 901,
"title": "Errors across all services (last 30 min)",
"type": "logs",
"options": {
"showTime": true,
"showLabels": false,
"showCommonLabels": false,
"wrapLogMessage": false,
"prettifyLogMessage": false,
"enableLogDetails": true,
"dedupStrategy": "none",
"sortOrder": "Descending"
},
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "{tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\" | tier != \"other\"",
"refId": "A"
}
],
"links": [
{
"title": "Open Logs Explorer",
"url": "/d/logs-explorer",
"targetBlank": false
}
]
},
{
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "bars",
"fillOpacity": 70,
"stacking": { "group": "A", "mode": "normal" }
}
}
},
"gridPos": { "h": 5, "w": 12, "x": 0, "y": 108 },
"id": 902,
"title": "Error Rate by Service (last 1h)",
"type": "timeseries",
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "sum by (service) (count_over_time({tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\" [$__interval]))",
"refId": "A"
}
]
},
{
"datasource": { "type": "loki", "uid": "loki" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "displayMode": "gradient", "showValue": "auto" }
}
},
"gridPos": { "h": 5, "w": 12, "x": 12, "y": 108 },
"id": 903,
"title": "Top Error Sources (last 1h)",
"type": "barchart",
"options": { "xTickLabelRotation": -45 },
"targets": [
{
"datasource": { "type": "loki", "uid": "loki" },
"expr": "topk(8, sum by (service) (count_over_time({tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\" [1h])))",
"refId": "A",
"instant": true
}
]
}
],
"refresh": "30s",