mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 18:01:09 +02:00
feat(monitoring): add uptime monitoring via Blackbox Exporter
- scripts/check-status.sh: parallel HTTP check aller mana.how Domains aus cloudflared-config.yml
- docker/blackbox/blackbox.yml: Blackbox Exporter Config (http_2xx, http_health Module)
- docker-compose.macmini.yml: blackbox-exporter Container (Port 9115, 32MB RAM)
- docker/prometheus/prometheus.yml: 4 Scrape-Jobs (blackbox-web, blackbox-api, blackbox-infra, blackbox-gpu)
- docker/prometheus/alerts.yml: 5 Alert-Regeln (WebAppDown, APIDown, InfraToolDown, GPUServiceDown, SlowHTTPResponse)
- docker/grafana/dashboards/uptime.json: Grafana Uptime-Dashboard mit Status-Tables und Verlauf
- package.json: check:status Script

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
bce533ca8b
commit
402baf7c7f
7 changed files with 984 additions and 0 deletions
|
|
@ -1710,6 +1710,23 @@ services:
|
|||
retries: 3
|
||||
start_period: 20s
|
||||
|
||||
blackbox-exporter:
|
||||
image: prom/blackbox-exporter:v0.25.0
|
||||
container_name: mana-mon-blackbox
|
||||
restart: always
|
||||
mem_limit: 32m
|
||||
command: ["--config.file=/etc/blackbox/blackbox.yml"]
|
||||
volumes:
|
||||
- ./docker/blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro
|
||||
ports:
|
||||
- "9115:9115"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9115/"]
|
||||
interval: 300s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
|
||||
# ============================================
|
||||
# Alerting Stack (Ports 9093-9095)
|
||||
# ============================================
|
||||
|
|
|
|||
24
docker/blackbox/blackbox.yml
Normal file
24
docker/blackbox/blackbox.yml
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
modules:
|
||||
# Standard HTTP check: 2xx/3xx = success
|
||||
http_2xx:
|
||||
prober: http
|
||||
timeout: 10s
|
||||
http:
|
||||
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
|
||||
valid_status_codes: [200, 201, 204, 301, 302, 303, 307, 308]
|
||||
method: GET
|
||||
follow_redirects: true
|
||||
preferred_ip_protocol: "ip4"
|
||||
ip_protocol_fallback: true
|
||||
|
||||
# Health endpoint check (expects 200 only)
|
||||
http_health:
|
||||
prober: http
|
||||
timeout: 10s
|
||||
http:
|
||||
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
|
||||
valid_status_codes: [200]
|
||||
method: GET
|
||||
follow_redirects: false
|
||||
preferred_ip_protocol: "ip4"
|
||||
ip_protocol_fallback: true
|
||||
649
docker/grafana/dashboards/uptime.json
Normal file
649
docker/grafana/dashboards/uptime.json
Normal file
|
|
@ -0,0 +1,649 @@
|
|||
{
|
||||
"title": "ManaCore Uptime",
|
||||
"uid": "uptime",
|
||||
"description": "HTTP Uptime aller mana.how Dienste via Blackbox Exporter",
|
||||
"tags": ["uptime", "blackbox", "http"],
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "1m",
|
||||
"time": { "from": "now-24h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "datasource",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"label": "Datasource",
|
||||
"hide": 0,
|
||||
"current": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"type": "row",
|
||||
"id": 1,
|
||||
"title": "Zusammenfassung",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 2,
|
||||
"title": "Web Apps Online",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "green", "value": 18 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"mappings": []
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "sum(probe_success{job=\"blackbox-web\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "Online"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 3,
|
||||
"title": "APIs Online",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 7 },
|
||||
{ "color": "green", "value": 9 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"mappings": []
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "sum(probe_success{job=\"blackbox-api\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "Online"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 4,
|
||||
"title": "Infra Online",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 4 },
|
||||
{ "color": "green", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"mappings": []
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "sum(probe_success{job=\"blackbox-infra\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "Online"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 5,
|
||||
"title": "GPU Services Online",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 2 },
|
||||
{ "color": "green", "value": 4 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"mappings": []
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "sum(probe_success{job=\"blackbox-gpu\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "Online"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 6,
|
||||
"title": "Ø Antwortzeit (Web)",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 2 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "s",
|
||||
"mappings": []
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "avg(probe_duration_seconds{job=\"blackbox-web\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "Ø s"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "row",
|
||||
"id": 10,
|
||||
"title": "Web Apps",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }
|
||||
},
|
||||
{
|
||||
"type": "table",
|
||||
"id": 11,
|
||||
"title": "Web App Status",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 14, "w": 12, "x": 0, "y": 6 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"align": "left",
|
||||
"displayMode": "color-background"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "color": "red" },
|
||||
"1": { "text": "UP", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "instance" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "URL" },
|
||||
{ "id": "custom.width", "value": 260 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Status" },
|
||||
{ "id": "custom.width", "value": 80 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "displayName": "Status", "desc": false }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true, "job": true, "__name__": true },
|
||||
"indexByName": { "instance": 0, "Value": 1 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "probe_success{job=\"blackbox-web\"}",
|
||||
"instant": true,
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 12,
|
||||
"title": "Web App Antwortzeiten",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 14, "w": 12, "x": 12, "y": 6 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 1, "fillOpacity": 10 },
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": ["lastNotNull", "mean"]
|
||||
},
|
||||
"tooltip": { "mode": "multi" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "probe_duration_seconds{job=\"blackbox-web\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "row",
|
||||
"id": 20,
|
||||
"title": "API Health Endpoints",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
|
||||
},
|
||||
{
|
||||
"type": "table",
|
||||
"id": 21,
|
||||
"title": "API Status",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 21 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "displayMode": "color-background" },
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "color": "red" },
|
||||
"1": { "text": "UP", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "instance" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Endpoint" },
|
||||
{ "id": "custom.width", "value": 300 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Status" },
|
||||
{ "id": "custom.width", "value": 80 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "displayName": "Status", "desc": false }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true, "job": true },
|
||||
"indexByName": { "instance": 0, "Value": 1 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "probe_success{job=\"blackbox-api\"}",
|
||||
"instant": true,
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 22,
|
||||
"title": "API Uptime-Verlauf (24h)",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 21 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20 },
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"mappings": [
|
||||
{ "type": "value", "options": { "0": { "text": "DOWN" }, "1": { "text": "UP" } } }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": ["lastNotNull", "mean"]
|
||||
},
|
||||
"tooltip": { "mode": "multi" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "probe_success{job=\"blackbox-api\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "row",
|
||||
"id": 30,
|
||||
"title": "Infrastruktur & GPU",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }
|
||||
},
|
||||
{
|
||||
"type": "table",
|
||||
"id": 31,
|
||||
"title": "Infra-Dienste Status",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 32 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "displayMode": "color-background" },
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "color": "red" },
|
||||
"1": { "text": "UP", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "instance" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Dienst" },
|
||||
{ "id": "custom.width", "value": 220 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value" },
|
||||
"properties": [{ "id": "displayName", "value": "Status" }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "displayName": "Status", "desc": false }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true, "job": true },
|
||||
"indexByName": { "instance": 0, "Value": 1 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "probe_success{job=\"blackbox-infra\"}",
|
||||
"instant": true,
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "table",
|
||||
"id": 32,
|
||||
"title": "GPU Server Status",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 32 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "displayMode": "color-background" },
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "color": "red" },
|
||||
"1": { "text": "UP", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "instance" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Dienst" },
|
||||
{ "id": "custom.width", "value": 220 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value" },
|
||||
"properties": [{ "id": "displayName", "value": "Status" }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "displayName": "Status", "desc": false }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true, "job": true },
|
||||
"indexByName": { "instance": 0, "Value": 1 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "probe_success{job=\"blackbox-gpu\"}",
|
||||
"instant": true,
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 33,
|
||||
"title": "Alle Dienste — Uptime-Verlauf",
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 32 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 },
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "displayMode": "list", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "avg(probe_success{job=\"blackbox-web\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "Web Apps"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "avg(probe_success{job=\"blackbox-api\"})",
|
||||
"refId": "B",
|
||||
"legendFormat": "APIs"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "avg(probe_success{job=\"blackbox-infra\"})",
|
||||
"refId": "C",
|
||||
"legendFormat": "Infra"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "avg(probe_success{job=\"blackbox-gpu\"})",
|
||||
"refId": "D",
|
||||
"legendFormat": "GPU"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -367,6 +367,58 @@ groups:
|
|||
summary: "OIDC token endpoint errors"
|
||||
description: "OIDC token endpoint is returning 5xx errors. SSO may be affected."
|
||||
|
||||
- name: uptime_alerts
|
||||
rules:
|
||||
# Web App offline (HTTP probe failed)
|
||||
- alert: WebAppDown
|
||||
expr: probe_success{job="blackbox-web"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Web App offline: {{ $labels.instance }}"
|
||||
description: "{{ $labels.instance }} hat seit 2 Minuten keine gültige HTTP-Antwort zurückgegeben."
|
||||
|
||||
# API Health Endpoint offline
|
||||
- alert: APIDown
|
||||
expr: probe_success{job="blackbox-api"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "API offline: {{ $labels.instance }}"
|
||||
description: "{{ $labels.instance }} antwortet nicht auf den Health-Endpoint."
|
||||
|
||||
# Infra Tool offline (Grafana, Git, etc.)
|
||||
- alert: InfraToolDown
|
||||
expr: probe_success{job="blackbox-infra"} == 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Infra-Dienst offline: {{ $labels.instance }}"
|
||||
description: "{{ $labels.instance }} ist seit 3 Minuten nicht erreichbar."
|
||||
|
||||
# GPU Server Service offline
|
||||
- alert: GPUServiceDown
|
||||
expr: probe_success{job="blackbox-gpu"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "GPU-Dienst offline: {{ $labels.instance }}"
|
||||
description: "{{ $labels.instance }} (GPU-Server) ist seit 5 Minuten nicht erreichbar."
|
||||
|
||||
# Slow HTTP response (> 5s)
|
||||
- alert: SlowHTTPResponse
|
||||
expr: probe_duration_seconds{job=~"blackbox-web|blackbox-api"} > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Langsame HTTP-Antwort: {{ $labels.instance }}"
|
||||
description: "{{ $labels.instance }} antwortet mit {{ $value | humanizeDuration }} (> 5s)."
|
||||
|
||||
- name: llm_alerts
|
||||
rules:
|
||||
# mana-llm Down
|
||||
|
|
|
|||
|
|
@ -242,6 +242,111 @@ scrape_configs:
|
|||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
# ============================================
|
||||
# Blackbox Exporter — HTTP Uptime Probes
|
||||
# ============================================
|
||||
|
||||
# Web Apps (SvelteKit frontends)
|
||||
- job_name: 'blackbox-web'
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- https://mana.how
|
||||
- https://chat.mana.how
|
||||
- https://todo.mana.how
|
||||
- https://calendar.mana.how
|
||||
- https://contacts.mana.how
|
||||
- https://clock.mana.how
|
||||
- https://photos.mana.how
|
||||
- https://picture.mana.how
|
||||
- https://storage.mana.how
|
||||
- https://presi.mana.how
|
||||
- https://nutriphi.mana.how
|
||||
- https://planta.mana.how
|
||||
- https://calc.mana.how
|
||||
- https://zitare.mana.how
|
||||
- https://manadeck.mana.how
|
||||
- https://skilltree.mana.how
|
||||
- https://mukke.mana.how
|
||||
- https://citycorners.mana.how
|
||||
- https://playground.mana.how
|
||||
- https://whopxl.mana.how
|
||||
- https://arcade.mana.how
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# API Health Endpoints
|
||||
- job_name: 'blackbox-api'
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_health]
|
||||
static_configs:
|
||||
- targets:
|
||||
- https://auth.mana.how/health
|
||||
- https://api.mana.how/health
|
||||
- https://chat-api.mana.how/health
|
||||
- https://todo-api.mana.how/health
|
||||
- https://calendar-api.mana.how/health
|
||||
- https://contacts-api.mana.how/health
|
||||
- https://storage-api.mana.how/health
|
||||
- https://nutriphi-api.mana.how/health
|
||||
- https://planta-api.mana.how/health
|
||||
- https://picture-api.mana.how/health
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# Infrastructure & Monitoring Tools
|
||||
- job_name: 'blackbox-infra'
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- https://git.mana.how
|
||||
- https://grafana.mana.how
|
||||
- https://stats.mana.how
|
||||
- https://glitchtip.mana.how
|
||||
- https://matrix.mana.how
|
||||
- https://element.mana.how
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# GPU Server Services
|
||||
- job_name: 'blackbox-gpu'
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
static_configs:
|
||||
- targets:
|
||||
- https://gpu-ollama.mana.how
|
||||
- https://gpu-stt.mana.how
|
||||
- https://gpu-tts.mana.how
|
||||
- https://gpu-img.mana.how
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox-exporter:9115
|
||||
|
||||
# ============================================
|
||||
# Pushgateway (deploy metrics, batch jobs)
|
||||
# ============================================
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@
|
|||
"clean": "turbo run clean",
|
||||
"format": "prettier --config .prettierrc.json --write \"**/*.{ts,tsx,js,jsx,json,md,svelte,astro}\"",
|
||||
"format:check": "prettier --config .prettierrc.json --check \"**/*.{ts,tsx,js,jsx,json,md,svelte,astro}\"",
|
||||
"check:status": "bash scripts/check-status.sh",
|
||||
"validate:dockerfiles": "node scripts/validate-dockerfiles.mjs",
|
||||
"audit:deps": "node scripts/audit-workspace-deps.mjs",
|
||||
"generate:dockerfiles": "node scripts/generate-dockerfiles.mjs",
|
||||
|
|
|
|||
136
scripts/check-status.sh
Executable file
136
scripts/check-status.sh
Executable file
|
|
@ -0,0 +1,136 @@
|
|||
#!/usr/bin/env bash
|
||||
# Requires: bash 3+, curl, grep, awk, sort, mktemp
|
||||
# check-status.sh — Prüft die Erreichbarkeit aller mana.how-Dienste
|
||||
# Liest direkt aus cloudflared-config.yml (Single Source of Truth)
|
||||
# Usage: ./scripts/check-status.sh [--internal]
|
||||
# --internal Prüft interne Ports statt externe Domains (NOTE: the flag is parsed into INTERNAL but not yet acted on anywhere in this script)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
CLOUDFLARED_CONFIG="$REPO_ROOT/cloudflared-config.yml"
|
||||
TIMEOUT=8
|
||||
INTERNAL=false
|
||||
|
||||
[[ "${1:-}" == "--internal" ]] && INTERNAL=true
|
||||
|
||||
# Farben
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
GRAY='\033[0;90m'
|
||||
BOLD='\033[1m'
|
||||
NC='\033[0m'
|
||||
|
||||
# Zähler
|
||||
ok=0; warn=0; fail=0; total=0
|
||||
|
||||
# Temporäres Verzeichnis für parallele Ergebnisse
|
||||
tmpdir=$(mktemp -d)
|
||||
trap 'rm -rf "$tmpdir"' EXIT
|
||||
|
||||
# check_url URL LABEL OUTFILE
#
# Probes URL once with curl and writes exactly one result record of the form
#   icon|http_code|label|url
# to OUTFILE. The per-request time limit comes from the global TIMEOUT.
#
# Classification:
#   200/201/204/301/302/303/307/308 -> ✅  (online)
#   4xx                             -> ⚠️  (server reachable, path not served)
#   000                             -> ⏱  (no HTTP response: timeout, DNS, refused)
#   anything else (5xx, unknown)    -> ❌
check_url() {
  local url="$1"
  local label="$2"
  local outfile="$3"

  # curl prints "000" through -w when it got no HTTP response, but it ALSO
  # exits non-zero in that case. The script runs under `set -e` and this
  # function is started in a background subshell, so without `|| true` the
  # subshell would abort on this line and never write its result record:
  # unreachable hosts would silently disappear from the report.
  local code
  code=$(curl -o /dev/null -s -w "%{http_code}" --max-time "$TIMEOUT" "$url" 2>/dev/null) || true
  # If curl printed nothing at all (e.g. it could not be executed), treat it
  # like "no response".
  code="${code:-000}"

  local icon
  if [[ "$code" =~ ^(200|201|204|301|302|303|307|308)$ ]]; then
    icon="✅"
  elif [[ "$code" =~ ^4 ]]; then
    # 4xx = server reachable, wrong path (API root returns 404 — use health endpoint)
    icon="⚠️"
  elif [[ "$code" == "000" ]]; then
    icon="⏱"
  else
    # 5xx or unknown
    icon="❌"
  fi

  printf "%s|%s|%s|%s\n" "$icon" "$code" "$label" "$url" > "$outfile"
}
|
||||
|
||||
echo ""
|
||||
echo -e "${BOLD}ManaCore Service Status${NC} $(date '+%Y-%m-%d %H:%M:%S')"
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
|
||||
# Collect every hostname listed in cloudflared-config.yml (ssh.* is excluded).
# Bash 3 compatible (no mapfile).

# Fail early with a clear message instead of letting grep complain inside the
# process substitution, where its exit status is not seen by `set -e`.
if [[ ! -r "$CLOUDFLARED_CONFIG" ]]; then
  echo "Fehler: $CLOUDFLARED_CONFIG nicht gefunden oder nicht lesbar" >&2
  exit 1
fi

hostnames=()
while IFS= read -r host; do
  # awk prints an empty field for lines that do not have a third column;
  # skip those instead of probing "https://".
  [[ -n "$host" ]] || continue
  hostnames+=("$host")
done < <(
  grep "hostname:" "$CLOUDFLARED_CONFIG" \
    | awk '{print $3}' \
    | grep -v "^ssh\." \
    | sort -u
)

# Expanding an empty array under `set -u` aborts with "unbound variable" on
# bash older than 4.4, so stop here with an explicit error instead.
if [[ ${#hostnames[@]} -eq 0 ]]; then
  echo "Fehler: keine Hostnamen in $CLOUDFLARED_CONFIG gefunden" >&2
  exit 1
fi
|
||||
|
||||
# Parallel prüfen
|
||||
i=0
|
||||
for host in "${hostnames[@]}"; do
|
||||
url="https://$host"
|
||||
check_url "$url" "$host" "$tmpdir/$i" &
|
||||
i=$((i + 1))
|
||||
done
|
||||
wait
|
||||
|
||||
# Ergebnisse sammeln und sortieren
|
||||
declare -a results_ok=()
|
||||
declare -a results_warn=()
|
||||
declare -a results_fail=()
|
||||
declare -a results_4xx=()
|
||||
|
||||
for f in "$tmpdir"/*; do
|
||||
[[ -f "$f" ]] || continue
|
||||
IFS='|' read -r icon code label url < "$f"
|
||||
total=$((total + 1))
|
||||
line=$(printf " %s %-38s %s %s" "$icon" "$label" "$code" "$url")
|
||||
if [[ "$icon" == "✅" ]]; then
|
||||
results_ok+=("$line")
|
||||
ok=$((ok + 1))
|
||||
elif [[ "$icon" == "⏱" ]]; then
|
||||
results_warn+=("$line")
|
||||
warn=$((warn + 1))
|
||||
elif [[ "$icon" == "⚠️" ]]; then
|
||||
results_4xx+=("$line")
|
||||
else
|
||||
results_fail+=("$line")
|
||||
fail=$((fail + 1))
|
||||
fi
|
||||
done
|
||||
|
||||
# Output: one coloured section per result category (only printed when the
# category is non-empty), followed by a one-line summary.
if [[ ${#results_ok[@]} -gt 0 ]]; then
  echo ""
  echo -e "${GREEN}${BOLD}ONLINE (${#results_ok[@]})${NC}"
  for line in "${results_ok[@]}"; do echo -e "${GREEN}${line}${NC}"; done
fi

if [[ ${#results_4xx[@]} -gt 0 ]]; then
  echo ""
  echo -e "${YELLOW}${BOLD}ERREICHBAR / 4xx — Root-Pfad nicht definiert (${#results_4xx[@]})${NC}"
  for line in "${results_4xx[@]}"; do echo -e "${YELLOW}${line}${NC}"; done
fi

if [[ ${#results_fail[@]} -gt 0 ]]; then
  echo ""
  echo -e "${RED}${BOLD}NICHT ERREICHBAR / 5xx (${#results_fail[@]})${NC}"
  for line in "${results_fail[@]}"; do echo -e "${RED}${line}${NC}"; done
fi

if [[ ${#results_warn[@]} -gt 0 ]]; then
  echo ""
  echo -e "${YELLOW}${BOLD}TIMEOUT / KEIN DNS (${#results_warn[@]})${NC}"
  for line in "${results_warn[@]}"; do echo -e "${YELLOW}${line}${NC}"; done
fi

# Summary. The 4xx category is included so that the printed counts add up to
# the total; it has no dedicated counter, so the array length is used.
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo -e " ${GREEN}✅ Online: $ok${NC} ${YELLOW}⚠️ 4xx: ${#results_4xx[@]}${NC} ${RED}❌ Down: $fail${NC} ${YELLOW}⏱ Timeout: $warn${NC} (Gesamt: $total)"
echo ""
|
||||
Loading…
Add table
Add a link
Reference in a new issue