From 402baf7c7f9d57f21c918755004453b954164123 Mon Sep 17 00:00:00 2001 From: Till JS Date: Tue, 31 Mar 2026 17:43:25 +0200 Subject: [PATCH] feat(monitoring): add uptime monitoring via Blackbox Exporter - scripts/check-status.sh: parallel HTTP check aller mana.how Domains aus cloudflared-config.yml - docker/blackbox/blackbox.yml: Blackbox Exporter Config (http_2xx, http_health Module) - docker-compose.macmini.yml: blackbox-exporter Container (Port 9115, 32MB RAM) - docker/prometheus/prometheus.yml: 4 Scrape-Jobs (blackbox-web, blackbox-api, blackbox-infra, blackbox-gpu) - docker/prometheus/alerts.yml: 5 Alert-Regeln (WebAppDown, APIDown, InfraToolDown, GPUServiceDown, SlowHTTPResponse) - docker/grafana/dashboards/uptime.json: Grafana Uptime-Dashboard mit Status-Tables und Verlauf - package.json: check:status Script Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.macmini.yml | 17 + docker/blackbox/blackbox.yml | 24 + docker/grafana/dashboards/uptime.json | 649 ++++++++++++++++++++++++++ docker/prometheus/alerts.yml | 52 +++ docker/prometheus/prometheus.yml | 105 +++++ package.json | 1 + scripts/check-status.sh | 136 ++++++ 7 files changed, 984 insertions(+) create mode 100644 docker/blackbox/blackbox.yml create mode 100644 docker/grafana/dashboards/uptime.json create mode 100755 scripts/check-status.sh diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml index e14065734..aae48ff3b 100644 --- a/docker-compose.macmini.yml +++ b/docker-compose.macmini.yml @@ -1710,6 +1710,23 @@ services: retries: 3 start_period: 20s + blackbox-exporter: + image: prom/blackbox-exporter:v0.25.0 + container_name: mana-mon-blackbox + restart: always + mem_limit: 32m + command: ["--config.file=/etc/blackbox/blackbox.yml"] + volumes: + - ./docker/blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro + ports: + - "9115:9115" + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9115/"] + interval: 300s + timeout: 10s + retries: 3 + 
start_period: 10s + # ============================================ # Alerting Stack (Ports 9093-9095) # ============================================ diff --git a/docker/blackbox/blackbox.yml b/docker/blackbox/blackbox.yml new file mode 100644 index 000000000..d6719686c --- /dev/null +++ b/docker/blackbox/blackbox.yml @@ -0,0 +1,24 @@ +modules: + # Standard HTTP check: 2xx/3xx = success + http_2xx: + prober: http + timeout: 10s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + valid_status_codes: [200, 201, 204, 301, 302, 303, 307, 308] + method: GET + follow_redirects: true + preferred_ip_protocol: "ip4" + ip_protocol_fallback: true + + # Health endpoint check (expects 200 only) + http_health: + prober: http + timeout: 10s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] + valid_status_codes: [200] + method: GET + follow_redirects: false + preferred_ip_protocol: "ip4" + ip_protocol_fallback: true diff --git a/docker/grafana/dashboards/uptime.json b/docker/grafana/dashboards/uptime.json new file mode 100644 index 000000000..d58083e6e --- /dev/null +++ b/docker/grafana/dashboards/uptime.json @@ -0,0 +1,649 @@ +{ + "title": "ManaCore Uptime", + "uid": "uptime", + "description": "HTTP Uptime aller mana.how Dienste via Blackbox Exporter", + "tags": ["uptime", "blackbox", "http"], + "schemaVersion": 38, + "version": 1, + "refresh": "1m", + "time": { "from": "now-24h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "pluginId": "prometheus", + "label": "Datasource", + "hide": 0, + "current": {} + } + ] + }, + "panels": [ + { + "type": "row", + "id": 1, + "title": "Zusammenfassung", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + }, + { + "type": "stat", + "id": 2, + "title": "Web Apps Online", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "fieldConfig": { + "defaults": 
{ + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 10 }, + { "color": "green", "value": 18 } + ] + }, + "unit": "short", + "mappings": [] + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(probe_success{job=\"blackbox-web\"})", + "refId": "A", + "legendFormat": "Online" + } + ] + }, + { + "type": "stat", + "id": 3, + "title": "APIs Online", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 7 }, + { "color": "green", "value": 9 } + ] + }, + "unit": "short", + "mappings": [] + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(probe_success{job=\"blackbox-api\"})", + "refId": "A", + "legendFormat": "Online" + } + ] + }, + { + "type": "stat", + "id": 4, + "title": "Infra Online", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 4 }, + { "color": "green", 
"value": 5 } + ] + }, + "unit": "short", + "mappings": [] + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(probe_success{job=\"blackbox-infra\"})", + "refId": "A", + "legendFormat": "Online" + } + ] + }, + { + "type": "stat", + "id": 5, + "title": "GPU Services Online", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 2 }, + { "color": "green", "value": 4 } + ] + }, + "unit": "short", + "mappings": [] + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(probe_success{job=\"blackbox-gpu\"})", + "refId": "A", + "legendFormat": "Online" + } + ] + }, + { + "type": "stat", + "id": 6, + "title": "Ø Antwortzeit (Web)", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 2 }, + { "color": "red", "value": 5 } + ] + }, + "unit": "s", + "mappings": [] + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + 
"orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "avg(probe_duration_seconds{job=\"blackbox-web\"})", + "refId": "A", + "legendFormat": "Ø s" + } + ] + }, + { + "type": "row", + "id": 10, + "title": "Web Apps", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 } + }, + { + "type": "table", + "id": 11, + "title": "Web App Status", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 14, "w": 12, "x": 0, "y": 6 }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "displayMode": "color-background" + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "DOWN", "color": "red" }, + "1": { "text": "UP", "color": "green" } + } + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "instance" }, + "properties": [ + { "id": "displayName", "value": "URL" }, + { "id": "custom.width", "value": 260 } + ] + }, + { + "matcher": { "id": "byName", "options": "Value" }, + "properties": [ + { "id": "displayName", "value": "Status" }, + { "id": "custom.width", "value": 80 } + ] + } + ] + }, + "options": { + "sortBy": [{ "displayName": "Status", "desc": false }], + "footer": { "show": false } + }, + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "job": true, "__name__": true }, + "indexByName": { "instance": 0, "Value": 1 } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "probe_success{job=\"blackbox-web\"}", + "instant": true, + "refId": "A", + "legendFormat": 
"{{instance}}" + } + ] + }, + { + "type": "timeseries", + "id": 12, + "title": "Web App Antwortzeiten", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 14, "w": 12, "x": 12, "y": 6 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 10 }, + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 5 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": ["lastNotNull", "mean"] + }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "probe_duration_seconds{job=\"blackbox-web\"}", + "refId": "A", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "row", + "id": 20, + "title": "API Health Endpoints", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 } + }, + { + "type": "table", + "id": 21, + "title": "API Status", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 10, "w": 12, "x": 0, "y": 21 }, + "fieldConfig": { + "defaults": { + "custom": { "align": "left", "displayMode": "color-background" }, + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "DOWN", "color": "red" }, + "1": { "text": "UP", "color": "green" } + } + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "instance" }, + "properties": [ + { "id": "displayName", "value": "Endpoint" }, + { "id": "custom.width", "value": 300 } + ] + }, + { + "matcher": { "id": "byName", "options": "Value" }, + "properties": [ + { "id": "displayName", "value": "Status" }, + { "id": "custom.width", "value": 80 } + ] + } + ] + }, + "options": { + 
"sortBy": [{ "displayName": "Status", "desc": false }], + "footer": { "show": false } + }, + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "job": true }, + "indexByName": { "instance": 0, "Value": 1 } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "probe_success{job=\"blackbox-api\"}", + "instant": true, + "refId": "A", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "timeseries", + "id": 22, + "title": "API Uptime-Verlauf (24h)", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 10, "w": 12, "x": 12, "y": 21 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 20 }, + "unit": "short", + "min": 0, + "max": 1, + "mappings": [ + { "type": "value", "options": { "0": { "text": "DOWN" }, "1": { "text": "UP" } } } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": ["lastNotNull", "mean"] + }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "probe_success{job=\"blackbox-api\"}", + "refId": "A", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "row", + "id": 30, + "title": "Infrastruktur & GPU", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 } + }, + { + "type": "table", + "id": 31, + "title": "Infra-Dienste Status", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 32 }, + "fieldConfig": { + "defaults": { + "custom": { "align": "left", "displayMode": "color-background" }, + 
"mappings": [ + { + "type": "value", + "options": { + "0": { "text": "DOWN", "color": "red" }, + "1": { "text": "UP", "color": "green" } + } + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "instance" }, + "properties": [ + { "id": "displayName", "value": "Dienst" }, + { "id": "custom.width", "value": 220 } + ] + }, + { + "matcher": { "id": "byName", "options": "Value" }, + "properties": [{ "id": "displayName", "value": "Status" }] + } + ] + }, + "options": { + "sortBy": [{ "displayName": "Status", "desc": false }], + "footer": { "show": false } + }, + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "job": true }, + "indexByName": { "instance": 0, "Value": 1 } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "probe_success{job=\"blackbox-infra\"}", + "instant": true, + "refId": "A", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "table", + "id": 32, + "title": "GPU Server Status", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 32 }, + "fieldConfig": { + "defaults": { + "custom": { "align": "left", "displayMode": "color-background" }, + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "DOWN", "color": "red" }, + "1": { "text": "UP", "color": "green" } + } + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "instance" }, + "properties": [ + { "id": "displayName", "value": "Dienst" }, + { "id": "custom.width", "value": 220 } + ] + }, + { + "matcher": { "id": 
"byName", "options": "Value" }, + "properties": [{ "id": "displayName", "value": "Status" }] + } + ] + }, + "options": { + "sortBy": [{ "displayName": "Status", "desc": false }], + "footer": { "show": false } + }, + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "job": true }, + "indexByName": { "instance": 0, "Value": 1 } + } + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "probe_success{job=\"blackbox-gpu\"}", + "instant": true, + "refId": "A", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "timeseries", + "id": 33, + "title": "Alle Dienste — Uptime-Verlauf", + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 32 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10 }, + "unit": "short", + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "avg(probe_success{job=\"blackbox-web\"})", + "refId": "A", + "legendFormat": "Web Apps" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "avg(probe_success{job=\"blackbox-api\"})", + "refId": "B", + "legendFormat": "APIs" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "avg(probe_success{job=\"blackbox-infra\"})", + "refId": "C", + "legendFormat": "Infra" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "avg(probe_success{job=\"blackbox-gpu\"})", + 
"refId": "D",
+          "legendFormat": "GPU"
+        }
+      ]
+    }
+  ]
+}
diff --git a/docker/prometheus/alerts.yml b/docker/prometheus/alerts.yml
index 6e89ec8cf..0cc4ca4cd 100644
--- a/docker/prometheus/alerts.yml
+++ b/docker/prometheus/alerts.yml
@@ -367,6 +367,58 @@ groups:
           summary: "OIDC token endpoint errors"
           description: "OIDC token endpoint is returning 5xx errors. SSO may be affected."
 
+  - name: uptime_alerts
+    rules:
+      # Web app offline: the Blackbox HTTP probe has failed for 2 minutes
+      - alert: WebAppDown
+        expr: probe_success{job="blackbox-web"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Web App offline: {{ $labels.instance }}"
+          description: "{{ $labels.instance }} hat seit 2 Minuten keine gültige HTTP-Antwort zurückgegeben."
+
+      # API health endpoint offline: probe has failed for 1 minute
+      - alert: APIDown
+        expr: probe_success{job="blackbox-api"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "API offline: {{ $labels.instance }}"
+          description: "{{ $labels.instance }} antwortet nicht auf den Health-Endpoint."
+
+      # Infra tool offline (Grafana, Git, etc.): probe has failed for 3 minutes
+      - alert: InfraToolDown
+        expr: probe_success{job="blackbox-infra"} == 0
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Infra-Dienst offline: {{ $labels.instance }}"
+          description: "{{ $labels.instance }} ist seit 3 Minuten nicht erreichbar."
+
+      # GPU server service offline: probe has failed for 5 minutes
+      - alert: GPUServiceDown
+        expr: probe_success{job="blackbox-gpu"} == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "GPU-Dienst offline: {{ $labels.instance }}"
+          description: "{{ $labels.instance }} (GPU-Server) ist seit 5 Minuten nicht erreichbar."
+
+      # Slow HTTP response (> 5s); successful probes only, so timed-out probes are reported by the *Down alerts, not here
+      - alert: SlowHTTPResponse
+        expr: probe_duration_seconds{job=~"blackbox-web|blackbox-api"} > 5 and probe_success == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Langsame HTTP-Antwort: {{ $labels.instance }}"
+          description: "{{ $labels.instance }} antwortet mit {{ $value | humanizeDuration }} (> 5s)."
+ - name: llm_alerts rules: # mana-llm Down diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index 4288b2e53..e418e0e1d 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -242,6 +242,111 @@ scrape_configs: metrics_path: '/metrics' scrape_interval: 30s + # ============================================ + # Blackbox Exporter — HTTP Uptime Probes + # ============================================ + + # Web Apps (SvelteKit frontends) + - job_name: 'blackbox-web' + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - https://mana.how + - https://chat.mana.how + - https://todo.mana.how + - https://calendar.mana.how + - https://contacts.mana.how + - https://clock.mana.how + - https://photos.mana.how + - https://picture.mana.how + - https://storage.mana.how + - https://presi.mana.how + - https://nutriphi.mana.how + - https://planta.mana.how + - https://calc.mana.how + - https://zitare.mana.how + - https://manadeck.mana.how + - https://skilltree.mana.how + - https://mukke.mana.how + - https://citycorners.mana.how + - https://playground.mana.how + - https://whopxl.mana.how + - https://arcade.mana.how + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # API Health Endpoints + - job_name: 'blackbox-api' + metrics_path: /probe + params: + module: [http_health] + static_configs: + - targets: + - https://auth.mana.how/health + - https://api.mana.how/health + - https://chat-api.mana.how/health + - https://todo-api.mana.how/health + - https://calendar-api.mana.how/health + - https://contacts-api.mana.how/health + - https://storage-api.mana.how/health + - https://nutriphi-api.mana.how/health + - https://planta-api.mana.how/health + - https://picture-api.mana.how/health + relabel_configs: + - source_labels: [__address__] + 
target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # Infrastructure & Monitoring Tools + - job_name: 'blackbox-infra' + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - https://git.mana.how + - https://grafana.mana.how + - https://stats.mana.how + - https://glitchtip.mana.how + - https://matrix.mana.how + - https://element.mana.how + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + # GPU Server Services + - job_name: 'blackbox-gpu' + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - https://gpu-ollama.mana.how + - https://gpu-stt.mana.how + - https://gpu-tts.mana.how + - https://gpu-img.mana.how + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + # ============================================ # Pushgateway (deploy metrics, batch jobs) # ============================================ diff --git a/package.json b/package.json index 1b814f14c..591f8732f 100644 --- a/package.json +++ b/package.json @@ -16,6 +16,7 @@ "clean": "turbo run clean", "format": "prettier --config .prettierrc.json --write \"**/*.{ts,tsx,js,jsx,json,md,svelte,astro}\"", "format:check": "prettier --config .prettierrc.json --check \"**/*.{ts,tsx,js,jsx,json,md,svelte,astro}\"", + "check:status": "bash scripts/check-status.sh", "validate:dockerfiles": "node scripts/validate-dockerfiles.mjs", "audit:deps": "node scripts/audit-workspace-deps.mjs", "generate:dockerfiles": "node scripts/generate-dockerfiles.mjs", diff --git a/scripts/check-status.sh b/scripts/check-status.sh new file mode 100755 index 
000000000..cf3903b5f
--- /dev/null
+++ b/scripts/check-status.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+# Requires: bash 3+, curl, grep, awk, sort
+# check-status.sh — checks the reachability of all mana.how services.
+# Reads the hostnames directly from cloudflared-config.yml (single source of truth).
+# Usage: ./scripts/check-status.sh [--internal]
+#   --internal  Reserved for probing internal ports; parsed into INTERNAL but not acted on yet
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+CLOUDFLARED_CONFIG="$REPO_ROOT/cloudflared-config.yml"
+TIMEOUT=8
+INTERNAL=false
+
+[[ "${1:-}" == "--internal" ]] && INTERNAL=true
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+GRAY='\033[0;90m'
+BOLD='\033[1m'
+NC='\033[0m'
+
+# Counters
+ok=0; warn=0; fail=0; total=0
+
+# Temporary directory that collects one result file per parallel probe
+tmpdir=$(mktemp -d)
+trap 'rm -rf "$tmpdir"' EXIT
+# check_url URL LABEL OUTFILE — probe URL once and write "icon|code|label|url" to OUTFILE.
+check_url() {
+  local url="$1"
+  local label="$2"
+  local outfile="$3"
+  # curl prints "000" on DNS/connect/timeout failure but ALSO exits non-zero; under
+  # "set -e" that would kill this background subshell before the result is written.
+  local code
+  code=$(curl -o /dev/null -s -w "%{http_code}" --max-time "$TIMEOUT" "$url" 2>/dev/null) || true
+
+  local icon
+  if [[ "$code" =~ ^(200|201|204|301|302|303|307|308)$ ]]; then
+    icon="✅"
+  elif [[ "$code" =~ ^4 ]]; then
+    # 4xx = server reachable, wrong path (API root returns 404 — use health endpoint)
+    icon="⚠️"
+  elif [[ "$code" == "000" ]]; then
+    icon="⏱"
+  else
+    # 5xx or unknown
+    icon="❌"
+  fi
+
+  printf "%s|%s|%s|%s\n" "$icon" "$code" "$label" "$url" > "$outfile"
+}
+
+echo ""
+echo -e "${BOLD}ManaCore Service Status${NC} $(date '+%Y-%m-%d %H:%M:%S')"
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+
+# Extract every hostname from cloudflared-config.yml (except ssh.mana.how)
+# Bash 3 compatible (no mapfile)
+hostnames=()
+while IFS= read -r host; do
+  hostnames+=("$host")
+done < <(
+  grep "hostname:" "$CLOUDFLARED_CONFIG" \
+    | awk '{print $3}' \
+    | grep -v "^ssh\." \
+    | sort -u
+)
+
+# Probe in parallel; the ${arr[@]+...} form keeps an empty list safe under "set -u" on bash < 4.4
+i=0
+for host in ${hostnames[@]+"${hostnames[@]}"}; do
+  url="https://$host"
+  check_url "$url" "$host" "$tmpdir/$(printf '%04d' "$i")" &
+  i=$((i + 1))
+done
+wait
+
+# Collect results into buckets (zero-padded file names make glob order match the sorted host order)
+declare -a results_ok=()
+declare -a results_warn=()
+declare -a results_fail=()
+declare -a results_4xx=()
+
+for f in "$tmpdir"/*; do
+  [[ -f "$f" ]] || continue
+  IFS='|' read -r icon code label url < "$f"
+  total=$((total + 1))
+  line=$(printf " %s %-38s %s %s" "$icon" "$label" "$code" "$url")
+  if [[ "$icon" == "✅" ]]; then
+    results_ok+=("$line")
+    ok=$((ok + 1))
+  elif [[ "$icon" == "⏱" ]]; then
+    results_warn+=("$line")
+    warn=$((warn + 1))
+  elif [[ "$icon" == "⚠️" ]]; then
+    results_4xx+=("$line")
+  else
+    results_fail+=("$line")
+    fail=$((fail + 1))
+  fi
+done
+
+# Output, one section per non-empty bucket
+if [[ ${#results_ok[@]} -gt 0 ]]; then
+  echo ""
+  echo -e "${GREEN}${BOLD}ONLINE (${#results_ok[@]})${NC}"
+  for line in "${results_ok[@]}"; do echo -e "${GREEN}${line}${NC}"; done
+fi
+
+if [[ ${#results_4xx[@]} -gt 0 ]]; then
+  echo ""
+  echo -e "${YELLOW}${BOLD}ERREICHBAR / 4xx — Root-Pfad nicht definiert (${#results_4xx[@]})${NC}"
+  for line in "${results_4xx[@]}"; do echo -e "${YELLOW}${line}${NC}"; done
+fi
+
+if [[ ${#results_fail[@]} -gt 0 ]]; then
+  echo ""
+  echo -e "${RED}${BOLD}NICHT ERREICHBAR / 5xx (${#results_fail[@]})${NC}"
+  for line in "${results_fail[@]}"; do echo -e "${RED}${line}${NC}"; done
+fi
+
+if [[ ${#results_warn[@]} -gt 0 ]]; then
+  echo ""
+  echo -e "${YELLOW}${BOLD}TIMEOUT / KEIN DNS (${#results_warn[@]})${NC}"
+  for line in "${results_warn[@]}"; do echo -e "${YELLOW}${line}${NC}"; done
+fi
+
+echo ""
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo -e " ${GREEN}✅ Online: $ok${NC} ${RED}❌ Down: $fail${NC} ${YELLOW}⏱ Timeout: $warn${NC} (Gesamt: $total)"
+echo ""