From 957060ca55a33e0646de4cc748ce47c6af327d11 Mon Sep 17 00:00:00 2001 From: Till JS Date: Sat, 11 Apr 2026 16:11:01 +0200 Subject: [PATCH] feat(monitoring): add mana-geocoding + Pelias to prod compose, Prometheus, Grafana, and status.mana.how MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production deployment + observability for the self-hosted geocoding stack: **docker-compose.macmini.yml** - New mana-geocoding container (port 3018, internal-only — no traefik labels, no Cloudflare route). Uses host.docker.internal to reach the Pelias API on the host's pelias compose stack. Dockerfile added under services/mana-geocoding/ using the same Bun/Hono pattern as mana-events. **Prometheus** - New blackbox-internal job probing mana-geocoding:3018/health, the Pelias API on host.docker.internal:4000/v1/status, and Elasticsearch at host.docker.internal:9200/_cluster/health. Kept separate from blackbox-api which is reserved for public HTTPS endpoints. **status.mana.how (generate-status-page.sh)** - Include blackbox-internal in the metric query and add an "Interne Dienste" section with its own summary card, right between the API and Infrastruktur sections. Summary grid goes from 4 to 5 columns with a 900px breakpoint. - friendly_name() now handles http:// URLs and rewrites container-name hosts like mana-geocoding:3018/health → "Mana Geocoding", host.docker.internal:4000 → "Pelias API", host.docker.internal:9200 → "Pelias Elasticsearch". 
**Grafana uptime dashboard** - Add an "Internal" series to the "Alle Dienste — Uptime-Verlauf" panel - New "Interne Dienste Status" table panel showing per-instance up/down - New "Geocoding Ø Latenz" stat panel for probe_duration_seconds Co-Authored-By: Claude Opus 4.6 (1M context) --- docker-compose.macmini.yml | 34 + docker/grafana/dashboards/uptime.json | 989 ++++++++++++++++++++++---- docker/prometheus/prometheus.yml | 21 + scripts/generate-status-page.sh | 44 +- services/mana-geocoding/Dockerfile | 16 + 5 files changed, 949 insertions(+), 155 deletions(-) create mode 100644 services/mana-geocoding/Dockerfile diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml index 3dc2f5389..f650e0af2 100644 --- a/docker-compose.macmini.yml +++ b/docker-compose.macmini.yml @@ -362,6 +362,40 @@ services: - "traefik.http.routers.mana-events.tls=true" - "traefik.http.services.mana-events.loadbalancer.server.port=3065" + # ─── Geocoding ─────────────────────────────────────────── + # Thin Hono wrapper in front of a self-hosted Pelias stack. + # Pelias itself (elasticsearch + api + libpostal) runs from a separate + # compose file in services/mana-geocoding/pelias/ — see + # services/mana-geocoding/CLAUDE.md for the initial import procedure. + # Internal-only: no traefik labels, not exposed via Cloudflare. + mana-geocoding: + build: + context: services/mana-geocoding + dockerfile: Dockerfile + image: mana-geocoding:local + container_name: mana-geocoding + restart: always + mem_limit: 128m + # Pelias runs on host network via its own compose, so the wrapper + # reaches it via host.docker.internal (Pelias API at :4000). 
+ extra_hosts: + - "host.docker.internal:host-gateway" + environment: + TZ: Europe/Berlin + PORT: 3018 + PELIAS_API_URL: http://host.docker.internal:4000/v1 + CORS_ORIGINS: https://mana.how,http://localhost:5173 + CACHE_MAX_ENTRIES: "5000" + CACHE_TTL_MS: "86400000" + ports: + - "127.0.0.1:3018:3018" + healthcheck: + test: ["CMD", "bun", "-e", "fetch('http://127.0.0.1:3018/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"] + interval: 120s + timeout: 10s + retries: 3 + start_period: 15s + mana-user: build: context: services/mana-user diff --git a/docker/grafana/dashboards/uptime.json b/docker/grafana/dashboards/uptime.json index 803c3b3e7..da8916042 100644 --- a/docker/grafana/dashboards/uptime.json +++ b/docker/grafana/dashboards/uptime.json @@ -6,7 +6,10 @@ "schemaVersion": 38, "version": 1, "refresh": "1m", - "time": { "from": "now-24h", "to": "now" }, + "time": { + "from": "now-24h", + "to": "now" + }, "timepicker": {}, "timezone": "browser", "templating": { @@ -27,23 +30,47 @@ "id": 1, "title": "Zusammenfassung", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + } }, { "type": "stat", "id": 2, "title": "Web Apps Online", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": null }, - { "color": "yellow", "value": 10 }, - { "color": "green", "value": 18 } + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "green", + "value": 18 + } ] }, "unit": "short", @@ -56,12 +83,19 @@ "graphMode": "area", "justifyMode": "center", "orientation": "auto", - 
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, "textMode": "auto" }, "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "sum(probe_success{job=\"blackbox-web\"})", "refId": "A", "legendFormat": "Online" @@ -72,17 +106,36 @@ "type": "stat", "id": 3, "title": "APIs Online", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": null }, - { "color": "yellow", "value": 7 }, - { "color": "green", "value": 9 } + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 7 + }, + { + "color": "green", + "value": 9 + } ] }, "unit": "short", @@ -95,12 +148,19 @@ "graphMode": "area", "justifyMode": "center", "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, "textMode": "auto" }, "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "sum(probe_success{job=\"blackbox-api\"})", "refId": "A", "legendFormat": "Online" @@ -111,17 +171,36 @@ "type": "stat", "id": 4, "title": "Infra Online", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 
+ }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": null }, - { "color": "yellow", "value": 4 }, - { "color": "green", "value": 5 } + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 4 + }, + { + "color": "green", + "value": 5 + } ] }, "unit": "short", @@ -134,12 +213,19 @@ "graphMode": "area", "justifyMode": "center", "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, "textMode": "auto" }, "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "sum(probe_success{job=\"blackbox-infra\"})", "refId": "A", "legendFormat": "Online" @@ -150,17 +236,36 @@ "type": "stat", "id": 5, "title": "GPU Services Online", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 1 + }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": null }, - { "color": "yellow", "value": 2 }, - { "color": "green", "value": 4 } + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 2 + }, + { + "color": "green", + "value": 4 + } ] }, "unit": "short", @@ -173,12 +278,19 @@ "graphMode": "area", "justifyMode": "center", "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, "textMode": "auto" }, "targets": [ { - "datasource": 
{ "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "sum(probe_success{job=\"blackbox-gpu\"})", "refId": "A", "legendFormat": "Online" @@ -189,17 +301,36 @@ "type": "stat", "id": 6, "title": "Ø Antwortzeit (Web)", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 1 + }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 2 }, - { "color": "red", "value": 5 } + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 2 + }, + { + "color": "red", + "value": 5 + } ] }, "unit": "s", @@ -212,12 +343,19 @@ "graphMode": "area", "justifyMode": "center", "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, "textMode": "auto" }, "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "avg(probe_duration_seconds{job=\"blackbox-web\"})", "refId": "A", "legendFormat": "Ø ms" @@ -229,14 +367,27 @@ "id": 10, "title": "Web Apps", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 } + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + } }, { "type": "table", "id": 11, "title": "Web App Status", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 14, "w": 12, "x": 0, "y": 6 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 0, + "y": 6 + }, "fieldConfig": { 
"defaults": { "custom": { @@ -247,53 +398,106 @@ { "type": "value", "options": { - "0": { "text": "DOWN", "color": "red" }, - "1": { "text": "UP", "color": "green" } + "0": { + "text": "DOWN", + "color": "red" + }, + "1": { + "text": "UP", + "color": "green" + } } } ], "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } ] } }, "overrides": [ { - "matcher": { "id": "byName", "options": "instance" }, + "matcher": { + "id": "byName", + "options": "instance" + }, "properties": [ - { "id": "displayName", "value": "URL" }, - { "id": "custom.width", "value": 260 } + { + "id": "displayName", + "value": "URL" + }, + { + "id": "custom.width", + "value": 260 + } ] }, { - "matcher": { "id": "byName", "options": "Value" }, + "matcher": { + "id": "byName", + "options": "Value" + }, "properties": [ - { "id": "displayName", "value": "Status" }, - { "id": "custom.width", "value": 80 } + { + "id": "displayName", + "value": "Status" + }, + { + "id": "custom.width", + "value": 80 + } ] } ] }, "options": { - "sortBy": [{ "displayName": "Status", "desc": false }], - "footer": { "show": false } + "sortBy": [ + { + "displayName": "Status", + "desc": false + } + ], + "footer": { + "show": false + } }, "transformations": [ - { "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } }, + { + "id": "labelsToFields", + "options": { + "mode": "columns", + "keepLabels": ["instance"] + } + }, { "id": "organize", "options": { - "excludeByName": { "Time": true, "job": true, "__name__": true }, - "indexByName": { "instance": 0, "Value": 1 } + "excludeByName": { + "Time": true, + "job": true, + "__name__": true + }, + "indexByName": { + "instance": 0, + "Value": 1 + } } } ], "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + 
}, "expr": "probe_success{job=\"blackbox-web\"}", "instant": true, "refId": "A", @@ -305,18 +509,37 @@ "type": "timeseries", "id": 12, "title": "Web App Antwortzeiten", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 14, "w": 12, "x": 12, "y": 6 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 12, + "y": 6 + }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 1, "fillOpacity": 10 }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "lineWidth": 1, + "fillOpacity": 10 + }, "unit": "s", "thresholds": { "mode": "absolute", "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 5 } + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 5 + } ] } }, @@ -328,11 +551,16 @@ "placement": "bottom", "calcs": ["lastNotNull", "mean"] }, - "tooltip": { "mode": "multi" } + "tooltip": { + "mode": "multi" + } }, "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "probe_duration_seconds{job=\"blackbox-web\"}", "refId": "A", "legendFormat": "{{instance}}" @@ -344,68 +572,136 @@ "id": 20, "title": "API Health Endpoints", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 } + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + } }, { "type": "table", "id": 21, "title": "API Status", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 10, "w": 12, "x": 0, "y": 21 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 21 + }, "fieldConfig": { "defaults": { - "custom": { "align": "left", "displayMode": "color-background" }, + "custom": { + "align": "left", + "displayMode": "color-background" + }, "mappings": [ { "type": "value", "options": 
{ - "0": { "text": "DOWN", "color": "red" }, - "1": { "text": "UP", "color": "green" } + "0": { + "text": "DOWN", + "color": "red" + }, + "1": { + "text": "UP", + "color": "green" + } } } ], "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } ] } }, "overrides": [ { - "matcher": { "id": "byName", "options": "instance" }, + "matcher": { + "id": "byName", + "options": "instance" + }, "properties": [ - { "id": "displayName", "value": "Endpoint" }, - { "id": "custom.width", "value": 300 } + { + "id": "displayName", + "value": "Endpoint" + }, + { + "id": "custom.width", + "value": 300 + } ] }, { - "matcher": { "id": "byName", "options": "Value" }, + "matcher": { + "id": "byName", + "options": "Value" + }, "properties": [ - { "id": "displayName", "value": "Status" }, - { "id": "custom.width", "value": 80 } + { + "id": "displayName", + "value": "Status" + }, + { + "id": "custom.width", + "value": 80 + } ] } ] }, "options": { - "sortBy": [{ "displayName": "Status", "desc": false }], - "footer": { "show": false } + "sortBy": [ + { + "displayName": "Status", + "desc": false + } + ], + "footer": { + "show": false + } }, "transformations": [ - { "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } }, + { + "id": "labelsToFields", + "options": { + "mode": "columns", + "keepLabels": ["instance"] + } + }, { "id": "organize", "options": { - "excludeByName": { "Time": true, "job": true }, - "indexByName": { "instance": 0, "Value": 1 } + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "instance": 0, + "Value": 1 + } } } ], "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "probe_success{job=\"blackbox-api\"}", "instant": true, "refId": "A", @@ -417,23 +713,52 @@ 
"type": "timeseries", "id": 22, "title": "API Uptime-Verlauf (24h)", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 10, "w": 12, "x": 12, "y": 21 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 21 + }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 2, "fillOpacity": 20 }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "lineWidth": 2, + "fillOpacity": 20 + }, "unit": "short", "min": 0, "max": 1, "mappings": [ - { "type": "value", "options": { "0": { "text": "DOWN" }, "1": { "text": "UP" } } } + { + "type": "value", + "options": { + "0": { + "text": "DOWN" + }, + "1": { + "text": "UP" + } + } + } ], "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } ] } }, @@ -445,11 +770,16 @@ "placement": "bottom", "calcs": ["lastNotNull", "mean"] }, - "tooltip": { "mode": "multi" } + "tooltip": { + "mode": "multi" + } }, "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "probe_success{job=\"blackbox-api\"}", "refId": "A", "legendFormat": "{{instance}}" @@ -461,65 +791,132 @@ "id": 30, "title": "Infrastruktur & GPU", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 } + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + } }, { "type": "table", "id": 31, "title": "Infra-Dienste Status", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 8, "w": 8, "x": 0, "y": 32 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 32 + }, "fieldConfig": { "defaults": { - "custom": { "align": "left", "displayMode": 
"color-background" }, + "custom": { + "align": "left", + "displayMode": "color-background" + }, "mappings": [ { "type": "value", "options": { - "0": { "text": "DOWN", "color": "red" }, - "1": { "text": "UP", "color": "green" } + "0": { + "text": "DOWN", + "color": "red" + }, + "1": { + "text": "UP", + "color": "green" + } } } ], "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } ] } }, "overrides": [ { - "matcher": { "id": "byName", "options": "instance" }, + "matcher": { + "id": "byName", + "options": "instance" + }, "properties": [ - { "id": "displayName", "value": "Dienst" }, - { "id": "custom.width", "value": 220 } + { + "id": "displayName", + "value": "Dienst" + }, + { + "id": "custom.width", + "value": 220 + } ] }, { - "matcher": { "id": "byName", "options": "Value" }, - "properties": [{ "id": "displayName", "value": "Status" }] + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Status" + } + ] } ] }, "options": { - "sortBy": [{ "displayName": "Status", "desc": false }], - "footer": { "show": false } + "sortBy": [ + { + "displayName": "Status", + "desc": false + } + ], + "footer": { + "show": false + } }, "transformations": [ - { "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } }, + { + "id": "labelsToFields", + "options": { + "mode": "columns", + "keepLabels": ["instance"] + } + }, { "id": "organize", "options": { - "excludeByName": { "Time": true, "job": true }, - "indexByName": { "instance": 0, "Value": 1 } + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "instance": 0, + "Value": 1 + } } } ], "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": 
"probe_success{job=\"blackbox-infra\"}", "instant": true, "refId": "A", @@ -531,59 +928,121 @@ "type": "table", "id": 32, "title": "GPU Server Status", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 8, "w": 8, "x": 8, "y": 32 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 32 + }, "fieldConfig": { "defaults": { - "custom": { "align": "left", "displayMode": "color-background" }, + "custom": { + "align": "left", + "displayMode": "color-background" + }, "mappings": [ { "type": "value", "options": { - "0": { "text": "DOWN", "color": "red" }, - "1": { "text": "UP", "color": "green" } + "0": { + "text": "DOWN", + "color": "red" + }, + "1": { + "text": "UP", + "color": "green" + } } } ], "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } ] } }, "overrides": [ { - "matcher": { "id": "byName", "options": "instance" }, + "matcher": { + "id": "byName", + "options": "instance" + }, "properties": [ - { "id": "displayName", "value": "Dienst" }, - { "id": "custom.width", "value": 220 } + { + "id": "displayName", + "value": "Dienst" + }, + { + "id": "custom.width", + "value": 220 + } ] }, { - "matcher": { "id": "byName", "options": "Value" }, - "properties": [{ "id": "displayName", "value": "Status" }] + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Status" + } + ] } ] }, "options": { - "sortBy": [{ "displayName": "Status", "desc": false }], - "footer": { "show": false } + "sortBy": [ + { + "displayName": "Status", + "desc": false + } + ], + "footer": { + "show": false + } }, "transformations": [ - { "id": "labelsToFields", "options": { "mode": "columns", "keepLabels": ["instance"] } }, + { + "id": "labelsToFields", + "options": { + "mode": 
"columns", + "keepLabels": ["instance"] + } + }, { "id": "organize", "options": { - "excludeByName": { "Time": true, "job": true }, - "indexByName": { "instance": 0, "Value": 1 } + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "instance": 0, + "Value": 1 + } } } ], "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "probe_success{job=\"blackbox-gpu\"}", "instant": true, "refId": "A", @@ -595,55 +1054,291 @@ "type": "timeseries", "id": 33, "title": "Alle Dienste — Uptime-Verlauf", - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "gridPos": { "h": 8, "w": 8, "x": 16, "y": 32 }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 32 + }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 2, "fillOpacity": 10 }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "lineWidth": 2, + "fillOpacity": 10 + }, "unit": "short", "min": 0, "max": 1, "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } ] } }, "overrides": [] }, "options": { - "legend": { "displayMode": "list", "placement": "bottom" }, - "tooltip": { "mode": "multi" } + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } }, "targets": [ { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "avg(probe_success{job=\"blackbox-web\"})", "refId": "A", "legendFormat": "Web Apps" }, { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": 
"avg(probe_success{job=\"blackbox-api\"})", "refId": "B", "legendFormat": "APIs" }, { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "avg(probe_success{job=\"blackbox-infra\"})", "refId": "C", "legendFormat": "Infra" }, { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "avg(probe_success{job=\"blackbox-internal\"})", + "refId": "E", + "legendFormat": "Internal" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "expr": "avg(probe_success{job=\"blackbox-gpu\"})", "refId": "D", "legendFormat": "GPU" } ] + }, + { + "type": "table", + "id": 34, + "title": "Interne Dienste Status", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 40 + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "displayMode": "color-background" + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "DOWN", + "color": "red" + }, + "1": { + "text": "UP", + "color": "green" + } + } + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "displayName", + "value": "Dienst" + }, + { + "id": "custom.width", + "value": 320 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Status" + } + ] + } + ] + }, + "options": { + "sortBy": [ + { + "displayName": "Status", + "desc": false + } + ], + "footer": { + "show": false + } + }, + "transformations": [ + { + "id": "labelsToFields", + "options": { + "mode": "columns", + "keepLabels": ["instance"] + } + }, + { + "id": 
"organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "instance": 0, + "Value": 1 + } + } + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "probe_success{job=\"blackbox-internal\"}", + "instant": true, + "refId": "A", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "stat", + "id": 35, + "title": "Geocoding Ø Latenz", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 40 + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "s", + "mappings": [] + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "avg(probe_duration_seconds{instance=~\".*mana-geocoding.*\"})", + "refId": "A", + "legendFormat": "Ø" + } + ] } ] } diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index dbb9e6a3a..68dd9aa12 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -283,6 +283,27 @@ scrape_configs: - target_label: __address__ replacement: blackbox-exporter:9115 + # Internal-only services (not exposed via Cloudflare). + # Probed over the Docker network so the blackbox exporter reaches + # them by container name. 
+ - job_name: 'blackbox-internal' + metrics_path: /probe + params: + module: [http_2xx] + static_configs: + - targets: + - http://mana-geocoding:3018/health + # Pelias stack runs on host network, reached via host gateway + - http://host.docker.internal:4000/v1/status + - http://host.docker.internal:9200/_cluster/health + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + # Infrastructure & Monitoring Tools - job_name: 'blackbox-infra' metrics_path: /probe diff --git a/scripts/generate-status-page.sh b/scripts/generate-status-page.sh index 540dda31a..929fd1cde 100755 --- a/scripts/generate-status-page.sh +++ b/scripts/generate-status-page.sh @@ -25,8 +25,8 @@ fetch_metric() { 2>/dev/null || echo '{"status":"error","data":{"result":[]}}' } -SUCCESS_JSON="$(fetch_metric 'probe_success{job=~"blackbox-web|blackbox-api|blackbox-infra|blackbox-gpu"}')" -DURATION_JSON="$(fetch_metric 'probe_duration_seconds{job=~"blackbox-web|blackbox-api|blackbox-infra|blackbox-gpu"}')" +SUCCESS_JSON="$(fetch_metric 'probe_success{job=~"blackbox-web|blackbox-api|blackbox-infra|blackbox-internal|blackbox-gpu"}')" +DURATION_JSON="$(fetch_metric 'probe_duration_seconds{job=~"blackbox-web|blackbox-api|blackbox-infra|blackbox-internal|blackbox-gpu"}')" # ── Hilfsfunktionen ───────────────────────────────────────────────────────── @@ -59,10 +59,20 @@ get_instances() { # Freundlicher Name aus URL friendly_name() { url="$1" - # Entferne https:// + # Entferne https:// oder http:// name="${url#https://}" - # Route-basierte URLs: mana.how/chat → Chat + name="${name#http://}" + # Interne Services (Docker-Netz): mana-geocoding:3018/health → Mana Geocoding case "$name" in + mana-geocoding:*) + name="Mana Geocoding" + ;; + host.docker.internal:4000*) + name="Pelias API" + ;; + host.docker.internal:9200*) + name="Pelias Elasticsearch" + ;; 
mana.how/*) name="${name#mana.how/}" ;; @@ -71,8 +81,10 @@ friendly_name() { name="${name%.mana.how}" ;; esac - # Entferne /health suffix + # Entferne /health, /_cluster/health, /v1/status suffixe name="${name%/health}" + name="${name%/_cluster/health}" + name="${name%/v1/status}" # mana.how (ohne Route) → Mana [ "$name" = "mana.how" ] && name="Mana" # Erster Buchstabe groß (POSIX-kompatibel) @@ -131,11 +143,12 @@ render_rows() { web_up="$(count_up blackbox-web)"; web_total="$(count_total blackbox-web)" api_up="$(count_up blackbox-api)"; api_total="$(count_total blackbox-api)" +internal_up="$(count_up blackbox-internal)"; internal_total="$(count_total blackbox-internal)" infra_up="$(count_up blackbox-infra)"; infra_total="$(count_total blackbox-infra)" gpu_up="$(count_up blackbox-gpu)"; gpu_total="$(count_total blackbox-gpu)" -total_up=$(( web_up + api_up + infra_up + gpu_up )) -total_all=$(( web_total + api_total + infra_total + gpu_total )) +total_up=$(( web_up + api_up + internal_up + infra_up + gpu_up )) +total_all=$(( web_total + api_total + internal_total + infra_total + gpu_total )) total_down=$(( total_all - total_up )) if [ "$total_down" -eq 0 ] && [ "$total_all" -gt 0 ]; then @@ -276,10 +289,11 @@ cat > "${OUTPUT}.tmp" << HTMLEOF /* ── Summary Row ── */ .summary { display: grid; - grid-template-columns: repeat(4, 1fr); + grid-template-columns: repeat(5, 1fr); gap: 12px; margin-bottom: 32px; } + @media (max-width: 900px) { .summary { grid-template-columns: repeat(3, 1fr); } } @media (max-width: 600px) { .summary { grid-template-columns: repeat(2, 1fr); } } .summary-card { @@ -385,6 +399,10 @@ cat > "${OUTPUT}.tmp" << HTMLEOF
${api_up}/${api_total}
API Backends
+
+
${internal_up}/${internal_total}
+
Interne
+
${infra_up}/${infra_total}
Infrastruktur
@@ -415,6 +433,16 @@ $(render_rows blackbox-api)
+
+
+

Interne Dienste

+ ${internal_up} von ${internal_total} online +
+ +$(render_rows blackbox-internal) +
+
+

Infrastruktur

diff --git a/services/mana-geocoding/Dockerfile b/services/mana-geocoding/Dockerfile
new file mode 100644
index 000000000..e23474aa7
--- /dev/null
+++ b/services/mana-geocoding/Dockerfile
@@ -0,0 +1,16 @@
+FROM oven/bun:1 AS production
+
+WORKDIR /app
+# Manifests first so the install layer stays cached; falls back to a plain install if the frozen one fails.
+COPY package.json bun.lock* ./
+RUN bun install --frozen-lockfile 2>/dev/null || bun install
+
+COPY src ./src
+COPY tsconfig.json ./
+
+EXPOSE 3018
+# Probe 127.0.0.1 like the compose healthcheck does; localhost may resolve to ::1 first.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
+  CMD bun -e "fetch('http://127.0.0.1:3018/health').then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))"
+
+CMD ["bun", "run", "src/index.ts"]