From 384be93274163200dd56602f4baeace26b87a3e8 Mon Sep 17 00:00:00 2001 From: Till JS Date: Thu, 7 May 2026 15:29:04 +0200 Subject: [PATCH] feat(gpu-box): healthchecks for glitchtip-worker, gpu-promtail, status-gen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three containers were running with no healthcheck — Docker showed them as 'none', so an actual crash inside the container would only surface once the process itself exited (and got restarted by restart-policy). Added container-internal probes that don't depend on tools the image doesn't ship: - glitchtip-worker: bash + /dev/tcp/glitchtip-redis/6379 — confirms the Celery broker is reachable. Bare-bones probe, no extra deps. - gpu-promtail: bash + /dev/tcp/loki/3100 — confirms the Loki sink promtail ships to is reachable. Replaces the previously disabled wget-based check, which errored 'executable file not found' on every tick. - status-page-gen: stat + date — confirms /output/status.json was rewritten in the last 3 min (script writes it every 60s). Catches the case where the apk-install loop wedges or the generator silently dies. CMD-SHELL runs /bin/sh, which is dash on Debian-based images, and dash doesn't support /dev/tcp — used CMD form with explicit bash for the two TCP probes. photon stays without a healthcheck — pre-existing user container, not in this compose file. Adding it would require a recreate which loses the warm OSM cache. After rollout: 17/20 GPU-Box containers healthy + 3 'none' (status-nginx, glitchtip-redis, gpu-node-exporter — all standard upstream images without built-in /health endpoints; their service is checked indirectly via downstream consumers' healthchecks). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- infrastructure/docker-compose.gpu-box.yml | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/infrastructure/docker-compose.gpu-box.yml b/infrastructure/docker-compose.gpu-box.yml index a06e45c97..1c8185108 100644 --- a/infrastructure/docker-compose.gpu-box.yml +++ b/infrastructure/docker-compose.gpu-box.yml @@ -299,7 +299,12 @@ services: depends_on: loki: condition: service_started - # healthcheck disabled: promtail image has no curl/wget/nc; restart policy handles crashes + healthcheck: + test: ['CMD', 'bash', '-c', 'exec 3<>/dev/tcp/loki/3100'] + interval: 60s + timeout: 5s + retries: 3 + start_period: 15s # ============================================ # Phase 2d — Glitchtip mit dedizierter Postgres + Redis (2026-05-06) @@ -378,6 +383,12 @@ services: SECRET_KEY: ${GLITCHTIP_SECRET_KEY} GLITCHTIP_DOMAIN: https://glitchtip.mana.how CELERY_WORKER_AUTOSCALE: '1,3' + healthcheck: + test: ['CMD', 'bash', '-c', 'exec 3<>/dev/tcp/glitchtip-redis/6379'] + interval: 60s + timeout: 5s + retries: 3 + start_period: 30s # ============================================ # Phase 2e — Status-Page (2026-05-07): generator + nginx auf GPU-Box. @@ -412,6 +423,12 @@ services: sh /tmp/generate.sh sleep 60 done + healthcheck: + test: ['CMD-SHELL', '[ -f /output/status.json ] && [ $$(( $$(date +%s) - $$(stat -c %Y /output/status.json) )) -lt 180 ]'] + interval: 90s + timeout: 5s + retries: 2 + start_period: 60s status-nginx: image: nginx:alpine