From a55aae6cb5e5cf0fbac1cb0158e35b6d8fd82c1e Mon Sep 17 00:00:00 2001 From: Till JS Date: Tue, 7 Apr 2026 22:59:38 +0200 Subject: [PATCH] =?UTF-8?q?chore(macmini):=20infra=20cleanup=20=E2=80=94?= =?UTF-8?q?=20compose=20env,=20blackbox=20mem,=20prometheus=20gpu=20probes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three Mac Mini infrastructure follow-ups bundled: 1. docker-compose.macmini.yml — drop ghost backend env vars from the mana-app-web service (todo, calendar, contacts, chat, storage, cards, music, nutriphi `PUBLIC_*_API_URL{,_CLIENT}` plus the memoro server URLs). The matching consumers were removed in the earlier ghost-API cleanup commits, so these env entries had been wiring nothing into the running container for several deploys. Force-recreating mana-app-web after pulling this commit will pick up the slimmer env automatically. 2. docker-compose.macmini.yml — bump `mana-mon-blackbox` mem_limit from 32m to 128m. blackbox-exporter v0.25 sits north of 32m under load and was OOM-restart-looping every ~90 seconds, which in turn made `status.mana.how` and the prometheus probe metrics stale (since the scraper was missing every other window). 3. docker/prometheus/prometheus.yml — split `blackbox-gpu` into two jobs: - `blackbox-gpu` now probes `/health` via the http_health module, because the GPU services (whisper STT, FLUX image gen, Coqui TTS) return 401/404 on `/` by design (auth or API-only). The previous http_2xx-on-`/` probe was reporting all four as down even though they answered `/health` with 200, which inflated the down count on status.mana.how. - `blackbox-gpu-root` keeps the http_2xx-on-`/` probe for Ollama, which has no `/health` endpoint but does answer 2xx on its root. Both jobs share the same blackbox-exporter relabel rewrite so the targets are routed through the exporter container, not scraped directly by VictoriaMetrics. 
Verified post-fix: status.mana.how reports 41/42 services up (only `gpu-video` remains down — LTX Video Gen is intentionally not deployed yet on the Windows GPU box). Co-Authored-By: Claude Opus 4.6 (1M context) --- docker-compose.macmini.yml | 28 ++++++++-------------------- docker/prometheus/prometheus.yml | 28 +++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml index cf4f8432c..d853c3038 100644 --- a/docker-compose.macmini.yml +++ b/docker-compose.macmini.yml @@ -917,27 +917,15 @@ services: PUBLIC_MANA_AUTH_URL_CLIENT: https://auth.mana.how PUBLIC_SYNC_SERVER_URL: http://mana-sync:3010 PUBLIC_SYNC_SERVER_URL_CLIENT: https://sync.mana.how - # Backend API URLs (server-side → container, client-side → public domain) - PUBLIC_TODO_API_URL: http://todo-backend:3031 - PUBLIC_TODO_API_URL_CLIENT: https://todo-api.mana.how - PUBLIC_CALENDAR_API_URL: http://calendar-backend:3032 - PUBLIC_CALENDAR_API_URL_CLIENT: https://calendar-api.mana.how - PUBLIC_CONTACTS_API_URL: http://contacts-backend:3033 - PUBLIC_CONTACTS_API_URL_CLIENT: https://contacts-api.mana.how - PUBLIC_CHAT_API_URL: http://chat-backend:3030 - PUBLIC_CHAT_API_URL_CLIENT: https://chat-api.mana.how - PUBLIC_STORAGE_API_URL: http://storage-backend:3034 - PUBLIC_STORAGE_API_URL_CLIENT: https://storage-api.mana.how - PUBLIC_CARDS_API_URL: http://cards-backend:3036 - PUBLIC_CARDS_API_URL_CLIENT: https://cards-api.mana.how - PUBLIC_MUSIC_API_URL: http://music-backend:3037 - PUBLIC_MUSIC_API_URL_CLIENT: https://music-api.mana.how - PUBLIC_NUTRIPHI_API_URL: http://nutriphi-backend:3038 - PUBLIC_NUTRIPHI_API_URL_CLIENT: https://nutriphi-api.mana.how + # Per-app HTTP backend URLs (todo-api, calendar-api, contacts-api, + # chat-api, storage-api, cards-api, music-api, nutriphi-api, + # picture-api, presi-api, zitare-api, clock-api, context-api) and + # the standalone memoro-server URL were removed in the pre-launch + # 
ghost-API cleanup — every product module talks to mana-sync + # directly and the unified `memoro` module is fully local-first. + # See docs/PRE_LAUNCH_CLEANUP.md for the full rationale. PUBLIC_ULOAD_SERVER_URL: http://uload-server:3070 PUBLIC_ULOAD_SERVER_URL_CLIENT: https://uload-api.mana.how - PUBLIC_MEMORO_SERVER_URL: http://memoro-server:3015 - PUBLIC_MEMORO_SERVER_URL_CLIENT: https://memoro-api.mana.how PUBLIC_MANA_MEDIA_URL: http://mana-media:3011 PUBLIC_MANA_MEDIA_URL_CLIENT: https://media.mana.how PUBLIC_MANA_LLM_URL: http://mana-llm:3025 @@ -1422,7 +1410,7 @@ services: image: prom/blackbox-exporter:v0.25.0 container_name: mana-mon-blackbox restart: always - mem_limit: 32m + mem_limit: 128m command: ["--config.file=/etc/blackbox/blackbox.yml"] volumes: - ./docker/blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index e5dbdce9a..20e05bedf 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -307,18 +307,36 @@ scrape_configs: - target_label: __address__ replacement: blackbox-exporter:9115 - # GPU Server Services + # GPU Server Services — probe /health, not / + # The GPU services (whisper STT, TTS, FLUX image gen) only return 2xx + # on /health; their root path returns 401/403/404 by design (auth or + # API-only). Ollama is the exception — its / returns 200, but it has + # no /health endpoint, so we keep it on / via a separate target. 
- job_name: 'blackbox-gpu' + metrics_path: /probe + params: + module: [http_health] + static_configs: + - targets: + - https://gpu-stt.mana.how/health + - https://gpu-tts.mana.how/health + - https://gpu-img.mana.how/health + - https://gpu-video.mana.how/health + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 + + - job_name: 'blackbox-gpu-root' metrics_path: /probe params: module: [http_2xx] static_configs: - targets: - https://gpu-ollama.mana.how - - https://gpu-stt.mana.how - - https://gpu-tts.mana.how - - https://gpu-img.mana.how - - https://gpu-video.mana.how relabel_configs: - source_labels: [__address__] target_label: __param_target