infra(gpu-box): commit GPU-Box compose to repo + Phase 2e docs

The GPU-Box stack has been carrying real production workload since
Phase 2c (monitoring) but only existed as a /srv/mana/docker-compose.gpu-box.yml
on the box itself. If the WSL filesystem dies, none of it is
reproducible. Bring the file into infrastructure/ as the source of
truth (the live file on the box must be kept in sync; manual rsync
for now since there's no CD into the GPU box).

Plus:
- infrastructure/.env.gpu-box.example as the secrets template
- infrastructure/README.md describing what runs there + how the
  Cloudflare-tunnel ingress is API-managed (not config.yml)
- .gitignore for the live infrastructure/.env.gpu-box copy
- MAC_MINI_SERVER.md status-page section now points at the GPU-Box
  setup instead of the long-stopped Mini container
- PLAN_OPTION_C.md: Phase 2e row + GPU-Box service tree update

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-05-07 13:28:49 +02:00
parent e3cca9e271
commit d8a35afd99
6 changed files with 576 additions and 9 deletions

View file

@@ -0,0 +1,445 @@
# Mana GPU-Box stack — workload split with the Mac Mini.
# Phase 2c (2026-05-06): metrics stack moved entirely to the GPU box.
# The production hot path stays unchanged on the Mini.
#
# Architecture:
# - Apps here (Grafana, Forgejo, Umami, future Glitchtip) read Postgres
#   on 192.168.178.131:5432 as the source of truth.
# - VictoriaMetrics scrapes Mac-Mini services via 192.168.178.131:<port>
#   (see monitoring/prometheus/prometheus.yml) plus the GPU box's own
#   node-exporter + cadvisor locally.
# - Loki receives logs from the Mini's Promtail (push mode; the Mini sends
#   to http://192.168.178.11:3100) AND from the GPU box's own Promtail.
# - Cutover via the Cloudflare tunnel `mana-gpu-server`.
services:
  # ============================================
  # Phase 2a — Grafana (UI)
  # ============================================
  grafana:
    image: grafana/grafana:10.4.1
    container_name: mana-mon-grafana
    restart: unless-stopped
    ports:
      - '8000:3000'
    environment:
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: ${GF_ADMIN_PASSWORD}
      GF_SERVER_ROOT_URL: https://grafana.mana.how
      GF_INSTALL_PLUGINS: yesoreyeram-infinity-datasource
      # NOTE(review): presumably referenced by provisioned datasources — confirm.
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      GLITCHTIP_DB_PASSWORD: ${GLITCHTIP_DB_PASSWORD}
    volumes:
      - mana-grafana-data:/var/lib/grafana
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    healthcheck:
      test: ['CMD-SHELL', 'wget -q -O- http://localhost:3000/api/health || exit 1']
      interval: 30s
      timeout: 5s
      retries: 3

  # ============================================
  # Phase 2b — Forgejo + Umami
  # ============================================
  forgejo:
    image: codeberg.org/forgejo/forgejo:11
    container_name: mana-core-forgejo
    restart: unless-stopped
    ports:
      - '3041:3000'
      - '2222:22'
    environment:
      # Quoted so YAML cannot type these as integers — compose env values
      # are strings.
      USER_UID: '1000'
      USER_GID: '1000'
      FORGEJO__database__DB_TYPE: postgres
      FORGEJO__database__HOST: '192.168.178.131:5432'
      FORGEJO__database__NAME: forgejo
      FORGEJO__database__USER: postgres
      FORGEJO__database__PASSWD: ${POSTGRES_PASSWORD}
      FORGEJO__server__DOMAIN: git.mana.how
      FORGEJO__server__SSH_DOMAIN: git.mana.how
      FORGEJO__server__ROOT_URL: https://git.mana.how/
      FORGEJO__server__HTTP_PORT: '3000'
      FORGEJO__server__SSH_PORT: '2222'
      FORGEJO__server__LFS_START_SERVER: 'true'
      FORGEJO__service__DISABLE_REGISTRATION: 'true'
      FORGEJO__service__REQUIRE_SIGNIN_VIEW: 'false'
      FORGEJO__actions__ENABLED: 'true'
      FORGEJO__actions__DEFAULT_ACTIONS_URL: https://code.forgejo.org
      FORGEJO__packages__ENABLED: 'true'
      FORGEJO__ui__DEFAULT_THEME: forgejo-dark
      FORGEJO__ui__SHOW_USER_EMAIL: 'false'
      FORGEJO__mailer__ENABLED: 'false'
    volumes:
      - ./forgejo-data:/data
    healthcheck:
      test: ['CMD', 'wget', '-q', '--spider', 'http://localhost:3000/api/v1/version']
      interval: 120s
      timeout: 10s
      retries: 3
      start_period: 60s

  umami:
    image: ghcr.io/umami-software/umami:postgresql-v2.18.0
    container_name: mana-mon-umami
    restart: unless-stopped
    ports:
      - '8010:3000'
    environment:
      DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD}@192.168.178.131:5432/umami
      DATABASE_TYPE: postgresql
      APP_SECRET: ${UMAMI_APP_SECRET}
      DISABLE_TELEMETRY: '1'
    healthcheck:
      test: ['CMD-SHELL', 'wget -q -O- http://127.0.0.1:3000/api/heartbeat || exit 1']
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s

  # ============================================
  # Phase 2c — metrics stack
  # ============================================
  victoriametrics:
    image: victoriametrics/victoria-metrics:v1.99.0
    container_name: mana-mon-victoria
    extra_hosts:
      - 'host.docker.internal:host-gateway'
    restart: unless-stopped
    entrypoint:
      - /victoria-metrics-prod
      - -storageDataPath=/storage
      - -retentionPeriod=2y
      - -httpListenAddr=:9090
      - -promscrape.config=/etc/prometheus/prometheus.yml
      - -promscrape.config.strictParse=false
      - -selfScrapeInterval=15s
      - -search.latencyOffset=0s
    volumes:
      - ./monitoring/prometheus:/etc/prometheus:ro
      - victoriametrics-data:/storage
    ports:
      - '9090:9090'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9090/health']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 15s

  loki:
    image: grafana/loki:3.0.0
    container_name: mana-mon-loki
    restart: unless-stopped
    # Config is copied out of the read-only bind mount at startup so Loki
    # can run with its expected /etc/loki layout.
    entrypoint: ['sh', '-c', 'mkdir -p /etc/loki && cp /mnt/loki-config/*.yaml /etc/loki/ 2>/dev/null; exec /usr/bin/loki -config.file=/etc/loki/local-config.yaml']
    volumes:
      - ./monitoring/loki:/mnt/loki-config:ro
      - loki-data:/loki
    ports:
      - '3100:3100'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:3100/ready']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 15s

  pushgateway:
    image: prom/pushgateway:v1.7.0
    container_name: mana-mon-pushgateway
    restart: unless-stopped
    ports:
      - '9091:9091'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9091/-/healthy']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 20s

  blackbox-exporter:
    image: prom/blackbox-exporter:v0.25.0
    container_name: mana-mon-blackbox
    restart: unless-stopped
    dns:
      - 1.1.1.1
      - 8.8.8.8
    command: ['--config.file=/etc/blackbox/blackbox.yml']
    volumes:
      - ./monitoring/blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro
    ports:
      - '9115:9115'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9115/']
      interval: 60s
      timeout: 10s
      retries: 3

  vmalert:
    image: victoriametrics/vmalert:v1.99.0
    container_name: mana-mon-vmalert
    restart: unless-stopped
    depends_on:
      victoriametrics:
        condition: service_healthy
      alertmanager:
        condition: service_started
    entrypoint:
      - /vmalert-prod
      - -datasource.url=http://victoriametrics:9090
      - -notifier.url=http://alertmanager:9093
      - -remoteWrite.url=http://victoriametrics:9090
      - -remoteRead.url=http://victoriametrics:9090
      - -rule=/etc/alerts/alerts.yml
      - -evaluationInterval=30s
      - -httpListenAddr=:8880
    volumes:
      - ./monitoring/prometheus:/etc/alerts:ro
    ports:
      - '8880:8880'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:8880/health']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 25s

  alertmanager:
    image: prom/alertmanager:v0.27.0
    container_name: mana-mon-alertmanager
    restart: unless-stopped
    depends_on:
      alert-notifier:
        condition: service_started
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager
      - --web.listen-address=:9093
    volumes:
      - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager-data:/alertmanager
    ports:
      - '9093:9093'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9093/-/healthy']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 25s

  alert-notifier:
    build:
      context: ./monitoring/alert-notifier
      dockerfile: Dockerfile
    image: alert-notifier:gpu-box
    container_name: mana-mon-alert-notifier
    restart: unless-stopped
    environment:
      PORT: '8080'
      TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
      TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID}
      NTFY_TOPIC: ${NTFY_TOPIC:-}
    ports:
      - '9095:8080'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:8080/health']
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 25s

  # GPU-Box self-monitoring (each box runs its own node-exporter + cadvisor)
  gpu-node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: mana-mon-gpu-node-exporter
    restart: unless-stopped
    command:
      - --collector.disable-defaults
      - --collector.cpu
      - --collector.meminfo
      - --collector.loadavg
      - --collector.filesystem
      - --collector.netdev
      - --collector.time
      - --collector.uname
      # $$ is compose-escaping for a literal $ in the regex.
      - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9100/metrics']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 20s

  gpu-cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    container_name: mana-mon-gpu-cadvisor
    restart: unless-stopped
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:8080/healthz']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 20s

  gpu-promtail:
    image: grafana/promtail:3.0.0
    container_name: mana-mon-gpu-promtail
    restart: unless-stopped
    command: ['-config.file=/etc/promtail/config.yaml', '-config.expand-env=true']
    volumes:
      - ./monitoring/promtail-gpu:/etc/promtail:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    depends_on:
      loki:
        condition: service_started
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9080/ready']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 15s

  # ============================================
  # Phase 2d — Glitchtip with dedicated Postgres + Redis (2026-05-06).
  # The Mini's Postgres had partition-creation permission issues
  # (macOS Docker storage quirk on the external SSD), hence its own
  # stack here.
  # ============================================
  glitchtip-postgres:
    image: postgres:16-alpine
    container_name: mana-mon-glitchtip-postgres
    restart: unless-stopped
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: ${GLITCHTIP_DB_PASSWORD}
      POSTGRES_DB: glitchtip
    volumes:
      - glitchtip-pg-data:/var/lib/postgresql/data
    healthcheck:
      test: ['CMD-SHELL', 'pg_isready -U postgres -d glitchtip']
      interval: 30s
      timeout: 5s
      retries: 5
      start_period: 15s

  glitchtip-redis:
    image: redis:7-alpine
    container_name: mana-mon-glitchtip-redis
    restart: unless-stopped
    command: ['redis-server', '--maxmemory', '128mb', '--maxmemory-policy', 'allkeys-lru']
    healthcheck:
      test: ['CMD', 'redis-cli', 'ping']
      interval: 30s
      timeout: 3s
      retries: 3
      start_period: 5s

  glitchtip:
    image: glitchtip/glitchtip:latest
    container_name: mana-mon-glitchtip
    restart: unless-stopped
    depends_on:
      glitchtip-postgres:
        condition: service_healthy
      glitchtip-redis:
        condition: service_healthy
    environment:
      DATABASE_URL: postgres://postgres:${GLITCHTIP_DB_PASSWORD}@glitchtip-postgres:5432/glitchtip
      REDIS_URL: redis://glitchtip-redis:6379/0
      SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
      DEFAULT_FROM_EMAIL: glitchtip@mana.how
      ENABLE_USER_REGISTRATION: 'true'
      PORT: '8020'
      GLITCHTIP_DOMAIN: https://glitchtip.mana.how
      CELERY_WORKER_AUTOSCALE: '1,3'
      # Heroku-style trigger so start.sh runs ./manage.py migrate at boot:
      DYNO: web.1
    ports:
      - '8020:8020'
    healthcheck:
      test: ['CMD', 'python3', '-c', 'import urllib.request; urllib.request.urlopen("http://localhost:8020/_health/")']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 90s

  glitchtip-worker:
    image: glitchtip/glitchtip:latest
    container_name: mana-mon-glitchtip-worker
    restart: unless-stopped
    depends_on:
      glitchtip:
        condition: service_started
    command: ['./bin/run-celery-with-beat.sh']
    environment:
      DATABASE_URL: postgres://postgres:${GLITCHTIP_DB_PASSWORD}@glitchtip-postgres:5432/glitchtip
      REDIS_URL: redis://glitchtip-redis:6379/0
      SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
      GLITCHTIP_DOMAIN: https://glitchtip.mana.how
      CELERY_WORKER_AUTOSCALE: '1,3'

  # ============================================
  # Phase 2e — status page (2026-05-07): generator + nginx on the GPU box.
  # Previously on the Mini, now right next to VM/Loki — no more public
  # vm.mana.how API exposure needed, no Cloudflare round-trip per query.
  # ============================================
  status-page-gen:
    image: alpine:3.20
    container_name: mana-mon-status-gen
    restart: unless-stopped
    depends_on:
      victoriametrics:
        condition: service_healthy
    environment:
      VICTORIAMETRICS_URL: http://victoriametrics:9090
      OUTPUT_FILE: /output/index.html
    volumes:
      # Bind-mounted from the sparse clone (auto-pulled hourly via the
      # mana-source-pull.timer systemd unit; no restart needed here since
      # the container copies /generate.sh to /tmp/ on each tick)
      - /srv/mana/source/scripts/generate-status-page.sh:/generate.sh:ro
      - /srv/mana/source/packages/shared-branding/src/mana-apps.ts:/mana-apps.ts:ro
      - status-output:/output
    command:
      - sh
      - -c
      - |
        apk add --no-cache curl jq || { echo "apk add fehlgeschlagen"; sleep 10; exit 1; }
        mkdir -p /output
        while true; do
          cp /generate.sh /tmp/generate.sh
          sh /tmp/generate.sh
          sleep 60
        done

  status-nginx:
    image: nginx:alpine
    container_name: mana-mon-status-nginx
    restart: unless-stopped
    depends_on:
      status-page-gen:
        condition: service_started
    ports:
      - '8090:80'
    volumes:
      - status-output:/usr/share/nginx/html:ro
    healthcheck:
      test: ['CMD', 'wget', '-q', '-O-', '-T', '3', 'http://127.0.0.1/status.json']
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 30s

volumes:
  glitchtip-pg-data:
  status-output:
  mana-grafana-data:
  victoriametrics-data:
  loki-data:
  alertmanager-data: