# Mana GPU-Box stack — workload split with the Mac Mini.
# Phase 2c (2026-05-06): the metrics stack runs entirely on the GPU box.
# The production hot path stays unchanged on the Mini.
#
# Architecture:
# - Grafana, Forgejo and Umami use the Postgres instance on
#   192.168.178.131:5432 as source of truth. Glitchtip has its own
#   dedicated Postgres + Redis in this stack (see Phase 2d below).
# - VictoriaMetrics scrapes Mac Mini services via 192.168.178.131:<port>
#   (see monitoring/prometheus/prometheus.yml) as well as the GPU box's own
#   node-exporter + cadvisor locally.
# - Loki receives logs from the Mini's Promtail (push mode, the Mini sends
#   to http://192.168.178.11:3100) AND from the GPU box's own Promtail.
# - Cutover happens via the Cloudflare tunnel `mana-gpu-server`.
#
# Variables interpolated from the environment:
#   GF_ADMIN_PASSWORD, POSTGRES_PASSWORD, GLITCHTIP_DB_PASSWORD,
#   GLITCHTIP_SECRET_KEY, UMAMI_APP_SECRET, TELEGRAM_BOT_TOKEN,
#   TELEGRAM_CHAT_ID, NTFY_TOPIC (optional, defaults to empty),
#   GLITCHTIP_TAG (optional, defaults to `latest`).

services:
  # ============================================
  # Phase 2a — Grafana (UI)
  # ============================================
  grafana:
    image: grafana/grafana:10.4.1
    container_name: mana-mon-grafana
    restart: unless-stopped
    ports:
      - '8000:3000'
    environment:
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: ${GF_ADMIN_PASSWORD}
      GF_SERVER_ROOT_URL: https://grafana.mana.how
      GF_INSTALL_PLUGINS: yesoreyeram-infinity-datasource
      # Database passwords are passed through to the Grafana container;
      # presumably referenced by the provisioned datasources — verify
      # against monitoring/grafana/provisioning.
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      GLITCHTIP_DB_PASSWORD: ${GLITCHTIP_DB_PASSWORD}
    volumes:
      - mana-grafana-data:/var/lib/grafana
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    healthcheck:
      test: ['CMD-SHELL', 'wget -q -O- http://localhost:3000/api/health || exit 1']
      interval: 30s
      timeout: 5s
      retries: 3

  # ============================================
  # Phase 2b — Forgejo + Umami
  # ============================================
  forgejo:
    image: codeberg.org/forgejo/forgejo:11
    container_name: mana-core-forgejo
    restart: unless-stopped
    ports:
      - '3041:3000'
      # Host port 2222 maps to the container's SSH port 22; it matches
      # FORGEJO__server__SSH_PORT below.
      - '2222:22'
    environment:
      USER_UID: '1000'
      USER_GID: '1000'
      FORGEJO__database__DB_TYPE: postgres
      FORGEJO__database__HOST: 192.168.178.131:5432
      FORGEJO__database__NAME: forgejo
      FORGEJO__database__USER: postgres
      FORGEJO__database__PASSWD: ${POSTGRES_PASSWORD}
      FORGEJO__server__DOMAIN: git.mana.how
      FORGEJO__server__SSH_DOMAIN: git.mana.how
      FORGEJO__server__ROOT_URL: https://git.mana.how/
      FORGEJO__server__HTTP_PORT: '3000'
      FORGEJO__server__SSH_PORT: '2222'
      FORGEJO__server__LFS_START_SERVER: 'true'
      FORGEJO__service__DISABLE_REGISTRATION: 'true'
      FORGEJO__service__REQUIRE_SIGNIN_VIEW: 'false'
      FORGEJO__actions__ENABLED: 'true'
      FORGEJO__actions__DEFAULT_ACTIONS_URL: https://code.forgejo.org
      FORGEJO__packages__ENABLED: 'true'
      FORGEJO__ui__DEFAULT_THEME: forgejo-dark
      FORGEJO__ui__SHOW_USER_EMAIL: 'false'
      FORGEJO__mailer__ENABLED: 'false'
    volumes:
      - ./forgejo-data:/data
    healthcheck:
      test: ['CMD', 'wget', '-q', '--spider', 'http://localhost:3000/api/v1/version']
      interval: 120s
      timeout: 10s
      retries: 3
      start_period: 60s

  umami:
    image: ghcr.io/umami-software/umami:postgresql-v2.18.0
    container_name: mana-mon-umami
    restart: unless-stopped
    ports:
      - '8010:3000'
    environment:
      DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD}@192.168.178.131:5432/umami
      DATABASE_TYPE: postgresql
      APP_SECRET: ${UMAMI_APP_SECRET}
      DISABLE_TELEMETRY: '1'
    healthcheck:
      test: ['CMD-SHELL', 'wget -q -O- http://127.0.0.1:3000/api/heartbeat || exit 1']
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s

  # ============================================
  # Phase 2c — Metrics stack
  # ============================================
  victoriametrics:
    image: victoriametrics/victoria-metrics:v1.99.0
    container_name: mana-mon-victoria
    extra_hosts:
      - 'host.docker.internal:host-gateway'
    restart: unless-stopped
    # Listens on :9090 and reads its scrape targets from the mounted
    # prometheus.yml.
    entrypoint:
      - /victoria-metrics-prod
      - -storageDataPath=/storage
      - -retentionPeriod=2y
      - -httpListenAddr=:9090
      - -promscrape.config=/etc/prometheus/prometheus.yml
      - -promscrape.config.strictParse=false
      - -selfScrapeInterval=15s
      - -search.latencyOffset=0s
    volumes:
      - ./monitoring/prometheus:/etc/prometheus:ro
      - victoriametrics-data:/storage
    ports:
      - '9090:9090'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9090/health']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 15s

  loki:
    image: grafana/loki:3.0.0
    container_name: mana-mon-loki
    restart: unless-stopped
    # Copies the *.yaml files from the read-only config mount into /etc/loki
    # (copy errors are discarded), then replaces the shell with Loki.
    entrypoint:
      - sh
      - -c
      - 'mkdir -p /etc/loki && cp /mnt/loki-config/*.yaml /etc/loki/ 2>/dev/null; exec /usr/bin/loki -config.file=/etc/loki/local-config.yaml'
    volumes:
      - ./monitoring/loki:/mnt/loki-config:ro
      - loki-data:/loki
    ports:
      - '3100:3100'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:3100/ready']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 15s

  pushgateway:
    image: prom/pushgateway:v1.7.0
    container_name: mana-mon-pushgateway
    restart: unless-stopped
    ports:
      - '9091:9091'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9091/-/healthy']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 20s

  blackbox-exporter:
    image: prom/blackbox-exporter:v0.25.0
    container_name: mana-mon-blackbox
    restart: unless-stopped
    # Explicit public resolvers for this container.
    dns:
      - 1.1.1.1
      - 8.8.8.8
    command: ['--config.file=/etc/blackbox/blackbox.yml']
    volumes:
      - ./monitoring/blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro
    ports:
      - '9115:9115'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9115/']
      interval: 60s
      timeout: 10s
      retries: 3

  vmalert:
    image: victoriametrics/vmalert:v1.99.0
    container_name: mana-mon-vmalert
    restart: unless-stopped
    depends_on:
      victoriametrics:
        condition: service_healthy
      alertmanager:
        condition: service_started
    # Evaluates /etc/alerts/alerts.yml (from ./monitoring/prometheus) every
    # 30s against VictoriaMetrics and notifies Alertmanager.
    entrypoint:
      - /vmalert-prod
      - -datasource.url=http://victoriametrics:9090
      - -notifier.url=http://alertmanager:9093
      - -remoteWrite.url=http://victoriametrics:9090
      - -remoteRead.url=http://victoriametrics:9090
      - -rule=/etc/alerts/alerts.yml
      - -evaluationInterval=30s
      - -httpListenAddr=:8880
    volumes:
      - ./monitoring/prometheus:/etc/alerts:ro
    ports:
      - '8880:8880'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:8880/health']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 25s

  alertmanager:
    image: prom/alertmanager:v0.27.0
    container_name: mana-mon-alertmanager
    restart: unless-stopped
    depends_on:
      alert-notifier:
        condition: service_started
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager
      - --web.listen-address=:9093
    volumes:
      - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager-data:/alertmanager
    ports:
      - '9093:9093'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9093/-/healthy']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 25s

  alert-notifier:
    # Built locally from ./monitoring/alert-notifier.
    build:
      context: ./monitoring/alert-notifier
      dockerfile: Dockerfile
    image: alert-notifier:gpu-box
    container_name: mana-mon-alert-notifier
    restart: unless-stopped
    environment:
      PORT: '8080'
      TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
      TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID}
      # Optional: interpolates to an empty string when NTFY_TOPIC is unset.
      NTFY_TOPIC: ${NTFY_TOPIC:-}
    ports:
      - '9095:8080'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:8080/health']
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 25s

  # GPU-Box self-monitoring (each box runs its own node-exporter + cadvisor)
  gpu-node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: mana-mon-gpu-node-exporter
    restart: unless-stopped
    # NOTE(review): no host /proc, /sys or rootfs mounts are configured here,
    # so the filesystem/netdev collectors presumably report the container's
    # view rather than the host's — confirm this is intended.
    command:
      - --collector.disable-defaults
      - --collector.cpu
      - --collector.meminfo
      - --collector.loadavg
      - --collector.filesystem
      - --collector.netdev
      - --collector.time
      - --collector.uname
      # `$$` is Compose's escape for a literal `$` in the regex.
      - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9100/metrics']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 20s

  gpu-cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    container_name: mana-mon-gpu-cadvisor
    restart: unless-stopped
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:8080/healthz']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 20s

  gpu-promtail:
    image: grafana/promtail:3.0.0
    container_name: mana-mon-gpu-promtail
    restart: unless-stopped
    command: ['-config.file=/etc/promtail/config.yaml', '-config.expand-env=true']
    volumes:
      - ./monitoring/promtail-gpu:/etc/promtail:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    depends_on:
      loki:
        condition: service_started
    # healthcheck disabled: promtail image has no curl/wget/nc; restart policy handles crashes

  # ============================================
  # Phase 2d — Glitchtip with dedicated Postgres + Redis (2026-05-06)
  # The Mini's Postgres had partition-creation permission issues
  # (macOS Docker storage quirk on the external SSD), hence a separate
  # stack here.
  # ============================================
  glitchtip-postgres:
    image: postgres:16-alpine
    container_name: mana-mon-glitchtip-postgres
    restart: unless-stopped
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: ${GLITCHTIP_DB_PASSWORD}
      POSTGRES_DB: glitchtip
    volumes:
      - glitchtip-pg-data:/var/lib/postgresql/data
    healthcheck:
      test: ['CMD-SHELL', 'pg_isready -U postgres -d glitchtip']
      interval: 30s
      timeout: 5s
      retries: 5
      start_period: 15s

  glitchtip-redis:
    image: redis:7-alpine
    container_name: mana-mon-glitchtip-redis
    restart: unless-stopped
    # Capped at 128mb with LRU eviction across all keys.
    command: ['redis-server', '--maxmemory', '128mb', '--maxmemory-policy', 'allkeys-lru']
    healthcheck:
      test: ['CMD', 'redis-cli', 'ping']
      interval: 30s
      timeout: 3s
      retries: 3
      start_period: 5s

  glitchtip:
    # Web and worker share one tag so they cannot drift apart. Set
    # GLITCHTIP_TAG to pin a release; the default keeps the previous
    # floating `latest` behaviour.
    image: glitchtip/glitchtip:${GLITCHTIP_TAG:-latest}
    container_name: mana-mon-glitchtip
    restart: unless-stopped
    depends_on:
      glitchtip-postgres:
        condition: service_healthy
      glitchtip-redis:
        condition: service_healthy
    environment:
      DATABASE_URL: postgres://postgres:${GLITCHTIP_DB_PASSWORD}@glitchtip-postgres:5432/glitchtip
      REDIS_URL: redis://glitchtip-redis:6379/0
      SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
      DEFAULT_FROM_EMAIL: glitchtip@mana.how
      # NOTE(review): registration is open here while Forgejo disables it —
      # confirm this is intended for the public glitchtip.mana.how domain.
      ENABLE_USER_REGISTRATION: 'true'
      PORT: '8020'
      GLITCHTIP_DOMAIN: https://glitchtip.mana.how
      CELERY_WORKER_AUTOSCALE: '1,3'
      # Heroku-style trigger so start.sh runs ./manage.py migrate at boot:
      DYNO: web.1
    ports:
      - '8020:8020'
    healthcheck:
      test: ['CMD', 'python3', '-c', 'import urllib.request; urllib.request.urlopen("http://localhost:8020/_health/")']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 90s

  glitchtip-worker:
    # Same image and tag as the `glitchtip` web service.
    image: glitchtip/glitchtip:${GLITCHTIP_TAG:-latest}
    container_name: mana-mon-glitchtip-worker
    restart: unless-stopped
    depends_on:
      glitchtip:
        condition: service_started
    command: ['./bin/run-celery-with-beat.sh']
    environment:
      DATABASE_URL: postgres://postgres:${GLITCHTIP_DB_PASSWORD}@glitchtip-postgres:5432/glitchtip
      REDIS_URL: redis://glitchtip-redis:6379/0
      SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
      GLITCHTIP_DOMAIN: https://glitchtip.mana.how
      CELERY_WORKER_AUTOSCALE: '1,3'

  # ============================================
  # Phase 2e — Status page (2026-05-07): generator + nginx on the GPU box.
  # Previously on the Mini, now right next to VM/Loki — no public
  # vm.mana.how API exposure needed anymore, no Cloudflare round trip
  # per query.
  # ============================================
  status-page-gen:
    image: alpine:3.20
    container_name: mana-mon-status-gen
    restart: unless-stopped
    depends_on:
      victoriametrics:
        condition: service_healthy
    environment:
      VICTORIAMETRICS_URL: http://victoriametrics:9090
      OUTPUT_FILE: /output/index.html
    volumes:
      # Bind from sparse clone (auto-pulled hourly via systemd timer
      # mana-source-pull.timer). The loop below re-copies /generate.sh to
      # /tmp on every iteration, so no restart is meant to be needed.
      # NOTE(review): single-file bind mounts follow the inode; if the pull
      # replaces the file rather than rewriting it in place, the container
      # may keep seeing the old content until restarted — confirm.
      - /srv/mana/source/scripts/generate-status-page.sh:/generate.sh:ro
      - /srv/mana/source/packages/shared-branding/src/mana-apps.ts:/mana-apps.ts:ro
      - status-output:/output
    # Installs curl + jq at container start (exits after 10s if that fails,
    # letting the restart policy retry), then regenerates the page every 60s.
    command:
      - sh
      - -c
      - |
        apk add --no-cache curl jq || { echo "apk add fehlgeschlagen"; sleep 10; exit 1; }
        mkdir -p /output
        while true; do
          cp /generate.sh /tmp/generate.sh
          sh /tmp/generate.sh
          sleep 60
        done

  status-nginx:
    # NOTE(review): `alpine` is a floating tag, unlike the pinned images
    # above — consider pinning.
    image: nginx:alpine
    container_name: mana-mon-status-nginx
    restart: unless-stopped
    depends_on:
      status-page-gen:
        condition: service_started
    ports:
      - '8090:80'
    volumes:
      - status-output:/usr/share/nginx/html:ro
    healthcheck:
      # NOTE(review): probes /status.json while the generator's OUTPUT_FILE
      # is /output/index.html; presumably the script also writes
      # status.json — verify against generate-status-page.sh.
      test: ['CMD', 'wget', '-q', '-O-', '-T', '3', 'http://127.0.0.1/status.json']
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 30s

# Named volumes with default driver settings.
volumes:
  glitchtip-pg-data:
  status-output:
  mana-grafana-data:
  victoriametrics-data:
  loki-data:
  alertmanager-data: