# Mana GPU-Box stack: workload split with the Mac Mini.
# Phase 2c (2026-05-06): metrics stack fully on the GPU-Box.
# The production hot path stays on the Mini, unchanged.
#
# Architecture:
# - Apps here (Grafana, Forgejo, Umami, future Glitchtip) read Postgres
#   at 192.168.178.131:5432 as the source of truth.
# - VictoriaMetrics scrapes Mac-Mini services via 192.168.178.131
#   (ports per monitoring/prometheus/prometheus.yml) plus the GPU-Box's
#   own node-exporter + cadvisor locally.
# - Loki receives logs from the Mini's Promtail (push mode, the Mini
#   sends to http://192.168.178.11:3100) AND from the GPU-Box's own
#   Promtail.
# - Cutover via the Cloudflare tunnel `mana-gpu-server`.

services:
  # ============================================
  # Phase 2a — Grafana (UI)
  # ============================================
  grafana:
    image: grafana/grafana:10.4.1
    container_name: mana-mon-grafana
    restart: unless-stopped
    ports:
      - '8000:3000'
    environment:
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: ${GF_ADMIN_PASSWORD}
      GF_SERVER_ROOT_URL: https://grafana.mana.how
      GF_INSTALL_PLUGINS: yesoreyeram-infinity-datasource
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      GLITCHTIP_DB_PASSWORD: ${GLITCHTIP_DB_PASSWORD}
    volumes:
      - mana-grafana-data:/var/lib/grafana
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    healthcheck:
      test: ['CMD-SHELL', 'wget -q -O- http://localhost:3000/api/health || exit 1']
      interval: 30s
      timeout: 5s
      retries: 3

  # ============================================
  # Phase 2b — Forgejo + Umami
  # ============================================
  forgejo:
    image: codeberg.org/forgejo/forgejo:11
    container_name: mana-core-forgejo
    restart: unless-stopped
    ports:
      - '3041:3000'
      - '2222:22'
    environment:
      USER_UID: 1000
      USER_GID: 1000
      FORGEJO__database__DB_TYPE: postgres
      FORGEJO__database__HOST: 192.168.178.131:5432
      FORGEJO__database__NAME: forgejo
      FORGEJO__database__USER: postgres
      FORGEJO__database__PASSWD: ${POSTGRES_PASSWORD}
      FORGEJO__server__DOMAIN: git.mana.how
      FORGEJO__server__SSH_DOMAIN: git.mana.how
      FORGEJO__server__ROOT_URL: https://git.mana.how/
      FORGEJO__server__HTTP_PORT: 3000
      FORGEJO__server__SSH_PORT: 2222
      FORGEJO__server__LFS_START_SERVER: 'true'
      FORGEJO__service__DISABLE_REGISTRATION: 'true'
      FORGEJO__service__REQUIRE_SIGNIN_VIEW: 'false'
      FORGEJO__actions__ENABLED: 'true'
      FORGEJO__actions__DEFAULT_ACTIONS_URL: https://code.forgejo.org
      FORGEJO__packages__ENABLED: 'true'
      FORGEJO__ui__DEFAULT_THEME: forgejo-dark
      FORGEJO__ui__SHOW_USER_EMAIL: 'false'
      FORGEJO__mailer__ENABLED: 'false'
    volumes:
      - ./forgejo-data:/data
    healthcheck:
      test: ['CMD', 'wget', '-q', '--spider', 'http://localhost:3000/api/v1/version']
      interval: 120s
      timeout: 10s
      retries: 3
      start_period: 60s

  umami:
    image: ghcr.io/umami-software/umami:postgresql-v2.18.0
    container_name: mana-mon-umami
    restart: unless-stopped
    ports:
      - '8010:3000'
    environment:
      DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD}@192.168.178.131:5432/umami
      DATABASE_TYPE: postgresql
      APP_SECRET: ${UMAMI_APP_SECRET}
      DISABLE_TELEMETRY: '1'
    healthcheck:
      test: ['CMD-SHELL', 'wget -q -O- http://127.0.0.1:3000/api/heartbeat || exit 1']
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s

  # ============================================
  # Photon Geocoder (pre-existing user container, adopted into compose
  # 2026-05-07 to gain a healthcheck). Backs mana-geocoding's
  # `privacy:'local'` provider via photon.mana.how (GPU-tunnel).
  # OSM index lives in /opt/photon-data on the WSL2 host — survives
  # container recreation.
  # ============================================
  photon:
    image: eclipse-temurin:21-jre
    container_name: photon
    restart: unless-stopped
    working_dir: /photon
    environment:
      LC_ALL: en_US.UTF-8
    ports:
      - '2322:2322'
    volumes:
      - /opt/photon-data/photon.jar:/photon/photon.jar:ro
      - /opt/photon-data/photon_data:/photon/photon_data
    command:
      - java
      - -jar
      - photon.jar
      - serve
      - -listen-ip
      - 0.0.0.0
      - -listen-port
      - '2322'
    healthcheck:
      # Photon has no /health endpoint; the canonical probe is a tiny
      # geocoding query. limit=1 keeps the response small.
      test: ['CMD-SHELL', 'curl -sf -o /dev/null "http://127.0.0.1:2322/api?q=Berlin&limit=1" || exit 1']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 120s

  # ============================================
  # Phase 2c — Metrics stack
  # ============================================
  victoriametrics:
    image: victoriametrics/victoria-metrics:v1.99.0
    container_name: mana-mon-victoria
    extra_hosts:
      # Make host.docker.internal resolvable so scrape jobs can target
      # services on the Docker host.
      - 'host.docker.internal:host-gateway'
    restart: unless-stopped
    entrypoint:
      - /victoria-metrics-prod
      - -storageDataPath=/storage
      - -retentionPeriod=2y
      - -httpListenAddr=:9090
      - -promscrape.config=/etc/prometheus/prometheus.yml
      - -promscrape.config.strictParse=false
      - -selfScrapeInterval=15s
      - -search.latencyOffset=0s
    volumes:
      - ./monitoring/prometheus:/etc/prometheus:ro
      - victoriametrics-data:/storage
    ports:
      - '9090:9090'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9090/health']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 15s
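  # For reference: a minimal sketch of the scrape config that
  # victoriametrics loads from ./monitoring/prometheus/prometheus.yml.
  # The job names and exporter ports (9100/8080) are assumptions; the
  # canonical file lives in the repo.
  #
  #   scrape_configs:
  #     - job_name: mini-node-exporter
  #       static_configs:
  #         - targets: ['192.168.178.131:9100']
  #     - job_name: gpu-node-exporter
  #       static_configs:
  #         - targets: ['gpu-node-exporter:9100']
  #     - job_name: gpu-cadvisor
  #       static_configs:
  #         - targets: ['gpu-cadvisor:8080']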
  loki:
    image: grafana/loki:3.0.0
    container_name: mana-mon-loki
    restart: unless-stopped
    # Stage the config files out of the read-only bind mount, then exec Loki.
    entrypoint: ['sh', '-c', 'mkdir -p /etc/loki && cp /mnt/loki-config/*.yaml /etc/loki/ 2>/dev/null; exec /usr/bin/loki -config.file=/etc/loki/local-config.yaml']
    volumes:
      - ./monitoring/loki:/mnt/loki-config:ro
      - loki-data:/loki
    ports:
      - '3100:3100'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:3100/ready']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 15s
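  # The Mini-side Promtail mentioned in the header pushes into this Loki.
  # A sketch of its client block (standard Promtail config; the concrete
  # file lives on the Mini, and the external label is an assumption):
  #
  #   clients:
  #     - url: http://192.168.178.11:3100/loki/api/v1/push
  #       external_labels:
  #         host: mac-mini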
  pushgateway:
    image: prom/pushgateway:v1.7.0
    container_name: mana-mon-pushgateway
    restart: unless-stopped
    ports:
      - '9091:9091'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9091/-/healthy']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 20s

  blackbox-exporter:
    image: prom/blackbox-exporter:v0.25.0
    container_name: mana-mon-blackbox
    restart: unless-stopped
    # Pin public resolvers so probe results don't depend on host DNS.
    dns:
      - 1.1.1.1
      - 8.8.8.8
    command: ['--config.file=/etc/blackbox/blackbox.yml']
    volumes:
      - ./monitoring/blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro
    ports:
      - '9115:9115'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9115/']
      interval: 60s
      timeout: 10s
      retries: 3

  vmalert:
    image: victoriametrics/vmalert:v1.99.0
    container_name: mana-mon-vmalert
    restart: unless-stopped
    depends_on:
      victoriametrics:
        condition: service_healthy
      alertmanager:
        condition: service_started
    entrypoint:
      - /vmalert-prod
      - -datasource.url=http://victoriametrics:9090
      - -notifier.url=http://alertmanager:9093
      - -remoteWrite.url=http://victoriametrics:9090
      - -remoteRead.url=http://victoriametrics:9090
      - -rule=/etc/alerts/alerts.yml
      - -evaluationInterval=30s
      - -httpListenAddr=:8880
    volumes:
      - ./monitoring/prometheus:/etc/alerts:ro
    ports:
      - '8880:8880'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:8880/health']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 25s
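  # A minimal sketch of a rule in the mounted alerts.yml, in the standard
  # Prometheus rule format vmalert consumes (group and alert names here
  # are illustrative, not the repo's actual rules):
  #
  #   groups:
  #     - name: availability
  #       rules:
  #         - alert: InstanceDown
  #           expr: up == 0
  #           for: 5m
  #           labels:
  #             severity: critical
  #           annotations:
  #             summary: '{{ $labels.instance }} is down'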
  alertmanager:
    image: prom/alertmanager:v0.27.0
    container_name: mana-mon-alertmanager
    restart: unless-stopped
    depends_on:
      alert-notifier:
        condition: service_started
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager
      - --web.listen-address=:9093
    volumes:
      - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager-data:/alertmanager
    ports:
      - '9093:9093'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9093/-/healthy']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 25s

  alert-notifier:
    build:
      context: ./monitoring/alert-notifier
      dockerfile: Dockerfile
    image: alert-notifier:gpu-box
    container_name: mana-mon-alert-notifier
    restart: unless-stopped
    environment:
      PORT: 8080
      TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
      TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID}
      NTFY_TOPIC: ${NTFY_TOPIC:-}
    ports:
      - '9095:8080'
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:8080/health']
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 25s
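  # alertmanager hands alerts to alert-notifier through a webhook
  # receiver. A sketch of the matching alertmanager.yml fragment (the
  # receiver name and webhook path are assumptions; the real config is
  # ./monitoring/alertmanager/alertmanager.yml):
  #
  #   route:
  #     receiver: telegram
  #   receivers:
  #     - name: telegram
  #       webhook_configs:
  #         - url: http://alert-notifier:8080/alert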
  # GPU-Box self-monitoring (each box runs its own node-exporter + cadvisor)
  gpu-node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: mana-mon-gpu-node-exporter
    restart: unless-stopped
    command:
      - --collector.disable-defaults
      - --collector.cpu
      - --collector.meminfo
      - --collector.loadavg
      - --collector.filesystem
      - --collector.netdev
      - --collector.time
      - --collector.uname
      - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:9100/metrics']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 20s

  gpu-cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    container_name: mana-mon-gpu-cadvisor
    restart: unless-stopped
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    healthcheck:
      test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://127.0.0.1:8080/healthz']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 20s

  gpu-promtail:
    image: grafana/promtail:3.0.0
    container_name: mana-mon-gpu-promtail
    restart: unless-stopped
    command: ['-config.file=/etc/promtail/config.yaml', '-config.expand-env=true']
    volumes:
      - ./monitoring/promtail-gpu:/etc/promtail:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    depends_on:
      loki:
        condition: service_started
    healthcheck:
      test: ['CMD', 'bash', '-c', 'exec 3<>/dev/tcp/loki/3100']
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 15s
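  # Sketch of the GPU-Box Promtail config in ./monitoring/promtail-gpu,
  # assuming the standard docker_sd_configs shape enabled by the mounted
  # docker.sock (the label wiring is illustrative):
  #
  #   clients:
  #     - url: http://loki:3100/loki/api/v1/push
  #   scrape_configs:
  #     - job_name: docker
  #       docker_sd_configs:
  #         - host: unix:///var/run/docker.sock
  #       relabel_configs:
  #         - source_labels: ['__meta_docker_container_name']
  #           regex: '/(.*)'
  #           target_label: container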
  # ============================================
  # Phase 2d — Glitchtip with dedicated Postgres + Redis (2026-05-06)
  # The Mini's Postgres had partition-creation permission issues
  # (a macOS Docker storage quirk on the external SSD), hence a
  # dedicated stack here.
  # ============================================
  glitchtip-postgres:
    image: postgres:16-alpine
    container_name: mana-mon-glitchtip-postgres
    restart: unless-stopped
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: ${GLITCHTIP_DB_PASSWORD}
      POSTGRES_DB: glitchtip
    volumes:
      - glitchtip-pg-data:/var/lib/postgresql/data
    healthcheck:
      test: ['CMD-SHELL', 'pg_isready -U postgres -d glitchtip']
      interval: 30s
      timeout: 5s
      retries: 5
      start_period: 15s

  glitchtip-redis:
    image: redis:7-alpine
    container_name: mana-mon-glitchtip-redis
    restart: unless-stopped
    command: ['redis-server', '--maxmemory', '128mb', '--maxmemory-policy', 'allkeys-lru']
    healthcheck:
      test: ['CMD', 'redis-cli', 'ping']
      interval: 30s
      timeout: 3s
      retries: 3
      start_period: 5s

  glitchtip:
    image: glitchtip/glitchtip:latest
    container_name: mana-mon-glitchtip
    restart: unless-stopped
    depends_on:
      glitchtip-postgres:
        condition: service_healthy
      glitchtip-redis:
        condition: service_healthy
    environment:
      DATABASE_URL: postgres://postgres:${GLITCHTIP_DB_PASSWORD}@glitchtip-postgres:5432/glitchtip
      REDIS_URL: redis://glitchtip-redis:6379/0
      SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
      DEFAULT_FROM_EMAIL: glitchtip@mana.how
      ENABLE_USER_REGISTRATION: 'true'
      PORT: '8020'
      GLITCHTIP_DOMAIN: https://glitchtip.mana.how
      CELERY_WORKER_AUTOSCALE: '1,3'
      # Heroku-style trigger so start.sh runs ./manage.py migrate at boot:
      DYNO: web.1
    ports:
      - '8020:8020'
    healthcheck:
      test: ['CMD', 'python3', '-c', 'import urllib.request; urllib.request.urlopen("http://localhost:8020/_health/")']
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 90s

  glitchtip-worker:
    image: glitchtip/glitchtip:latest
    container_name: mana-mon-glitchtip-worker
    restart: unless-stopped
    depends_on:
      glitchtip:
        condition: service_started
    command: ['./bin/run-celery-with-beat.sh']
    environment:
      DATABASE_URL: postgres://postgres:${GLITCHTIP_DB_PASSWORD}@glitchtip-postgres:5432/glitchtip
      REDIS_URL: redis://glitchtip-redis:6379/0
      SECRET_KEY: ${GLITCHTIP_SECRET_KEY}
      GLITCHTIP_DOMAIN: https://glitchtip.mana.how
      CELERY_WORKER_AUTOSCALE: '1,3'
    healthcheck:
      test: ['CMD', 'bash', '-c', 'exec 3<>/dev/tcp/glitchtip-redis/6379']
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 30s

  # ============================================
  # Phase 2e — Status page (2026-05-07): generator + nginx on the GPU-Box.
  # Previously on the Mini, now right next to VM/Loki: no public API
  # exposure of vm.mana.how needed anymore, and no Cloudflare round trip
  # per query.
  # ============================================
  status-page-gen:
    image: alpine:3.20
    container_name: mana-mon-status-gen
    restart: unless-stopped
    depends_on:
      victoriametrics:
        condition: service_healthy
    environment:
      VICTORIAMETRICS_URL: http://victoriametrics:9090
      OUTPUT_FILE: /output/index.html
    volumes:
      # Bind from the sparse clone (auto-pulled hourly via the systemd
      # timer mana-source-pull.timer; no restart needed here since the
      # container re-copies /generate.sh to /tmp/ on every tick)
      - /srv/mana/source/scripts/generate-status-page.sh:/generate.sh:ro
      - /srv/mana/source/packages/shared-branding/src/mana-apps.ts:/mana-apps.ts:ro
      - status-output:/output
    command:
      - sh
      - -c
      - |
        apk add --no-cache curl jq || { echo "apk add failed"; sleep 10; exit 1; }
        mkdir -p /output
        while true; do
          cp /generate.sh /tmp/generate.sh
          sh /tmp/generate.sh
          sleep 60
        done
    healthcheck:
      # Freshness probe: status.json must exist and be younger than 180s
      # (three missed 60s ticks mark the generator unhealthy).
      test: ['CMD-SHELL', '[ -f /output/status.json ] && [ $$(( $$(date +%s) - $$(stat -c %Y /output/status.json) )) -lt 180 ]']
      interval: 90s
      timeout: 5s
      retries: 2
      start_period: 60s

  status-nginx:
    image: nginx:alpine
    container_name: mana-mon-status-nginx
    restart: unless-stopped
    depends_on:
      status-page-gen:
        condition: service_started
    ports:
      - '8090:80'
    volumes:
      - status-output:/usr/share/nginx/html:ro
    healthcheck:
      test: ['CMD', 'wget', '-q', '-O-', '-T', '3', 'http://127.0.0.1/status.json']
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 30s

  # ============================================
  # Phase 2f-2 — news-ingester (2026-05-07)
  # Background article ingester: a Bun service with a 15-minute tick.
  # Writes to mana_platform on the Mini's Postgres via LAN. Pure
  # background load, zero hot-path coupling.
  # ============================================
  news-ingester:
    # GPU-Box-specific build (services/news-ingester/Dockerfile is not
    # workspace-aware; the one here vendors shared-rss as a file: ref).
    # Cross-arch: the GPU-Box is x86_64, the Mini is arm64, so image
    # transfer via docker save/load would throw "exec format error".
    build:
      context: /srv/mana/source
      dockerfile: infrastructure/news-ingester/Dockerfile
    image: news-ingester:gpu-box
    container_name: news-ingester
    restart: unless-stopped
    environment:
      TZ: Europe/Berlin
      PORT: 3066
      DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD}@192.168.178.131:5432/mana_platform
      TICK_INTERVAL_MS: '900000'
      RUN_ON_STARTUP: 'true'
    healthcheck:
      test: ['CMD', 'bun', '-e', "fetch('http://127.0.0.1:3066/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
      interval: 120s
      timeout: 10s
      retries: 3
      start_period: 30s

  # ============================================
  # Phase 2g — mana-research (Web Research Orchestrator, 2026-05-07)
  # 16+ search providers + LLM pipeline. User-facing but latency-tolerant
  # (5-30s queries). Writes to the mana_platform research tables, uses
  # the Redis cache, and calls mana-credits + mana-search. Geographic
  # proximity to gpu-llm/gpu-ollama, as with mana-ai.
  # ============================================
  mana-research:
    build:
      context: /srv/mana/source
      dockerfile: services/mana-research/Dockerfile
    image: mana-research:gpu-box
    container_name: mana-research
    restart: unless-stopped
    environment:
      TZ: Europe/Berlin
      NODE_ENV: production
      PORT: 3068
      DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD}@192.168.178.131:5432/mana_platform
      REDIS_URL: redis://:${REDIS_PASSWORD}@192.168.178.131:6379
      CACHE_TTL_SECONDS: '3600'
      CORS_ORIGINS: https://mana.how,https://chat.mana.how,https://research.mana.how
      # Cross-LAN service deps
      MANA_AUTH_URL: http://192.168.178.131:3001
      MANA_CREDITS_URL: http://192.168.178.131:3002
      MANA_LLM_URL: http://192.168.178.131:3025
      MANA_SEARCH_URL: http://192.168.178.131:3012
      MANA_SERVICE_KEY: ${MANA_SERVICE_KEY}
      # LLM + search provider keys (most empty; only Google is active currently)
      ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
      OPENAI_API_KEY: ${OPENAI_API_KEY:-}
      GOOGLE_GENAI_API_KEY: ${GOOGLE_GENAI_API_KEY:-}
      PERPLEXITY_API_KEY: ${PERPLEXITY_API_KEY:-}
      EXA_API_KEY: ${EXA_API_KEY:-}
      TAVILY_API_KEY: ${TAVILY_API_KEY:-}
      BRAVE_API_KEY: ${BRAVE_API_KEY:-}
      SERPER_API_KEY: ${SERPER_API_KEY:-}
      JINA_API_KEY: ${JINA_API_KEY:-}
      FIRECRAWL_API_KEY: ${FIRECRAWL_API_KEY:-}
      SCRAPINGBEE_API_KEY: ${SCRAPINGBEE_API_KEY:-}
    ports:
      - '3068:3068'
    healthcheck:
      test: ['CMD', 'bun', '-e', "fetch('http://localhost:3068/health').then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))"]
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 30s

  # ============================================
  # Phase 2f-3 — mana-ai (AI Mission Runner, 2026-05-07)
  # Background tick loop (60s default); queries mana-api + mana-llm +
  # mana-research and persists via mana_sync. Geographic proximity to
  # the gpu-llm/gpu-ollama stack is a bonus for future direct-LLM paths.
  # services/mana-ai/Dockerfile is already workspace-aware (pnpm
  # multi-stage) and works natively on the GPU-Box.
  # ============================================
  mana-ai:
    build:
      context: /srv/mana/source
      dockerfile: services/mana-ai/Dockerfile
    image: mana-ai:gpu-box
    container_name: mana-ai
    restart: unless-stopped
    environment:
      TZ: Europe/Berlin
      PORT: 3067
      NODE_ENV: production
      TICK_INTERVAL_MS: '60000'
      TICK_ENABLED: 'true'
      MANA_AI_DEEP_RESEARCH_ENABLED: 'false'
      # Cross-LAN dependency wiring: all Mini services via host port.
      MANA_API_URL: http://192.168.178.131:3060
      SYNC_DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD}@192.168.178.131:5432/mana_sync
      MANA_LLM_URL: http://192.168.178.131:3025
      MANA_RESEARCH_URL: http://192.168.178.131:3068
      # Internal service auth + RSA key for mission-grant decryption.
      MANA_SERVICE_KEY: ${MANA_SERVICE_KEY}
      MANA_AI_PRIVATE_KEY_PEM: ${MANA_AI_PRIVATE_KEY_PEM}
      # OTEL: Tempo has been gone since Phase 2c; empty = SDK no-op.
      OTEL_EXPORTER_OTLP_ENDPOINT: ''
    ports:
      - '3067:3067'
    healthcheck:
      test: ['CMD', 'bun', '-e', "fetch('http://localhost:3067/health').then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))"]
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 30s

volumes:
  glitchtip-pg-data:
  status-output:
  mana-grafana-data:
  victoriametrics-data:
  loki-data:
  alertmanager-data:
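# Expected .env keys for this stack, collected from the ${...} references
# above (values are secrets and live only in .env; keys marked optional
# default to empty via :-):
#
#   GF_ADMIN_PASSWORD=
#   POSTGRES_PASSWORD=
#   GLITCHTIP_DB_PASSWORD=
#   GLITCHTIP_SECRET_KEY=
#   UMAMI_APP_SECRET=
#   REDIS_PASSWORD=
#   TELEGRAM_BOT_TOKEN=
#   TELEGRAM_CHAT_ID=
#   NTFY_TOPIC=            # optional
#   MANA_SERVICE_KEY=
#   MANA_AI_PRIVATE_KEY_PEM=
#   ANTHROPIC_API_KEY=     # optional, as are the other LLM/search
#                          # provider keys listed under mana-research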