chore(infra): drop migrated services from Mini compose + tunnel config

Phase 2c+2d cleanup. The 14 services that moved to the GPU-Box stack
(grafana, victoriametrics, loki, tempo, promtail, alertmanager,
vmalert, pushgateway, blackbox-exporter, alert-notifier, umami,
glitchtip + worker, forgejo) are now stopped on the Mini and stable
on the GPU box, so the rollback insurance can come out:

- docker-compose.macmini.yml: drop 14 service blocks (-369 lines) +
  the now-orphan named volumes (victoriametrics_data, loki_data,
  alertmanager_data, grafana_data, tempo_data).
- cloudflared-config.yml: drop the four hostnames whose DNS already
  points at the mana-gpu-server tunnel. Mini-tunnel ingress for them
  has been dead routing since 2026-05-06, so removing the rules just
  makes the file match reality. The hostnames now live in the GPU
  tunnel's dashboard config (token-managed); a quick reachability
  check is sketched below.
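
A quick external sanity check of the cutover (illustrative only; per the
note above, these hostnames should now be answered via the GPU tunnel, so
they should keep responding after the Mini ingress rules are gone):

    # Each migrated hostname should still respond over HTTPS once the
    # Mini-tunnel ingress rules are removed.
    for h in git.mana.how grafana.mana.how stats.mana.how glitchtip.mana.how; do
      printf '%s: ' "$h"
      curl -s -o /dev/null -w '%{http_code}\n' "https://$h/"
    done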

Containers + volumes stay on the Mini for now. Running
`docker compose -f docker-compose.macmini.yml --env-file .env.macmini up -d --remove-orphans`
on the box drops the orphaned containers in one go when ready; the named
volumes stay behind until they are removed explicitly (see below).
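
One possible Mini-side teardown sequence (a sketch only; the volume names
come from the `volumes:` block removed below, and the data should be
confirmed as migrated to the GPU box before anything is deleted):

    # Drop the now-orphaned containers for the 14 migrated services.
    docker compose -f docker-compose.macmini.yml --env-file .env.macmini up -d --remove-orphans

    # Named volumes survive --remove-orphans; remove them separately once
    # the migrated data on the GPU box is confirmed good.
    docker volume rm \
      mana-victoria-data mana-alertmanager-data mana-grafana-data \
      mana-loki-data mana-tempo-data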

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Author: Till JS
Date:   2026-05-07 02:39:43 +02:00
Parent: f422fd6779
Commit: 0db64cb47b
2 changed files with 0 additions and 387 deletions

cloudflared-config.yml

@@ -132,8 +132,6 @@ ingress:
# ============================================
# Forgejo (Git + CI/CD)
# ============================================
- hostname: git.mana.how
service: http://localhost:3041
# ============================================
# Standalone microservices
@@ -215,12 +213,6 @@ ingress:
# ============================================
# Monitoring & observability
# ============================================
- hostname: grafana.mana.how
service: http://localhost:8000
- hostname: stats.mana.how
service: http://localhost:8010
- hostname: glitchtip.mana.how
service: http://localhost:8020
# ============================================
# GPU services (NOT in this tunnel)

docker-compose.macmini.yml

@@ -202,56 +202,6 @@ services:
# Tier 0b: Forgejo (Git + CI/CD + Registry)
# ============================================
forgejo:
image: codeberg.org/forgejo/forgejo:11
container_name: mana-core-forgejo
restart: always
mem_limit: 512m
depends_on:
postgres:
condition: service_healthy
environment:
USER_UID: 1000
USER_GID: 1000
FORGEJO__database__DB_TYPE: postgres
FORGEJO__database__HOST: postgres:5432
FORGEJO__database__NAME: forgejo
FORGEJO__database__USER: postgres
FORGEJO__database__PASSWD: ${POSTGRES_PASSWORD:-mana123}
FORGEJO__server__DOMAIN: git.mana.how
FORGEJO__server__SSH_DOMAIN: git.mana.how
FORGEJO__server__ROOT_URL: https://git.mana.how/
FORGEJO__server__HTTP_PORT: 3000
FORGEJO__server__SSH_PORT: 2222
FORGEJO__server__LFS_START_SERVER: "true"
FORGEJO__service__DISABLE_REGISTRATION: "true"
FORGEJO__service__REQUIRE_SIGNIN_VIEW: "false"
FORGEJO__actions__ENABLED: "true"
FORGEJO__actions__DEFAULT_ACTIONS_URL: https://code.forgejo.org
FORGEJO__packages__ENABLED: "true"
FORGEJO__ui__DEFAULT_THEME: forgejo-dark
FORGEJO__ui__SHOW_USER_EMAIL: "false"
FORGEJO__mailer__ENABLED: "false"
volumes:
- /Volumes/ManaData/forgejo:/data
ports:
- "3041:3000"
- "2222:22"
healthcheck:
test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000/api/v1/version"]
interval: 120s
timeout: 10s
retries: 3
start_period: 30s
# Forgejo runner removed — no macOS binary exists, Docker-based runner
# can't access host filesystem/SSH for CD. GitHub CD handles deployment
# via native self-hosted runner. Forgejo is kept as a mirror only.
# ============================================
# Tier 1: Core Auth Service (Port 3001)
# ============================================
mana-auth:
build:
context: .
@@ -1281,163 +1231,6 @@ services:
# Tier 7: Monitoring Dashboards (Ports 8000-8099)
# ============================================
grafana:
image: grafana/grafana:10.4.1
container_name: mana-mon-grafana
restart: always
mem_limit: 192m
depends_on:
victoriametrics:
condition: service_healthy
environment:
GF_SECURITY_ADMIN_USER: admin
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD:-admin}
GF_USERS_ALLOW_SIGN_UP: false
GF_AUTH_ANONYMOUS_ENABLED: true
GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer
GF_SERVER_ROOT_URL: https://grafana.mana.how
GF_SERVER_HTTP_PORT: 8000
GF_INSTALL_PLUGINS: yesoreyeram-infinity-datasource
GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH: /var/lib/grafana/dashboards/master-overview.json
volumes:
- ./docker/grafana/provisioning:/etc/grafana/provisioning:ro
- ./docker/grafana/dashboards:/var/lib/grafana/dashboards:ro
- grafana_data:/var/lib/grafana
ports:
- "8000:8000"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8000/api/health"]
interval: 300s
timeout: 10s
retries: 3
start_period: 10s
umami:
# Pinned away from postgresql-latest on 2026-04-23. The rolling
# tag jumped to Umami 3.1.0 (Next.js 16) and started crashing the
# container on every POST /api/send — page loaders hung on the
# failing tracker request. v2.18.0 is the last known-stable v2.
# Rolling back to v2 was safe here because the schema is shared
# across 2.x. If you bump to v3 again, verify the DB migration
# path and test /api/send with a real POST before committing.
image: ghcr.io/umami-software/umami:postgresql-v2.18.0
container_name: mana-mon-umami
restart: always
mem_limit: 384m
depends_on:
postgres:
condition: service_healthy
environment:
DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD:-mana123}@postgres:5432/umami
DATABASE_TYPE: postgresql
APP_SECRET: ${UMAMI_APP_SECRET:-change-me-umami-secret}
DISABLE_TELEMETRY: 1
ports:
- "8010:3000"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3000/api/heartbeat"]
interval: 300s
timeout: 10s
retries: 3
start_period: 30s
# ============================================
# Tier 8: Metrics & Exporters (Ports 9000-9199)
# ============================================
victoriametrics:
image: victoriametrics/victoria-metrics:v1.99.0
container_name: mana-mon-victoria
restart: always
mem_limit: 384m
# Mount the host config dir read-only and point promscrape directly at it,
# so edits to docker/prometheus/prometheus.yml are picked up by POST /-/reload
# without a container restart. The previous setup baked a copy into
# /etc/prometheus/ at startup, which silently drifted from the host file
# whenever the container wasn't restarted (matrix removal incident, 2026-04-08).
entrypoint: ["/victoria-metrics-prod", "-storageDataPath=/storage", "-retentionPeriod=2y", "-httpListenAddr=:9090", "-promscrape.config=/etc/prometheus/prometheus.yml", "-promscrape.config.strictParse=false", "-selfScrapeInterval=15s", "-search.latencyOffset=0s"]
volumes:
- ./docker/prometheus:/etc/prometheus:ro
- victoriametrics_data:/storage
ports:
- "9090:9090"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9090/health"]
interval: 300s
timeout: 10s
retries: 3
start_period: 10s
tempo:
image: grafana/tempo:2.6.1
container_name: mana-mon-tempo
restart: always
mem_limit: 256m
command: ["-config.file=/etc/tempo/tempo.yaml"]
volumes:
- ./docker/tempo:/etc/tempo:ro
- tempo_data:/var/tempo
ports:
- "4318:4318" # OTLP HTTP receiver
- "3200:3200" # Tempo API (for Grafana)
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3200/ready"]
interval: 300s
timeout: 10s
retries: 3
start_period: 10s
loki:
image: grafana/loki:3.0.0
container_name: mana-mon-loki
restart: always
mem_limit: 192m
entrypoint: ["sh", "-c", "mkdir -p /etc/loki && cp /mnt/loki-config/*.yaml /etc/loki/ 2>/dev/null; exec /usr/bin/loki -config.file=/etc/loki/local-config.yaml"]
volumes:
- ./docker/loki:/mnt/loki-config:ro
- loki_data:/loki
ports:
- "3100:3100"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3100/ready"]
interval: 300s
timeout: 10s
retries: 3
start_period: 15s
promtail:
image: grafana/promtail:3.0.0
container_name: mana-mon-promtail
restart: always
mem_limit: 96m
command: -config.file=/etc/promtail/config.yaml -config.expand-env=true
volumes:
- ./docker/promtail:/etc/promtail:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
depends_on:
loki:
condition: service_started
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9080/ready"]
interval: 300s
timeout: 10s
retries: 3
start_period: 10s
pushgateway:
image: prom/pushgateway:v1.7.0
container_name: mana-mon-pushgateway
restart: always
mem_limit: 48m
ports:
- "9091:9091"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9091/-/healthy"]
interval: 300s
timeout: 10s
retries: 3
start_period: 20s
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.49.1
container_name: mana-mon-cadvisor
@@ -1541,112 +1334,6 @@ services:
sleep 60
done
blackbox-exporter:
image: prom/blackbox-exporter:v0.25.0
container_name: mana-mon-blackbox
restart: always
mem_limit: 128m
# Use Cloudflare + Google public resolvers instead of Docker's
# embedded DNS (127.0.0.11). Docker DNS forwards to the host
# resolver which forwards to the home router (FRITZ!Box), and the
# router keeps a stale negative cache for hours after a hostname
# first fails. New CNAMEs (e.g. fresh GPU public hostnames added
# via the Cloudflare dashboard) appear as "no such host" to the
# blackbox probes for the entire negative-cache TTL even though
# they resolve fine via 1.1.1.1 directly.
dns:
- 1.1.1.1
- 8.8.8.8
command: ["--config.file=/etc/blackbox/blackbox.yml"]
volumes:
- ./docker/blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro
ports:
- "9115:9115"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9115/"]
interval: 300s
timeout: 10s
retries: 3
start_period: 10s
# ============================================
# Alerting Stack (Ports 9093-9095)
# ============================================
vmalert:
image: victoriametrics/vmalert:v1.99.0
container_name: mana-mon-vmalert
restart: always
mem_limit: 64m
depends_on:
victoriametrics:
condition: service_healthy
alertmanager:
condition: service_healthy
# Same direct-mount pattern as victoriametrics above — see the comment
# there for the rationale.
entrypoint: ["/vmalert-prod", "-datasource.url=http://victoriametrics:9090", "-notifier.url=http://alertmanager:9093", "-remoteWrite.url=http://victoriametrics:9090", "-remoteRead.url=http://victoriametrics:9090", "-rule=/etc/alerts/alerts.yml", "-evaluationInterval=30s", "-httpListenAddr=:8880"]
volumes:
- ./docker/prometheus:/etc/alerts:ro
ports:
- "8880:8880"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8880/health"]
interval: 300s
timeout: 10s
retries: 3
start_period: 25s
alertmanager:
image: prom/alertmanager:v0.27.0
container_name: mana-mon-alertmanager
restart: always
mem_limit: 64m
depends_on:
alert-notifier:
condition: service_healthy
command: ["--config.file=/etc/alertmanager/alertmanager.yml", "--storage.path=/alertmanager", "--web.listen-address=:9093"]
volumes:
- ./docker/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
- alertmanager_data:/alertmanager
ports:
- "9093:9093"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9093/-/healthy"]
interval: 300s
timeout: 10s
retries: 3
start_period: 25s
alert-notifier:
build:
context: ./docker/alert-notifier
dockerfile: Dockerfile
image: alert-notifier:local
container_name: mana-mon-alert-notifier
restart: always
# Tier-3 right-size 2026-04-28: live RSS ~25 MiB (79%) — at OOM
# risk during alert-burst when many alerts queue at once. Bumped
# to 48m.
mem_limit: 48m
environment:
PORT: 8080
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID:-}
NTFY_TOPIC: ${NTFY_TOPIC:-}
ports:
- "9095:8080"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/health"]
interval: 300s
timeout: 5s
retries: 3
start_period: 25s
# ============================================
# Auto-Update (Watchtower)
# ============================================
watchtower:
image: nickfedor/watchtower:latest
container_name: mana-auto-watchtower
@@ -1669,62 +1356,6 @@ services:
# GlitchTip Error Tracking (Sentry-compatible)
# ============================================
glitchtip:
image: glitchtip/glitchtip:latest
container_name: mana-mon-glitchtip
restart: always
mem_limit: 384m
environment:
DATABASE_URL: postgres://postgres:${POSTGRES_PASSWORD:-mana123}@postgres:5432/glitchtip
REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/1
SECRET_KEY: ${GLITCHTIP_SECRET_KEY:-change-me-in-production}
PORT: "8020"
GLITCHTIP_DOMAIN: https://glitchtip.mana.how
DEFAULT_FROM_EMAIL: glitchtip@mana.how
CELERY_WORKER_AUTOSCALE: "1,3"
ENABLE_USER_REGISTRATION: "true"
ports:
- "8020:8020"
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
healthcheck:
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8020/_health/')"]
interval: 300s
timeout: 10s
retries: 3
start_period: 30s
glitchtip-worker:
image: glitchtip/glitchtip:latest
container_name: mana-mon-glitchtip-worker
restart: always
mem_limit: 192m
command: ./bin/run-celery-with-beat.sh
environment:
DATABASE_URL: postgres://postgres:${POSTGRES_PASSWORD:-mana123}@postgres:5432/glitchtip
REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/1
SECRET_KEY: ${GLITCHTIP_SECRET_KEY:-change-me-in-production}
GLITCHTIP_DOMAIN: https://glitchtip.mana.how
CELERY_WORKER_AUTOSCALE: "1,3"
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
# ============================================
# Unified API Server
# ============================================
# apps/api — Hono/Bun process that hosts all 17 product compute
# modules (calendar, todo, chat, picture, planta, food, news,
# traces, moodlit, presi, music, contacts, storage, context, guides,
# research, who) on a single port. Replaces ~17 per-product backend
# containers from the pre-consolidation era; the unified Mana web
# app's compute calls all flow through here.
mana-api:
build:
context: .
@@ -1821,17 +1452,7 @@ services:
volumes:
redis_data:
name: mana-redis-data
victoriametrics_data:
name: mana-victoria-data
alertmanager_data:
name: mana-alertmanager-data
grafana_data:
name: mana-grafana-data
analytics_data:
name: mana-analytics-data
loki_data:
name: mana-loki-data
stalwart_data:
name: mana-stalwart-data
tempo_data:
name: mana-tempo-data