diff --git a/cloudflared-config.yml b/cloudflared-config.yml
index 531c5842c..6d561a6ee 100644
--- a/cloudflared-config.yml
+++ b/cloudflared-config.yml
@@ -132,8 +132,6 @@ ingress:
   # ============================================
   # Forgejo (Git + CI/CD)
   # ============================================
-  - hostname: git.mana.how
-    service: http://localhost:3041
   # ============================================
   # Standalone microservices
@@ -215,12 +213,6 @@ ingress:
   # ============================================
   # Monitoring & observability
   # ============================================
-  - hostname: grafana.mana.how
-    service: http://localhost:8000
-  - hostname: stats.mana.how
-    service: http://localhost:8010
-  - hostname: glitchtip.mana.how
-    service: http://localhost:8020
   # ============================================
   # GPU services (NOT in this tunnel)
diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml
index 3e74542b5..476cea4d5 100644
--- a/docker-compose.macmini.yml
+++ b/docker-compose.macmini.yml
@@ -202,56 +202,6 @@ services:
   # Tier 0b: Forgejo (Git + CI/CD + Registry)
   # ============================================
-  forgejo:
-    image: codeberg.org/forgejo/forgejo:11
-    container_name: mana-core-forgejo
-    restart: always
-    mem_limit: 512m
-    depends_on:
-      postgres:
-        condition: service_healthy
-    environment:
-      USER_UID: 1000
-      USER_GID: 1000
-      FORGEJO__database__DB_TYPE: postgres
-      FORGEJO__database__HOST: postgres:5432
-      FORGEJO__database__NAME: forgejo
-      FORGEJO__database__USER: postgres
-      FORGEJO__database__PASSWD: ${POSTGRES_PASSWORD:-mana123}
-      FORGEJO__server__DOMAIN: git.mana.how
-      FORGEJO__server__SSH_DOMAIN: git.mana.how
-      FORGEJO__server__ROOT_URL: https://git.mana.how/
-      FORGEJO__server__HTTP_PORT: 3000
-      FORGEJO__server__SSH_PORT: 2222
-      FORGEJO__server__LFS_START_SERVER: "true"
-      FORGEJO__service__DISABLE_REGISTRATION: "true"
-      FORGEJO__service__REQUIRE_SIGNIN_VIEW: "false"
-      FORGEJO__actions__ENABLED: "true"
-      FORGEJO__actions__DEFAULT_ACTIONS_URL: https://code.forgejo.org
-      FORGEJO__packages__ENABLED: "true"
-      FORGEJO__ui__DEFAULT_THEME: forgejo-dark
-      FORGEJO__ui__SHOW_USER_EMAIL: "false"
-      FORGEJO__mailer__ENABLED: "false"
-    volumes:
-      - /Volumes/ManaData/forgejo:/data
-    ports:
-      - "3041:3000"
-      - "2222:22"
-    healthcheck:
-      test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000/api/v1/version"]
-      interval: 120s
-      timeout: 10s
-      retries: 3
-      start_period: 30s
-
-  # Forgejo runner removed — no macOS binary exists, Docker-based runner
-  # can't access host filesystem/SSH for CD. GitHub CD handles deployment
-  # via native self-hosted runner. Forgejo is kept as a mirror only.
-
-  # ============================================
-  # Tier 1: Core Auth Service (Port 3001)
-  # ============================================
-
   mana-auth:
     build:
       context: .
@@ -1281,163 +1231,6 @@ services:
   # Tier 7: Monitoring Dashboards (Ports 8000-8099)
   # ============================================
-  grafana:
-    image: grafana/grafana:10.4.1
-    container_name: mana-mon-grafana
-    restart: always
-    mem_limit: 192m
-    depends_on:
-      victoriametrics:
-        condition: service_healthy
-    environment:
-      GF_SECURITY_ADMIN_USER: admin
-      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD:-admin}
-      GF_USERS_ALLOW_SIGN_UP: false
-      GF_AUTH_ANONYMOUS_ENABLED: true
-      GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer
-      GF_SERVER_ROOT_URL: https://grafana.mana.how
-      GF_SERVER_HTTP_PORT: 8000
-      GF_INSTALL_PLUGINS: yesoreyeram-infinity-datasource
-      GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH: /var/lib/grafana/dashboards/master-overview.json
-    volumes:
-      - ./docker/grafana/provisioning:/etc/grafana/provisioning:ro
-      - ./docker/grafana/dashboards:/var/lib/grafana/dashboards:ro
-      - grafana_data:/var/lib/grafana
-    ports:
-      - "8000:8000"
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8000/api/health"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 10s
-
-  umami:
-    # Pinned away from postgresql-latest on 2026-04-23. The rolling
-    # tag jumped to Umami 3.1.0 (Next.js 16) and started crashing the
-    # container on every POST /api/send — page loaders hung on the
-    # failing tracker request. v2.18.0 is the last known-stable v2.
-    # Rolling back to v2 was safe here because the schema is shared
-    # across 2.x. If you bump to v3 again, verify the DB migration
-    # path and test /api/send with a real POST before committing.
-    image: ghcr.io/umami-software/umami:postgresql-v2.18.0
-    container_name: mana-mon-umami
-    restart: always
-    mem_limit: 384m
-    depends_on:
-      postgres:
-        condition: service_healthy
-    environment:
-      DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD:-mana123}@postgres:5432/umami
-      DATABASE_TYPE: postgresql
-      APP_SECRET: ${UMAMI_APP_SECRET:-change-me-umami-secret}
-      DISABLE_TELEMETRY: 1
-    ports:
-      - "8010:3000"
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3000/api/heartbeat"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 30s
-
-  # ============================================
-  # Tier 8: Metrics & Exporters (Ports 9000-9199)
-  # ============================================
-
-  victoriametrics:
-    image: victoriametrics/victoria-metrics:v1.99.0
-    container_name: mana-mon-victoria
-    restart: always
-    mem_limit: 384m
-    # Mount the host config dir read-only and point promscrape directly at it,
-    # so edits to docker/prometheus/prometheus.yml are picked up by POST /-/reload
-    # without a container restart. The previous setup baked a copy into
-    # /etc/prometheus/ at startup, which silently drifted from the host file
-    # whenever the container wasn't restarted (matrix removal incident, 2026-04-08).
-    entrypoint: ["/victoria-metrics-prod", "-storageDataPath=/storage", "-retentionPeriod=2y", "-httpListenAddr=:9090", "-promscrape.config=/etc/prometheus/prometheus.yml", "-promscrape.config.strictParse=false", "-selfScrapeInterval=15s", "-search.latencyOffset=0s"]
-    volumes:
-      - ./docker/prometheus:/etc/prometheus:ro
-      - victoriametrics_data:/storage
-    ports:
-      - "9090:9090"
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9090/health"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 10s
-
-  tempo:
-    image: grafana/tempo:2.6.1
-    container_name: mana-mon-tempo
-    restart: always
-    mem_limit: 256m
-    command: ["-config.file=/etc/tempo/tempo.yaml"]
-    volumes:
-      - ./docker/tempo:/etc/tempo:ro
-      - tempo_data:/var/tempo
-    ports:
-      - "4318:4318" # OTLP HTTP receiver
-      - "3200:3200" # Tempo API (for Grafana)
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3200/ready"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 10s
-
-  loki:
-    image: grafana/loki:3.0.0
-    container_name: mana-mon-loki
-    restart: always
-    mem_limit: 192m
-    entrypoint: ["sh", "-c", "mkdir -p /etc/loki && cp /mnt/loki-config/*.yaml /etc/loki/ 2>/dev/null; exec /usr/bin/loki -config.file=/etc/loki/local-config.yaml"]
-    volumes:
-      - ./docker/loki:/mnt/loki-config:ro
-      - loki_data:/loki
-    ports:
-      - "3100:3100"
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3100/ready"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 15s
-
-  promtail:
-    image: grafana/promtail:3.0.0
-    container_name: mana-mon-promtail
-    restart: always
-    mem_limit: 96m
-    command: -config.file=/etc/promtail/config.yaml -config.expand-env=true
-    volumes:
-      - ./docker/promtail:/etc/promtail:ro
-      - /var/run/docker.sock:/var/run/docker.sock:ro
-    depends_on:
-      loki:
-        condition: service_started
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9080/ready"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 10s
-
-  pushgateway:
-    image: prom/pushgateway:v1.7.0
-    container_name: mana-mon-pushgateway
-    restart: always
-    mem_limit: 48m
-    ports:
-      - "9091:9091"
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9091/-/healthy"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 20s
-
   cadvisor:
     image: gcr.io/cadvisor/cadvisor:v0.49.1
     container_name: mana-mon-cadvisor
     restart: always
@@ -1541,112 +1334,6 @@ services:
             sleep 60
           done
-  blackbox-exporter:
-    image: prom/blackbox-exporter:v0.25.0
-    container_name: mana-mon-blackbox
-    restart: always
-    mem_limit: 128m
-    # Use Cloudflare + Google public resolvers instead of Docker's
-    # embedded DNS (127.0.0.11). Docker DNS forwards to the host
-    # resolver which forwards to the home router (FRITZ!Box), and the
-    # router keeps a stale negative cache for hours after a hostname
-    # first fails. New CNAMEs (e.g. fresh GPU public hostnames added
-    # via the Cloudflare dashboard) appear as "no such host" to the
-    # blackbox probes for the entire negative-cache TTL even though
-    # they resolve fine via 1.1.1.1 directly.
-    dns:
-      - 1.1.1.1
-      - 8.8.8.8
-    command: ["--config.file=/etc/blackbox/blackbox.yml"]
-    volumes:
-      - ./docker/blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro
-    ports:
-      - "9115:9115"
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9115/"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 10s
-
-  # ============================================
-  # Alerting Stack (Ports 9093-9095)
-  # ============================================
-
-  vmalert:
-    image: victoriametrics/vmalert:v1.99.0
-    container_name: mana-mon-vmalert
-    restart: always
-    mem_limit: 64m
-    depends_on:
-      victoriametrics:
-        condition: service_healthy
-      alertmanager:
-        condition: service_healthy
-    # Same direct-mount pattern as victoriametrics above — see the comment
-    # there for the rationale.
-    entrypoint: ["/vmalert-prod", "-datasource.url=http://victoriametrics:9090", "-notifier.url=http://alertmanager:9093", "-remoteWrite.url=http://victoriametrics:9090", "-remoteRead.url=http://victoriametrics:9090", "-rule=/etc/alerts/alerts.yml", "-evaluationInterval=30s", "-httpListenAddr=:8880"]
-    volumes:
-      - ./docker/prometheus:/etc/alerts:ro
-    ports:
-      - "8880:8880"
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8880/health"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 25s
-
-  alertmanager:
-    image: prom/alertmanager:v0.27.0
-    container_name: mana-mon-alertmanager
-    restart: always
-    mem_limit: 64m
-    depends_on:
-      alert-notifier:
-        condition: service_healthy
-    command: ["--config.file=/etc/alertmanager/alertmanager.yml", "--storage.path=/alertmanager", "--web.listen-address=:9093"]
-    volumes:
-      - ./docker/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
-      - alertmanager_data:/alertmanager
-    ports:
-      - "9093:9093"
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9093/-/healthy"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 25s
-
-  alert-notifier:
-    build:
-      context: ./docker/alert-notifier
-      dockerfile: Dockerfile
-    image: alert-notifier:local
-    container_name: mana-mon-alert-notifier
-    restart: always
-    # Tier-3 right-size 2026-04-28: live RSS ~25 MiB (79%) — at OOM
-    # risk during alert-burst when many alerts queue at once. Bumped
-    # to 48m.
-    mem_limit: 48m
-    environment:
-      PORT: 8080
-      TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
-      TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID:-}
-      NTFY_TOPIC: ${NTFY_TOPIC:-}
-    ports:
-      - "9095:8080"
-    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/health"]
-      interval: 300s
-      timeout: 5s
-      retries: 3
-      start_period: 25s
-
-  # ============================================
-  # Auto-Update (Watchtower)
-  # ============================================
-
   watchtower:
     image: nickfedor/watchtower:latest
     container_name: mana-auto-watchtower
     restart: always
@@ -1669,62 +1356,6 @@ services:
   # GlitchTip Error Tracking (Sentry-compatible)
   # ============================================
-  glitchtip:
-    image: glitchtip/glitchtip:latest
-    container_name: mana-mon-glitchtip
-    restart: always
-    mem_limit: 384m
-    environment:
-      DATABASE_URL: postgres://postgres:${POSTGRES_PASSWORD:-mana123}@postgres:5432/glitchtip
-      REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/1
-      SECRET_KEY: ${GLITCHTIP_SECRET_KEY:-change-me-in-production}
-      PORT: "8020"
-      GLITCHTIP_DOMAIN: https://glitchtip.mana.how
-      DEFAULT_FROM_EMAIL: glitchtip@mana.how
-      CELERY_WORKER_AUTOSCALE: "1,3"
-      ENABLE_USER_REGISTRATION: "true"
-    ports:
-      - "8020:8020"
-    depends_on:
-      postgres:
-        condition: service_healthy
-      redis:
-        condition: service_healthy
-    healthcheck:
-      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8020/_health/')"]
-      interval: 300s
-      timeout: 10s
-      retries: 3
-      start_period: 30s
-
-  glitchtip-worker:
-    image: glitchtip/glitchtip:latest
-    container_name: mana-mon-glitchtip-worker
-    restart: always
-    mem_limit: 192m
-    command: ./bin/run-celery-with-beat.sh
-    environment:
-      DATABASE_URL: postgres://postgres:${POSTGRES_PASSWORD:-mana123}@postgres:5432/glitchtip
-      REDIS_URL: redis://:${REDIS_PASSWORD:-redis123}@redis:6379/1
-      SECRET_KEY: ${GLITCHTIP_SECRET_KEY:-change-me-in-production}
-      GLITCHTIP_DOMAIN: https://glitchtip.mana.how
-      CELERY_WORKER_AUTOSCALE: "1,3"
-    depends_on:
-      postgres:
-        condition: service_healthy
-      redis:
-        condition: service_healthy
-
-  # ============================================
-  # Unified API Server
-  # ============================================
-  # apps/api — Hono/Bun process that hosts all 16 product compute
-  # modules (calendar, todo, chat, picture, planta, food, news,
-  # traces, moodlit, presi, music, contacts, storage, context, guides,
-  # research, who) on a single port. Replaces ~17 per-product backend
-  # containers from the pre-consolidation era; the unified Mana web
-  # app's compute calls all flow through here.
-
   mana-api:
     build:
       context: .
@@ -1821,17 +1452,7 @@ volumes:
   redis_data:
     name: mana-redis-data
-  victoriametrics_data:
-    name: mana-victoria-data
-  alertmanager_data:
-    name: mana-alertmanager-data
-  grafana_data:
-    name: mana-grafana-data
   analytics_data:
     name: mana-analytics-data
-  loki_data:
-    name: mana-loki-data
   stalwart_data:
     name: mana-stalwart-data
-  tempo_data:
-    name: mana-tempo-data