From 698e09b88cce7a9e7be635eead9f60cce86241e0 Mon Sep 17 00:00:00 2001 From: Till JS Date: Tue, 28 Apr 2026 16:10:31 +0200 Subject: [PATCH] chore(deploy): auto-apply additive Drizzle schema migrations + RAM headroom for mana-web build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two CD-pipeline ergonomics fixes that surfaced during the 2026-04-28 schema-drift sweep. (C) Auto-apply additive Drizzle migrations ======================================== 8 services use Drizzle (mana-auth/-credits/-events/-research/-mail/ -subscriptions/-user/-analytics) but the CD pipeline never ran their `db:push` script, so 4 schema additions stayed undeployed for days (auth.users.kind, credits.{sync_subscriptions,reservations}, event_discovery.*) until live PostgresErrors surfaced them. New `scripts/mac-mini/safe-db-push.sh`: - Uses `drizzle-kit generate` to write a probe SQL file (does NOT apply yet). - Greps the generated SQL for destructive patterns (DROP TABLE/ COLUMN/TYPE/SCHEMA/INDEX, ALTER COLUMN ... TYPE, RENAME). - Refuses to auto-apply if any are found — operator must review and run `pnpm db:push --force` manually after pg_dump. - Otherwise applies via `drizzle-kit push --force` and cleans up the probe artifacts. CD step "Apply schema migrations" runs between build and container restart, sourcing each changed service's DATABASE_URL from compose config (with @postgres → @localhost rewrite for the host runner). Failure aborts deploy before the new container starts — the old container keeps running with the old schema, which matches. (D) Build-time RAM headroom ======================================== mana-web's Vite build needs 8 GiB of Node heap; Colima's VM is sized at 12 GiB; ~3.5 GiB of other containers run during deploy. The 2026- 04-28 mana-web deploy OOM'd at the Vite step ("cannot allocate memory") and only succeeded on retry once concurrent traffic settled. 
New `scripts/mac-mini/build-memory-headroom.sh`:
  - `start`: stops every container matching `^mana-mon-` (the
    observability stack — VictoriaMetrics, Loki, Glitchtip, cAdvisor,
    umami, blackbox, exporters). Frees ~700 MiB.
  - `stop`: restores them from the snapshot list captured at start.
  - `wrap <cmd>`: pause + run + always-resume via trap.

CD wraps the build loop with start/stop, but only when mana-web is in
the change set — other services build well below 4 GiB and don't need
the headroom. The monitoring stack resumes before the migration step
so cAdvisor + exporters are back online for the deploy-metrics
collection.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/cd-macmini.yml          |  56 ++++++++++
 scripts/mac-mini/build-memory-headroom.sh | 103 ++++++++++++++++++
 scripts/mac-mini/safe-db-push.sh          | 124 ++++++++++++++++++++++
 3 files changed, 283 insertions(+)
 create mode 100755 scripts/mac-mini/build-memory-headroom.sh
 create mode 100755 scripts/mac-mini/safe-db-push.sh

diff --git a/.github/workflows/cd-macmini.yml b/.github/workflows/cd-macmini.yml
index 99c59f910..ead03e74a 100644
--- a/.github/workflows/cd-macmini.yml
+++ b/.github/workflows/cd-macmini.yml
@@ -290,6 +290,24 @@ jobs:
             echo "=== Rebuilding: $SERVICES ==="
           fi
 
+          # mana-web's Vite build needs 8 GiB of Node heap and Colima's
+          # VM is sized at 12 GiB. With ~3.5 GiB of other containers
+          # running, peak RSS occasionally OOMs the build (we hit this
+          # on 2026-04-28). Stop the non-critical monitoring stack for
+          # the duration of the build to free ~700 MiB of headroom.
+          # No-op if mana-web isn't in $SERVICES.
+          PAUSE_MONITORING=false
+          if echo " $SERVICES " | grep -q ' mana-web '; then
+            PAUSE_MONITORING=true
+            # Arm the EXIT trap BEFORE stopping anything: if `start`
+            # dies half-way, the containers it already stopped are
+            # still brought back. `stop` is a no-op when nothing was
+            # recorded as paused.
+            trap './scripts/mac-mini/build-memory-headroom.sh stop' EXIT
+            echo "=== Pausing monitoring stack (mana-web build needs RAM headroom) ==="
+            ./scripts/mac-mini/build-memory-headroom.sh start
+          fi
+
           # Build each service individually to capture build times
           BUILD_TIMES=""
           for svc in $SERVICES; do
@@ -302,6 +320,44 @@ jobs:
             echo " $svc built in ${build_dur}s"
           done
 
+          # Resume monitoring before the migration / start steps run —
+          # they need cAdvisor + exporters back online to record the
+          # deploy metrics step further down.
+          if [ "$PAUSE_MONITORING" = "true" ]; then
+            ./scripts/mac-mini/build-memory-headroom.sh stop
+            trap - EXIT
+          fi
+
+          # Apply Drizzle schema migrations BEFORE we restart the
+          # service containers — additive-only, see
+          # scripts/mac-mini/safe-db-push.sh for the destructive guard.
+          # If a service has no Drizzle config or no schema diff this is
+          # a fast no-op. The env file is sourced with auto-export since
+          # the workflow env doesn't carry POSTGRES_PASSWORD.
+          echo "=== Applying schema migrations ==="
+          set -a
+          # shellcheck source=/dev/null
+          . "$ENV_FILE"
+          set +a
+          # Most services live in mana_platform; mana-sync (Go, no
+          # Drizzle) and a handful of others use mana_sync. Per-service
+          # routing is read straight from compose's DATABASE_URL env.
+          for svc in $SERVICES; do
+            # Pull the literal DATABASE_URL from the compose definition,
+            # then swap host postgres → localhost (we run on the host,
+            # not inside the docker network).
+            db_url=$(docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" config "$svc" 2>/dev/null \
+              | awk '/DATABASE_URL:/ {print $2; exit}' \
+              | sed 's|@postgres:|@localhost:|')
+            # Services without a DATABASE_URL have nothing to migrate.
+            if [ -z "$db_url" ]; then continue; fi
+            DATABASE_URL="$db_url" PROJECT_DIR="${{ env.PROJECT_DIR }}" \
+              ./scripts/mac-mini/safe-db-push.sh "$svc" || {
+              echo "[deploy] safe-db-push failed for $svc — aborting before restart"
+              exit 1
+            }
+          done
+
           # Start all services at once (no rebuild, images already built)
           echo "=== Starting services ==="
           if [ "$DEPLOY_ALL" == "true" ]; then
diff --git a/scripts/mac-mini/build-memory-headroom.sh b/scripts/mac-mini/build-memory-headroom.sh
new file mode 100755
index 000000000..36e5aee81
--- /dev/null
+++ b/scripts/mac-mini/build-memory-headroom.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+#
+# Frees RAM in the Colima VM by stopping the non-critical monitoring
+# stack while a heavy build (mana-web's Vite bundler) runs. In `wrap`
+# mode the stack is restarted by an exit trap, so a failed build still
+# restores observability; after `start` the caller must run `stop`.
+#
+# Why this exists: the unified mana-web Vite build needs 8 GB of Node
+# heap (NODE_OPTIONS="--max-old-space-size=8192" in the Dockerfile).
+# Colima's VM is sized at 12 GB; with ~3.5 GiB of other containers
+# running and BuildKit's own overhead, peak RSS occasionally spills
+# over and the build OOMs with "cannot allocate memory" mid-Vite. The
+# 2026-04-28 mana-web deploy hit this, then succeeded on retry once
+# concurrent traffic settled.
+#
+# What we pause: every container with a `mana-mon-*` name. That's the
+# observability stack (VictoriaMetrics, Loki, Glitchtip, cAdvisor,
+# umami, blackbox, exporters, …) — combined ~700 MiB resident, large
+# enough to give Vite the headroom it needs without touching anything
+# load-bearing. Postgres, Redis, the auth/api/web tier, all stay up.
+# +# Usage: +# scripts/mac-mini/build-memory-headroom.sh start # pause monitoring +# scripts/mac-mini/build-memory-headroom.sh stop # resume monitoring +# +# Or wrap a command: +# scripts/mac-mini/build-memory-headroom.sh wrap docker compose build mana-web +# +# Designed to be idempotent: pausing already-stopped containers is a +# no-op, and the resume step skips containers that aren't part of the +# monitoring stack. + +set -euo pipefail + +PROJECT_DIR="${PROJECT_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}" +COMPOSE_FILE="${COMPOSE_FILE:-docker-compose.macmini.yml}" +ENV_FILE="${ENV_FILE:-.env.macmini}" + +# Pattern for "non-critical" containers — pause these to free RAM. +# Anything outside this pattern is left alone (load-bearing services). +NON_CRITICAL_PATTERN='^mana-mon-' + +list_paused_targets() { + # Currently-running containers matching the pattern. We capture the + # list at pause-time so resume only touches what we actually paused. + docker ps --format '{{.Names}}' | grep -E "$NON_CRITICAL_PATTERN" || true +} + +pause_monitoring() { + cd "$PROJECT_DIR" + local targets + targets=$(list_paused_targets) + if [ -z "$targets" ]; then + echo "[build-memory-headroom] no monitoring containers running — nothing to pause" + return 0 + fi + echo "[build-memory-headroom] pausing for build headroom: $(echo "$targets" | tr '\n' ' ')" + # Persist the list so `stop` knows what to bring back even if the + # environment between calls is fresh (CI step boundary). + mkdir -p /tmp/mana-deploy + echo "$targets" > /tmp/mana-deploy/paused-monitoring.txt + # `docker stop` is graceful (SIGTERM, then SIGKILL after 10 s). + # We don't need data integrity for stateless monitoring; quick + # stop is fine. + # shellcheck disable=SC2086 + docker stop --time=5 $targets >/dev/null +} + +resume_monitoring() { + cd "$PROJECT_DIR" + if [ ! 
-f /tmp/mana-deploy/paused-monitoring.txt ]; then + echo "[build-memory-headroom] no record of paused containers — nothing to resume" + return 0 + fi + local targets + targets=$(cat /tmp/mana-deploy/paused-monitoring.txt) + if [ -z "$targets" ]; then return 0; fi + echo "[build-memory-headroom] resuming: $(echo "$targets" | tr '\n' ' ')" + # `docker start` ignores already-running and missing containers + # silently — best-effort restore. + # shellcheck disable=SC2086 + docker start $targets >/dev/null 2>&1 || true + rm -f /tmp/mana-deploy/paused-monitoring.txt +} + +case "${1:-}" in + start | pause) + pause_monitoring + ;; + stop | resume) + resume_monitoring + ;; + wrap) + shift + pause_monitoring + trap resume_monitoring EXIT INT TERM + "$@" + ;; + *) + echo "Usage: $0 {start|stop|wrap }" >&2 + exit 1 + ;; +esac diff --git a/scripts/mac-mini/safe-db-push.sh b/scripts/mac-mini/safe-db-push.sh new file mode 100755 index 000000000..f143623e1 --- /dev/null +++ b/scripts/mac-mini/safe-db-push.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# +# Safely apply Drizzle schema changes to the prod Postgres for a single +# service. +# +# Why this exists: every Drizzle-using service has a `db:push` script +# but the CD pipeline never ran it, so schema changes drifted silently +# between the typed Drizzle definition and the live database. Today's +# audit found four such drifts (auth.users.kind, credits.sync_subscriptions, +# credits.reservations, event_discovery.*) — all additive, all easily +# applied once detected, but they should never have stayed undetected. +# +# How it works: +# 1. `drizzle-kit generate` produces a SQL diff file under the +# service's `drizzle/` dir without applying it. +# 2. We grep the generated SQL for destructive patterns. If any are +# found, we ABORT and refuse to apply — the operator must review +# and run `drizzle-kit push --force` manually. +# 3. If only additive changes are present, we run `drizzle-kit push +# --force` to apply them. 
Then delete the generated marker file +# so it doesn't pile up in the repo. +# +# Destructive patterns we refuse to auto-apply: +# - DROP TABLE / DROP COLUMN / DROP TYPE / DROP SCHEMA / DROP INDEX +# - ALTER COLUMN ... TYPE (change column type — usually data-loss) +# - RENAME COLUMN / RENAME TABLE (data still there, but breaking +# change for any caller pinned to the old name) +# +# Usage: scripts/mac-mini/safe-db-push.sh +# Env requirements: +# - DATABASE_URL: connection string to apply migrations against +# - PROJECT_DIR : repo root (the deploy workflow sets this) + +set -euo pipefail + +SVC="${1:?usage: $0 }" +PROJECT_DIR="${PROJECT_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}" +SVC_DIR="$PROJECT_DIR/services/$SVC" + +if [ ! -d "$SVC_DIR" ]; then + echo "[safe-db-push] $SVC: no service dir at $SVC_DIR — skipping" + exit 0 +fi +if [ ! -f "$SVC_DIR/drizzle.config.ts" ] && [ ! -f "$SVC_DIR/drizzle.config.js" ]; then + echo "[safe-db-push] $SVC: no drizzle config — skipping" + exit 0 +fi +if ! grep -q '"db:push"' "$SVC_DIR/package.json" 2>/dev/null; then + echo "[safe-db-push] $SVC: no db:push script — skipping" + exit 0 +fi + +if [ -z "${DATABASE_URL:-}" ]; then + echo "[safe-db-push] $SVC: DATABASE_URL not set — skipping" + exit 0 +fi + +cd "$SVC_DIR" + +# Snapshot the existing migration set before we generate. Anything new +# afterwards is the diff this push would apply. +PRE_GEN_FILES=$(find drizzle -maxdepth 2 -name '*.sql' 2>/dev/null | sort || true) + +# Generate-only — does not touch the database. 
+echo "[safe-db-push] $SVC: generating diff…" +GEN_OUT=$(pnpm exec drizzle-kit generate --name "__ci_safety_check_$$" 2>&1 || true) +echo "$GEN_OUT" | tail -20 + +POST_GEN_FILES=$(find drizzle -maxdepth 2 -name '*.sql' 2>/dev/null | sort || true) + +# New SQL files = the diff +NEW_SQL=$(comm -13 <(echo "$PRE_GEN_FILES") <(echo "$POST_GEN_FILES") | grep -v '^$' || true) + +if [ -z "$NEW_SQL" ]; then + echo "[safe-db-push] $SVC: no schema changes — clean." + exit 0 +fi + +echo "[safe-db-push] $SVC: schema diff detected:" +echo "$NEW_SQL" + +# Trap so we always remove the generated probe files, even on failure. +cleanup() { + for f in $NEW_SQL; do + rm -f "$f" + done + # drizzle-kit also writes a meta entry; remove the most recent one. + if [ -f drizzle/meta/_journal.json ]; then + # Best-effort cleanup — strip the entry that references our probe tag. + # If jq isn't available, leave it; the next legitimate `db:push` will + # overwrite anyway. + if command -v jq >/dev/null 2>&1; then + tmp=$(mktemp) + jq '.entries |= map(select(.tag | test("__ci_safety_check") | not))' \ + drizzle/meta/_journal.json > "$tmp" && mv "$tmp" drizzle/meta/_journal.json || true + fi + fi +} +trap cleanup EXIT + +# Refuse to auto-apply destructive changes. The operator must review +# and either fix the schema (if the diff was unintentional) or run +# `drizzle-kit push --force` manually after taking a fresh pg_dump. +DESTRUCTIVE_PATTERN='DROP[[:space:]]+(TABLE|COLUMN|TYPE|SCHEMA|INDEX|VIEW|FUNCTION)|ALTER[[:space:]]+TABLE.*ALTER[[:space:]]+COLUMN.*TYPE|RENAME[[:space:]]+(COLUMN|TABLE|TO)' + +DESTRUCTIVE_HITS="" +for sql in $NEW_SQL; do + hits=$(grep -niE "$DESTRUCTIVE_PATTERN" "$sql" || true) + if [ -n "$hits" ]; then + DESTRUCTIVE_HITS="$DESTRUCTIVE_HITS\n=== $sql ===\n$hits" + fi +done + +if [ -n "$DESTRUCTIVE_HITS" ]; then + echo "[safe-db-push] $SVC: ✗ DESTRUCTIVE changes detected — refusing to auto-apply" + echo " Review the diff and run \`pnpm db:push --force\` manually after backup." 
+ echo -e "$DESTRUCTIVE_HITS" + exit 1 +fi + +# Additive only — safe to apply. +echo "[safe-db-push] $SVC: ✓ additive only, applying…" +pnpm exec drizzle-kit push --force +echo "[safe-db-push] $SVC: ✓ schema is now in sync"