From 77b2d1eb32dfab918a6724c17e28cdfb535198c9 Mon Sep 17 00:00:00 2001 From: Till JS Date: Thu, 9 Apr 2026 17:52:40 +0200 Subject: [PATCH] =?UTF-8?q?chore(infra):=20smarter=20tunnel=20rebuild=20?= =?UTF-8?q?=E2=80=94=20apex=20via=20API=20+=20sane=20probes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two improvements to scripts/mac-mini/rebuild-tunnel.sh based on what the first prod run actually surfaced. ═══ 1. Apex domain auto-fix via Cloudflare API ═══ `cloudflared tunnel route dns` cannot route the apex of a zone (error code 1003: "An A, AAAA, or CNAME record with that host already exists"). The CLI has no command to delete those records. The first rebuild left mana.how returning 530 because the script silently failed to route it and we had to fix the apex manually in the dashboard. The new `apex_route_via_api()` helper: - Detects apex hostnames by dot count (one dot → two-label name) - Uses $CLOUDFLARE_API_TOKEN if available - Resolves the zone id by name - Deletes any existing A / AAAA / CNAME records on the apex - Creates a fresh proxied CNAME pointing at $NEW_TUNNEL_ID.cfargotunnel.com - Cloudflare's CNAME flattening at the apex makes this work transparently If $CLOUDFLARE_API_TOKEN is not set, the script logs a warning at the top of step 6 and falls back to the old behavior (route fails, user fixes the apex manually). The token needs Zone:DNS:Edit on the target zone. ═══ 2. Smarter HTTP verification ═══ The first run reported "5 hosts down (404/000)" but those were all backend services without a root handler — credits/media/llm/mana-api all return 404 at `/` and 200 at `/health`. The verify pass was flagging healthy services as down and made the rebuild look more broken than it was. New `probe_host()` tries `/health` first, falls back to `/` only if /health returned 4xx, and prefers a 2xx/3xx root response over a 4xx /health. 
`probe_is_down()` only counts 5xx and 000 (libcurl error) as failures — anything in 1xx-4xx means the request reached the origin and the tunnel routing is correct, which is the actual thing the verify pass cares about. `probe_label()` adds a one-word health summary so the verify log reads "200 ok" / "401 auth required" / "404 routed (no handler)" / "530 tunnel error" instead of just bare status codes. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/mac-mini/rebuild-tunnel.sh | 139 ++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 4 deletions(-) diff --git a/scripts/mac-mini/rebuild-tunnel.sh b/scripts/mac-mini/rebuild-tunnel.sh index 78883de40..cd9f16a1c 100755 --- a/scripts/mac-mini/rebuild-tunnel.sh +++ b/scripts/mac-mini/rebuild-tunnel.sh @@ -68,6 +68,130 @@ log() { echo -e "${GREEN}[$(date +%H:%M:%S)]${NC} $*"; } warn() { echo -e "${YELLOW}[$(date +%H:%M:%S)] WARN${NC} $*"; } err() { echo -e "${RED}[$(date +%H:%M:%S)] ERROR${NC} $*"; } +# ─── HTTP probe helpers ──────────────────────────────────── +# +# A naive `curl /` check produces tons of false positives because +# many backend services don't have a root handler — mana-credits, +# mana-llm, media etc. all return 404 at `/` but are perfectly +# healthy at `/health`. The grafana installation has `/api/health` +# instead of `/health`. Forgejo (git) has neither and serves the +# repo browser at root. +# +# `probe_host` tries `/health` first — that's the convention for +# every Mana service that has one — and falls back to `/` if /health +# returns a 4xx. The combined "best" status is returned. +# +# `probe_is_down` decides what counts as a real failure: only 5xx +# Cloudflare/origin errors and the libcurl 000 (couldn't connect / +# DNS error / timeout) count as down. Anything in the 1xx-4xx range +# means the request reached the origin and got a structured reply, +# which is what we care about for "the tunnel is routing correctly". 
#
# `probe_label` formats a short health summary for the verify log
# so the output reads "200 ok" / "401 auth" / "404 routed" / etc.

# Fetch the HTTP status code of a single URL.
# Arguments: $1 - full URL to request
# Outputs:   exactly three digits on stdout; "000" when curl exits
#            non-zero or prints anything that is not a 3-digit status
_probe_status() {
  local url=$1
  local status
  # curl writes its -w output even when the transfer fails (it prints
  # "000"), so a fallback appended with `|| echo "000"` inside the
  # substitution would yield "000000". Assign the fallback instead.
  status=$(curl -s -o /dev/null -m 6 -w '%{http_code}' "$url" 2>/dev/null) \
    || status="000"
  case "$status" in
    [0-9][0-9][0-9]) ;;
    *) status="000" ;;
  esac
  printf '%s\n' "$status"
}

# Probe a hostname over HTTPS and print the most meaningful status.
# Arguments: $1 - hostname (no scheme, no path)
# Outputs:   three-digit status on stdout. `/health` is tried first;
#            when it answers 4xx the root URL is tried as well and its
#            status wins only if it is a 2xx/3xx.
probe_host() {
  local host=$1
  local code code_root
  code=$(_probe_status "https://$host/health")
  case "$code" in
    4??)
      # /health returned 4xx — probably no /health handler. Fall back to /
      code_root=$(_probe_status "https://$host")
      # Prefer the root probe if it returned a 2xx/3xx (real success)
      case "$code_root" in
        2??|3??) code=$code_root ;;
      esac
      ;;
  esac
  printf '%s\n' "$code"
}

# Decide whether a status code means "the host is down".
# Arguments: $1 - status code as printed by probe_host
# Returns:   1 (not down) only for a well-formed 1xx-4xx status, i.e.
#            the request reached the origin; 0 (down) for 000, 5xx and
#            for any empty or malformed value, so a garbled probe
#            result can never be mistaken for a healthy host.
probe_is_down() {
  local code=$1
  case "$code" in
    [1-4][0-9][0-9]) return 1 ;;  # reached the origin
    *) return 0 ;;                # 000 = curl error, 5xx = server/tunnel error
  esac
}

# Print a short human-readable label for a status code.
# Arguments: $1 - status code as printed by probe_host
# Outputs:   label on stdout; "?" for anything unrecognised
probe_label() {
  local code=$1
  case "$code" in
    2??) echo "ok" ;;
    301|302|307|308) echo "redirect (auth gate)" ;;
    3??) echo "redirect" ;;
    401) echo "auth required" ;;
    403) echo "forbidden" ;;
    404) echo "routed (no handler)" ;;
    4*) echo "client error" ;;
    502) echo "bad gateway (origin down)" ;;
    503) echo "unavailable" ;;
    530) echo "tunnel error" ;;
    5*) echo "server error" ;;
    000) echo "unreachable" ;;
    *) echo "?" ;;
  esac
}

# ─── Apex DNS via Cloudflare API ───────────────────────────
#
# `cloudflared tunnel route dns` cannot route the apex of a zone
# (e.g. `mana.how`) because Cloudflare requires the apex to be a
# CNAME and refuses to create one when A/AAAA records already exist
# (error code 1003). The CLI has no command to delete those records.
#
# Workaround: if $CLOUDFLARE_API_TOKEN is set, this function uses the
# Cloudflare REST API to:
#   1. Resolve the zone id by name
#   2. Find any existing A / AAAA / CNAME records for the hostname
#   3. Delete them
#   4.
#      Create a fresh proxied CNAME pointing at the tunnel's
#      `$tunnel_id.cfargotunnel.com` target
#
# The token needs `Zone:DNS:Edit` permission for the target zone.
# Cloudflare's CNAME flattening at the apex makes this work
# transparently — the apex resolves to Cloudflare anycast IPs as
# usual, but is internally a CNAME that follows the tunnel.
#
# Globals:   CLOUDFLARE_API_TOKEN (read)
# Arguments: $1 - hostname to route (only two-label names are handled)
#            $2 - id of the tunnel the CNAME should point at
# Returns:   0 on success, 1 if no token / non-apex / empty tunnel id /
#            API error.
#
# NOTE: existing records are deleted before the replacement is
# created. If the create call fails after a successful delete, the
# apex is left without a record until it is fixed manually.
apex_route_via_api() {
  local hostname=$1
  local tunnel_id=${2:-}

  # Apex check: exactly one dot in the hostname (e.g. mana.how, not
  # chat.mana.how). cloudflared handles non-apex hostnames fine via
  # `tunnel route dns` so we never need to call the API for those.
  # NOTE(review): a zone under a multi-label suffix (example.co.uk)
  # has two dots and is therefore not treated as an apex here.
  local dots=${hostname//[^.]/}
  [ "${#dots}" -eq 1 ] || return 1

  [ -n "${CLOUDFLARE_API_TOKEN:-}" ] || return 1

  # Refuse to touch DNS without a tunnel id: the delete step below is
  # destructive and the replacement target would be invalid.
  [ -n "$tunnel_id" ] || { warn " apex API: empty tunnel id for $hostname"; return 1; }

  local api="https://api.cloudflare.com/client/v4"
  local auth="Authorization: Bearer $CLOUDFLARE_API_TOKEN"

  local zone_id
  zone_id=$(curl -sf -H "$auth" "$api/zones?name=$hostname" 2>/dev/null \
    | jq -r '.result[0].id // empty')
  [ -n "$zone_id" ] || { warn " apex API: zone not found for $hostname"; return 1; }

  # Delete any existing A / AAAA / CNAME records on the apex. A failed
  # listing is an error, not "no records": carrying on would only make
  # the create call below fail against the records that are still there.
  local records_json existing_ids rid
  records_json=$(curl -sf -H "$auth" "$api/zones/$zone_id/dns_records?name=$hostname" 2>/dev/null) \
    || { warn " apex API: listing records for $hostname failed"; return 1; }
  existing_ids=$(printf '%s' "$records_json" \
    | jq -r '.result[] | select(.type == "A" or .type == "AAAA" or .type == "CNAME") | .id') \
    || { warn " apex API: listing records for $hostname failed"; return 1; }

  for rid in $existing_ids; do
    curl -sf -X DELETE -H "$auth" "$api/zones/$zone_id/dns_records/$rid" >/dev/null 2>&1 \
      || { warn " apex API: delete of record $rid failed"; return 1; }
  done

  # Create a fresh proxied CNAME at the apex pointing at the tunnel.
  # The payload is built with jq so the values are always valid JSON.
  local target="$tunnel_id.cfargotunnel.com"
  local payload resp
  payload=$(jq -cn --arg name "$hostname" --arg content "$target" \
    '{type: "CNAME", name: $name, content: $content, proxied: true, ttl: 1}')
  resp=$(curl -sf -X POST -H "$auth" -H "Content-Type: application/json" \
    "$api/zones/$zone_id/dns_records" \
    -d "$payload" 2>/dev/null) || resp=""
  # Trust the API's own verdict rather than "the body was non-empty".
  if ! printf '%s' "$resp" | jq -e '.success == true' >/dev/null 2>&1; then
    warn " apex API: create CNAME failed"
    return 1
  fi
  return 0
}

# ─── Pre-flight checks ───────────────────────────────────── [ -x "$CLOUDFLARED" ] || { err "$CLOUDFLARED not found or not executable"; exit 1; } @@ -115,7 +239,7 @@ BASELINE_FILE="$BACKUP_DIR/baseline-http-statuses.txt" : > "$BASELINE_FILE" for host in $HOSTNAMES; do [ "$host" = "ssh.mana.how" ] && continue # SSH-only, no HTTP - code=$(curl -s -o /dev/null -m 5 -w "%{http_code}" "https://$host" 2>/dev/null || echo "000") + code=$(probe_host "$host") printf "%-35s %s\n" "$host" "$code" | tee -a "$BASELINE_FILE" done @@ -166,11 +290,17 @@ log " ✓ ingress validate passed" # ─── Step 6: DNS routes ──────────────────────────────────── log "Step 6/8: Routing $HOSTNAME_COUNT hostnames at the new tunnel..." +if [ -z "${CLOUDFLARE_API_TOKEN:-}" ]; then + warn " CLOUDFLARE_API_TOKEN not set — apex domains (e.g. mana.how) cannot be auto-fixed" + warn " Set it before running the script for fully unattended apex routing." 
+fi ROUTE_FAILS=0 for host in $HOSTNAMES; do [ "$host" = "ssh.mana.how" ] && continue # cloudflared tunnel route dns is for HTTP/TCP, ssh is special if $CLOUDFLARED tunnel route dns -f "$TUNNEL_NAME" "$host" 2>/dev/null; then printf " ✓ %s\n" "$host" + elif apex_route_via_api "$host" "$NEW_TUNNEL_ID"; then + printf " ✓ %s (via Cloudflare API — apex)\n" "$host" else printf " ✗ %s (route failed)\n" "$host" ROUTE_FAILS=$((ROUTE_FAILS + 1)) @@ -204,9 +334,10 @@ VERIFY_FILE="$BACKUP_DIR/post-rebuild-http-statuses.txt" DOWN_COUNT=0 for host in $HOSTNAMES; do [ "$host" = "ssh.mana.how" ] && continue - code=$(curl -s -o /dev/null -m 8 -w "%{http_code}" "https://$host" 2>/dev/null || echo "000") - printf "%-35s %s\n" "$host" "$code" | tee -a "$VERIFY_FILE" - if [ "$code" = "000" ] || [ "$code" = "404" ]; then + code=$(probe_host "$host") + label=$(probe_label "$code") + printf "%-35s %s %s\n" "$host" "$code" "$label" | tee -a "$VERIFY_FILE" + if probe_is_down "$code"; then DOWN_COUNT=$((DOWN_COUNT + 1)) fi done