mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 19:41:09 +02:00
fix(geocoding): bump PROVIDER_TIMEOUT_MS to 20s for cold cross-LAN
Cold-start fetches from the mana-geocoding container to photon-self on mana-gpu (over WSL2 mirrored networking) consistently take >10s on the first probe and ~2s once warm. The previous 8s default caused the chain to false-mark photon-self unhealthy on every cold path, leaking to public photon for the next 30s health-cache window — and pinning the public-photon answer in the 7d cache (now shortened to 1h). Also wires the docker-compose macmini env to honor PROVIDER_TIMEOUT_MS and CACHE_PUBLIC_TTL_MS overrides so production picks up the new values without a code rebuild. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
962606b961
commit
8a5fad34df
3 changed files with 18 additions and 6 deletions
|
|
@@ -510,6 +510,14 @@ services:
|
|||
# as `photon-self` provider with privacy: 'local' — eligible for
|
||||
# sensitive queries. Empty value = slot disabled.
|
||||
PHOTON_SELF_API_URL: ${PHOTON_SELF_API_URL:-}
|
||||
# Cold-start cross-LAN fetches to photon-self consistently take
|
||||
# >10s on the first probe; the 8s default false-marked it unhealthy
|
||||
# on every cold path. 20s leaves headroom while still cutting off
|
||||
# actually-stuck connections.
|
||||
PROVIDER_TIMEOUT_MS: ${PROVIDER_TIMEOUT_MS:-20000}
|
||||
# Short public-API cache TTL so a transient photon-self blip can't
|
||||
# pin stale public-fallback answers in the LRU for days.
|
||||
CACHE_PUBLIC_TTL_MS: ${CACHE_PUBLIC_TTL_MS:-3600000}
|
||||
CORS_ORIGINS: https://mana.how,http://localhost:5173
|
||||
CACHE_MAX_ENTRIES: "5000"
|
||||
CACHE_TTL_MS: "86400000"
|
||||
|
|
|
|||
|
|
@@ -140,7 +140,10 @@ PORT=3018
|
|||
# Default order: photon-self,photon,nominatim
|
||||
# `photon-self` is silently dropped if PHOTON_SELF_API_URL is unset.
|
||||
GEOCODING_PROVIDERS=photon-self,photon,nominatim
|
||||
PROVIDER_TIMEOUT_MS=8000 # per-provider request timeout (cold-start safe)
|
||||
PROVIDER_TIMEOUT_MS=20000 # per-provider request timeout. Cold-start
|
||||
# cross-LAN fetches to photon-self take
|
||||
# >10s on the first probe; tighter values
|
||||
# false-mark it unhealthy on every cold path.
|
||||
PROVIDER_HEALTH_CACHE_MS=30000 # health-cache TTL — skip dead providers
|
||||
|
||||
# --- Self-hosted Photon (privacy: 'local', PRIMARY since 2026-04-28) ---
|
||||
|
|
|
|||
|
|
@@ -93,11 +93,12 @@ export function loadConfig(): Config {
|
|||
'nominatim',
|
||||
]),
|
||||
healthCacheMs: parseInt(process.env.PROVIDER_HEALTH_CACHE_MS || '30000', 10),
|
||||
// 8 s default. Nominatim's cold-start DNS+TLS handshake can push the
|
||||
// first health probe past the older 5 s default, false-marking the
|
||||
// provider unhealthy for the next 30 s. 8 s survives a slow first
|
||||
// probe but still cuts off actually-stuck connections.
|
||||
timeoutMs: parseInt(process.env.PROVIDER_TIMEOUT_MS || '8000', 10),
|
||||
// 20 s default. Cold-start cross-LAN fetches to photon-self
|
||||
// (mana-gpu over WSL2 mirrored networking) consistently take
|
||||
// >10 s on the first probe and ~2 s once warm. Tighter timeouts
|
||||
// false-marked photon-self unhealthy on every cold path, leaking
|
||||
// to public photon for the duration of the 30 s health cache.
|
||||
timeoutMs: parseInt(process.env.PROVIDER_TIMEOUT_MS || '20000', 10),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue