managarten/services/mana-llm/src/health_probe.py
Till JS 59557e62d7 feat(mana-llm): M2 — ProviderHealthCache + background probe loop
Per-provider liveness with circuit-breaker semantics. The router (M3)
will read `is_healthy()` to skip dead providers in a chain; the probe
loop and the call-site fallback handler write state via
`mark_healthy` / `mark_unhealthy`.

State machine: 1st failure stays healthy (transient blips happen);
2nd consecutive failure trips the breaker and sets a 60s backoff
window during which `is_healthy → False`. After the window the
provider is half-open again — next call exercises it, success
resets, failure re-arms.

HealthProbe is the background asyncio.Task that pings every
registered provider every 30s with a 3s timeout. Probes run
concurrently per tick and one bad probe can't sink the loop. Probe
functions are injected (`{name: async-fn}`) so this module stays
decoupled from the provider classes — the wiring lives in main.py
where we already know which providers are configured.

32 new tests (FakeClock for deterministic backoff timing, slow-probe
helpers for parallelism + timeout, lifecycle tests for start/stop
idempotency and tick-after-error survival). 64/64 alias+health tests
green.

Not yet wired into the request path — that's M3.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 20:29:57 +02:00

210 lines
7.5 KiB
Python

"""Background probe loop that keeps :class:`ProviderHealthCache` fresh.
The router's circuit-breaker is reactive — it only learns a provider is
sick after the next live request fails. A reactive-only design means:
* every cold start re-discovers Ollama is down by paying one 75-second
``ConnectError``, and
* a provider that quietly recovers stays marked unhealthy until its
backoff expires and someone tries it.
The probe loop closes both gaps. Every ``interval`` seconds it pings
each registered provider with a small known-cheap request (Ollama:
``GET /api/tags``, OpenAI-compat: ``GET /v1/models``) and updates the
cache. Probes run concurrently per tick and respect a hard
``probe_timeout`` so a hanging provider can't stall the loop.
Probe functions are injected from outside (``probes={name: async-fn}``)
so this module stays decoupled from the provider classes — wiring lives
in ``main.py`` where we already know which providers are configured.
"""
from __future__ import annotations
import asyncio
import logging
from typing import Awaitable, Callable
from .health import ProviderHealthCache
logger = logging.getLogger(__name__)
#: Default pause between two probe ticks, in seconds.
DEFAULT_PROBE_INTERVAL_SEC: float = 30.0
#: Default deadline applied to each individual probe call, in seconds.
DEFAULT_PROBE_TIMEOUT_SEC: float = 3.0

#: Shape of a probe: a zero-argument coroutine function whose awaited
#: result is ``True`` when the provider is healthy. A ``False`` result and
#: a raised exception are both treated as a failure (the loop calls
#: ``mark_unhealthy`` either way); for an exception, its string form is
#: included in the error text handed to the cache.
ProbeFn = Callable[[], Awaitable[bool]]
class HealthProbe:
    """Periodically probes every registered provider and updates the cache.

    Lifecycle::

        probe = HealthProbe(cache, {"ollama": probe_ollama, ...})
        await probe.start()  # spawns the background task
        ...
        await probe.stop()  # cancels and awaits cleanup

    Tests typically call :meth:`tick_once` directly to exercise one cycle
    without driving the asyncio scheduler through real ``asyncio.sleep``.
    """

    def __init__(
        self,
        cache: ProviderHealthCache,
        probes: dict[str, ProbeFn],
        *,
        interval: float = DEFAULT_PROBE_INTERVAL_SEC,
        timeout: float = DEFAULT_PROBE_TIMEOUT_SEC,
    ) -> None:
        """Validate the timing parameters and store the probe table.

        Args:
            cache: Receives the ``mark_healthy`` / ``mark_unhealthy`` calls,
                keyed by provider name.
            probes: Provider name → probe coroutine function. A shallow copy
                is stored, so later changes to the caller's dict are not
                picked up.
            interval: Seconds to wait between ticks. Must be > 0.
            timeout: Deadline for each individual probe, in seconds.
                Must be > 0.

        Raises:
            ValueError: If ``interval`` or ``timeout`` is not positive.
        """
        if interval <= 0:
            raise ValueError("interval must be > 0")
        if timeout <= 0:
            raise ValueError("timeout must be > 0")
        self._cache = cache
        self._probes = dict(probes)
        self._interval = interval
        self._timeout = timeout
        # Handle of the background task; None until start() is called and
        # again after stop() has finished.
        self._task: asyncio.Task | None = None
        # Set by stop() so the loop in _run_forever() can exit between ticks.
        self._stop = asyncio.Event()

    @property
    def interval(self) -> float:
        """Seconds between two ticks of the background loop."""
        return self._interval

    @property
    def timeout(self) -> float:
        """Per-probe deadline in seconds."""
        return self._timeout

    @property
    def provider_ids(self) -> list[str]:
        """Names of the registered providers, as a fresh list."""
        return list(self._probes)

    @property
    def running(self) -> bool:
        """``True`` while a background task exists and has not finished."""
        return self._task is not None and not self._task.done()

    # ------------------------------------------------------------------
    # Per-tick logic — exercised directly by tests
    # ------------------------------------------------------------------
    async def tick_once(self) -> None:
        """Probe every provider once, in parallel, updating the cache.

        Errors in any one probe (including ``asyncio.TimeoutError``) are
        captured per-provider — one bad probe never sinks the loop.
        """
        if not self._probes:
            return
        results = await asyncio.gather(
            *(self._probe_one(name, fn) for name, fn in self._probes.items()),
            return_exceptions=True,
        )
        # _probe_one handles probe failures itself, so an exception showing
        # up here means something outside its try/except raised. This branch
        # should never fire; it is a guard in case _probe_one ever grows a
        # code path that bypasses its handlers.
        # asyncio.gather() returns results in input order and same length,
        # so the zip is a 1:1 mapping back to provider names.
        for name, result in zip(self._probes, results):
            if isinstance(result, BaseException):
                # Pass the exception as exc_info so the traceback is logged,
                # not just its string form.
                logger.error(
                    "probe %s leaked exception: %s", name, result, exc_info=result
                )

    async def _probe_one(self, name: str, fn: ProbeFn) -> None:
        """Run one probe under the deadline and record the outcome.

        A timeout, a raised exception, or a falsy return value all end in
        ``mark_unhealthy`` with a reason string; a truthy return value ends
        in ``mark_healthy``. Cancellation is re-raised untouched.
        """
        try:
            healthy = await asyncio.wait_for(fn(), timeout=self._timeout)
        except asyncio.TimeoutError:
            # `g` keeps whole seconds as "3" but does not collapse a
            # sub-second deadline such as 0.5 to a misleading "0".
            self._cache.mark_unhealthy(name, f"probe-timeout (>{self._timeout:g}s)")
            return
        except asyncio.CancelledError:
            raise
        except Exception as e:  # noqa: BLE001 — probe SHOULD be permissive
            self._cache.mark_unhealthy(name, f"probe-exception: {type(e).__name__}: {e}")
            return
        if healthy:
            self._cache.mark_healthy(name)
        else:
            self._cache.mark_unhealthy(name, "probe-returned-false")

    # ------------------------------------------------------------------
    # Long-running task management
    # ------------------------------------------------------------------
    async def start(self) -> None:
        """Spawn the periodic probe task. Idempotent."""
        if self.running:
            return
        # Clear a stop flag left over from a previous stop() so the new
        # task's loop does not exit right away.
        self._stop.clear()
        self._task = asyncio.create_task(self._run_forever(), name="mana-llm-health-probe")
        logger.info(
            "HealthProbe started (interval=%gs, timeout=%gs, providers=%s)",
            self._interval,
            self._timeout,
            ", ".join(self._probes) or "<none>",
        )

    async def stop(self) -> None:
        """Cancel the background task and wait for it to finish.

        Does nothing when no task is running.
        """
        if not self.running:
            return
        self._stop.set()
        assert self._task is not None
        self._task.cancel()
        try:
            await self._task
        except asyncio.CancelledError:
            # Expected: the task was cancelled just above.
            # NOTE(review): this handler also swallows a cancellation aimed
            # at the caller of stop() while it waits here — confirm that is
            # acceptable for the shutdown path.
            pass
        finally:
            self._task = None
        logger.info("HealthProbe stopped")

    async def _run_forever(self) -> None:
        """Body of the background task: tick now, then once per interval."""
        # Probe immediately at boot so we don't serve traffic for `interval`
        # seconds based on optimistic-default assumptions.
        try:
            await self.tick_once()
        except Exception as e:  # noqa: BLE001
            logger.exception("HealthProbe initial tick failed: %s", e)
        while not self._stop.is_set():
            # Sleep for `interval`, waking early if stop() sets the event.
            try:
                await asyncio.wait_for(self._stop.wait(), timeout=self._interval)
            except asyncio.TimeoutError:
                pass
            else:
                # _stop.wait() succeeded → stop signalled, exit.
                return
            try:
                await self.tick_once()
            except Exception as e:  # noqa: BLE001
                # Keep the loop alive; log with traceback for diagnosis.
                logger.exception("HealthProbe tick failed: %s", e)
# ---------------------------------------------------------------------------
# Probe-function helpers
# ---------------------------------------------------------------------------
def make_http_probe(
    url: str,
    *,
    headers: dict[str, str] | None = None,
    expected_status_lt: int = 500,
    timeout: float = 5.0,
) -> ProbeFn:
    """Return a probe function that does ``GET <url>`` and considers the
    provider healthy iff the response status is below
    ``expected_status_lt`` (default: any non-5xx counts).

    A 401/403/404 still counts as healthy because the *server* answered —
    auth or path mistakes are misconfiguration, not provider liveness.

    Args:
        url: Address to request.
        headers: Optional request headers; ``None`` or an empty dict sends
            no extra headers.
        expected_status_lt: Exclusive upper bound on the status codes that
            count as healthy.
        timeout: Seconds handed to ``httpx.Timeout`` for the request. The
            default matches the previously hard-coded 5.0; pass a different
            value to line it up with the deadline the caller enforces
            around the probe.

    Returns:
        A zero-argument coroutine function. It opens a fresh
        ``httpx.AsyncClient`` on every call and lets httpx errors propagate
        to the caller.
    """
    # Imported here rather than at module level, so httpx is only required
    # once a probe is actually built.
    import httpx

    async def probe() -> bool:
        async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
            resp = await client.get(url, headers=headers or None)
        return resp.status_code < expected_status_lt

    return probe