mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-20 07:03:36 +02:00
Per-provider liveness with circuit-breaker semantics. The router (M3)
will read `is_healthy()` to skip dead providers in a chain; the probe
loop and the call-site fallback handler write state via
`mark_healthy` / `mark_unhealthy`.
State machine: 1st failure stays healthy (transient blips happen);
2nd consecutive failure trips the breaker and sets a 60s backoff
window during which `is_healthy → False`. After the window the
provider is half-open again — next call exercises it, success
resets, failure re-arms.
HealthProbe is the background asyncio.Task that pings every
registered provider every 30s with a 3s timeout. Probes run
concurrently per tick and one bad probe can't sink the loop. Probe
functions are injected (`{name: async-fn}`) so this module stays
decoupled from the provider classes — the wiring lives in main.py
where we already know which providers are configured.
32 new tests (FakeClock for deterministic backoff timing, slow-probe
helpers for parallelism + timeout, lifecycle tests for start/stop
idempotency and tick-after-error survival). 64/64 alias+health tests
green.
Not yet wired into the request path — that's M3.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
210 lines
7.5 KiB
Python
210 lines
7.5 KiB
Python
"""Background probe loop that keeps :class:`ProviderHealthCache` fresh.
|
|
|
|
The router's circuit-breaker is reactive — it only learns a provider is
sick after the next live request fails. A reactive-only design means:

* every cold start re-discovers Ollama is down by paying one 75-second
  ``ConnectError``, and
* a provider that quietly recovers stays marked unhealthy until its
  backoff expires and someone tries it.

The probe loop closes both gaps. Every ``interval`` seconds it pings
each registered provider with a small known-cheap request (Ollama:
``GET /api/tags``, OpenAI-compat: ``GET /v1/models``) and updates the
cache. Probes run concurrently per tick and respect a hard
``probe_timeout`` so a hanging provider can't stall the loop.

Probe functions are injected from outside (``probes={name: async-fn}``)
so this module stays decoupled from the provider classes — wiring lives
in ``main.py`` where we already know which providers are configured.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
from typing import Awaitable, Callable
|
|
|
|
from .health import ProviderHealthCache
|
|
|
|
logger = logging.getLogger(__name__)

#: Signature of an injected probe: a zero-argument async callable that
#: resolves to ``True`` when the provider is healthy. A ``False`` result
#: and a raised exception are both treated as a failed probe (the loop
#: calls ``mark_unhealthy`` in either case); for an exception, its type
#: name and message are passed along as the failure reason.
ProbeFn = Callable[[], Awaitable[bool]]

#: Default number of seconds between two probe ticks.
DEFAULT_PROBE_INTERVAL_SEC = 30.0
#: Default per-probe time budget, in seconds.
DEFAULT_PROBE_TIMEOUT_SEC = 3.0
|
|
|
|
|
|
class HealthProbe:
    """Periodically probes every registered provider and updates the cache.

    Lifecycle::

        probe = HealthProbe(cache, {"ollama": probe_ollama, ...})
        await probe.start()    # spawns the background task
        ...
        await probe.stop()     # cancels and awaits cleanup

    Tests typically call :meth:`tick_once` directly to exercise one cycle
    without driving the asyncio scheduler through real ``asyncio.sleep``.
    """

    def __init__(
        self,
        cache: ProviderHealthCache,
        probes: dict[str, ProbeFn],
        *,
        interval: float = DEFAULT_PROBE_INTERVAL_SEC,
        timeout: float = DEFAULT_PROBE_TIMEOUT_SEC,
    ) -> None:
        """Configure the probe loop without starting it.

        Args:
            cache: Health cache that receives ``mark_healthy`` /
                ``mark_unhealthy`` calls for each probed provider.
            probes: Mapping of provider name to its async probe function.
                The mapping is copied, so later changes made by the caller
                are not seen by this instance.
            interval: Seconds to wait between ticks of the background loop.
            timeout: Maximum seconds a single probe may take before it is
                counted as a failure.

        Raises:
            ValueError: If ``interval`` or ``timeout`` is not positive.
        """
        if interval <= 0:
            raise ValueError("interval must be > 0")
        if timeout <= 0:
            raise ValueError("timeout must be > 0")
        self._cache = cache
        self._probes = dict(probes)
        self._interval = interval
        self._timeout = timeout
        self._task: asyncio.Task | None = None
        self._stop = asyncio.Event()

    @property
    def interval(self) -> float:
        """Seconds between two ticks of the background loop."""
        return self._interval

    @property
    def timeout(self) -> float:
        """Per-probe time budget in seconds."""
        return self._timeout

    @property
    def provider_ids(self) -> list[str]:
        """Names of all registered providers, in registration order."""
        return list(self._probes)

    @property
    def running(self) -> bool:
        """``True`` while the background task exists and has not finished."""
        return self._task is not None and not self._task.done()

    # ------------------------------------------------------------------
    # Per-tick logic — exercised directly by tests
    # ------------------------------------------------------------------

    async def tick_once(self) -> None:
        """Probe every provider once, in parallel, updating the cache.

        Errors in any one probe (including ``asyncio.TimeoutError``) are
        captured per-provider — one bad probe never sinks the loop.
        """
        if not self._probes:
            return
        # Snapshot once so the gather() inputs and the name/result pairing
        # below are guaranteed to refer to the same providers.
        entries = list(self._probes.items())
        results = await asyncio.gather(
            *(self._probe_one(name, fn) for name, fn in entries),
            return_exceptions=True,
        )
        # _probe_one handles its own failures, so an exception object in
        # `results` means something escaped its try/except (for example the
        # cache call itself raised). gather() returns results in input
        # order, so the zip maps each result back to its provider name.
        for (name, _fn), result in zip(entries, results):
            if isinstance(result, BaseException):
                # exc_info keeps the traceback of the leaked exception.
                logger.error(
                    "probe %s leaked exception: %s", name, result, exc_info=result
                )

    async def _probe_one(self, name: str, fn: ProbeFn) -> None:
        """Run one probe under the timeout and record the outcome.

        A timeout, a raised exception, and a falsy return value are each
        reported via ``mark_unhealthy`` with a distinct reason string; a
        truthy return value is reported via ``mark_healthy``. Cancellation
        is re-raised untouched so shutdown is never recorded as a failure.
        """
        try:
            healthy = await asyncio.wait_for(fn(), timeout=self._timeout)
        except asyncio.TimeoutError:
            # `g` renders 3.0 as "3" but keeps fractional budgets such as
            # 0.5 intact instead of rounding them to "0".
            self._cache.mark_unhealthy(name, f"probe-timeout (>{self._timeout:g}s)")
            return
        except asyncio.CancelledError:
            raise
        except Exception as e:  # noqa: BLE001 — probe SHOULD be permissive
            self._cache.mark_unhealthy(name, f"probe-exception: {type(e).__name__}: {e}")
            return
        if healthy:
            self._cache.mark_healthy(name)
        else:
            self._cache.mark_unhealthy(name, "probe-returned-false")

    # ------------------------------------------------------------------
    # Long-running task management
    # ------------------------------------------------------------------

    async def start(self) -> None:
        """Spawn the periodic probe task. Idempotent."""
        if self.running:
            return
        self._stop.clear()
        self._task = asyncio.create_task(self._run_forever(), name="mana-llm-health-probe")
        logger.info(
            "HealthProbe started (interval=%gs, timeout=%gs, providers=%s)",
            self._interval,
            self._timeout,
            ", ".join(self._probes) or "<none>",
        )

    async def stop(self) -> None:
        """Cancel the background task and wait for it to finish.

        Does nothing when no task is running.
        """
        task = self._task
        if task is None or task.done():
            return
        self._stop.set()
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            # Expected: this is the cancellation requested just above.
            # NOTE(review): this also swallows a cancellation aimed at the
            # caller of stop() while it waits here — confirm that is fine.
            pass
        finally:
            self._task = None
            logger.info("HealthProbe stopped")

    async def _run_forever(self) -> None:
        """Body of the background task: one tick now, then one per interval.

        Exits when the stop event is set. Exceptions raised by a tick are
        logged with their traceback and do not end the loop.
        """
        # Probe immediately at boot so we don't serve traffic for `interval`
        # seconds based on optimistic-default assumptions.
        try:
            await self.tick_once()
        except Exception as e:  # noqa: BLE001
            logger.exception("HealthProbe initial tick failed: %s", e)
        while not self._stop.is_set():
            # Sleep for `interval`, but wake up early if stop is signalled.
            try:
                await asyncio.wait_for(self._stop.wait(), timeout=self._interval)
            except asyncio.TimeoutError:
                pass
            else:
                # _stop.wait() succeeded → stop signalled, exit.
                return
            try:
                await self.tick_once()
            except Exception as e:  # noqa: BLE001
                logger.exception("HealthProbe tick failed: %s", e)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Probe-function helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def make_http_probe(
    url: str,
    *,
    headers: dict[str, str] | None = None,
    expected_status_lt: int = 500,
    timeout: float = 5.0,
) -> ProbeFn:
    """Return a probe function that does ``GET <url>`` and considers the
    provider healthy iff the response status is below
    ``expected_status_lt`` (default: any non-5xx counts).

    A 401/403/404 still counts as healthy because the *server* answered —
    auth or path mistakes are misconfiguration, not provider liveness.

    Args:
        url: Endpoint to request on every probe.
        headers: Optional request headers; an empty or missing mapping
            sends no extra headers.
        expected_status_lt: Exclusive upper bound on status codes that
            count as healthy.
        timeout: Timeout in seconds handed to the HTTP client. It used to
            be fixed at 5.0, which silently capped any larger probe budget
            configured on the caller's side; the default keeps that value.

    Returns:
        An async function taking no arguments that resolves to ``True``
        when the status check passes. Errors raised by the HTTP client
        are not caught here and propagate to the caller of the probe.

    Raises:
        ValueError: If ``timeout`` is not positive.
    """
    if timeout <= 0:
        raise ValueError("timeout must be > 0")

    # Imported lazily so the module can be imported without httpx unless
    # an HTTP probe is actually built.
    import httpx

    async def probe() -> bool:
        # A fresh client per probe: nothing is kept open between ticks.
        async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
            resp = await client.get(url, headers=headers or None)
            return resp.status_code < expected_status_lt

    return probe
|