managarten/services/mana-llm/tests/test_health_probe.py
Till JS 59557e62d7 feat(mana-llm): M2 — ProviderHealthCache + background probe loop
Per-provider liveness with circuit-breaker semantics. The router (M3)
will read `is_healthy()` to skip dead providers in a chain; the probe
loop and the call-site fallback handler write state via
`mark_healthy` / `mark_unhealthy`.

State machine: 1st failure stays healthy (transient blips happen);
2nd consecutive failure trips the breaker and sets a 60s backoff
window during which `is_healthy → False`. After the window the
provider is half-open again — next call exercises it, success
resets, failure re-arms.

HealthProbe is the background asyncio.Task that pings every
registered provider every 30s with a 3s timeout. Probes run
concurrently per tick and one bad probe can't sink the loop. Probe
functions are injected (`{name: async-fn}`) so this module stays
decoupled from the provider classes — the wiring lives in main.py
where we already know which providers are configured.

32 new tests (FakeClock for deterministic backoff timing, slow-probe
helpers for parallelism + timeout, lifecycle tests for start/stop
idempotency and tick-after-error survival). 64/64 alias+health tests
green.

Not yet wired into the request path — that's M3.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 20:29:57 +02:00

222 lines
7.7 KiB
Python

"""Tests for the background health-probe loop."""
from __future__ import annotations
import asyncio
from typing import Awaitable, Callable
import pytest
from src.health import ProviderHealthCache
from src.health_probe import HealthProbe, ProbeFn
def make_probe(
    *,
    returns: bool = True,
    raises: type[BaseException] | None = None,
) -> ProbeFn:
    """Build a probe stub whose outcome is fixed at construction time.

    Args:
        returns: Value the probe yields whenever it does not raise.
        raises: Exception class to instantiate with the message ``"boom"``
            and raise on every call. ``None`` means the probe returns
            normally.

    Returns:
        A zero-argument coroutine function that behaves identically on
        every await.
    """

    async def _fixed_outcome() -> bool:
        # Normal path first; raising takes precedence whenever it is set.
        if raises is None:
            return returns
        raise raises("boom")

    return _fixed_outcome
def make_slow_probe(delay: float, returns: bool = True) -> ProbeFn:
    """Build a probe stub that sleeps before answering.

    Args:
        delay: Seconds to ``asyncio.sleep`` before producing a result.
        returns: Value the probe yields once the sleep has finished.

    Returns:
        A zero-argument coroutine function that waits ``delay`` seconds and
        then returns ``returns``.
    """

    async def _delayed_answer() -> bool:
        await asyncio.sleep(delay)
        return returns

    return _delayed_answer
def make_call_counter() -> tuple[ProbeFn, Callable[[], int]]:
    """Build a probe that always succeeds and records how often it ran.

    Returns:
        A ``(probe, read_count)`` pair. ``probe`` is a coroutine function
        that returns ``True`` on every await; ``read_count()`` returns the
        number of times ``probe`` has been awaited so far.
    """
    awaited = 0

    async def _counting_probe() -> bool:
        nonlocal awaited
        awaited = awaited + 1
        return True

    def _read_count() -> int:
        return awaited

    return _counting_probe, _read_count
# ---------------------------------------------------------------------------
# Construction
# ---------------------------------------------------------------------------
class TestConstruction:
    """Constructor validation and basic introspection of ``HealthProbe``."""

    def test_invalid_interval(self) -> None:
        """An interval of zero is rejected with a ValueError naming it."""
        health = ProviderHealthCache()
        with pytest.raises(ValueError, match="interval"):
            HealthProbe(health, {}, interval=0.0)

    def test_invalid_timeout(self) -> None:
        """A timeout of zero is rejected with a ValueError naming it."""
        health = ProviderHealthCache()
        with pytest.raises(ValueError, match="timeout"):
            HealthProbe(health, {}, timeout=0.0)

    def test_provider_ids_exposes_keys(self) -> None:
        """``provider_ids`` reports the keys of the probe mapping."""
        probe_fns = {"ollama": make_probe(), "groq": make_probe()}
        prober = HealthProbe(ProviderHealthCache(), probe_fns)
        # Sorted so the check does not depend on the order ids are exposed in.
        assert sorted(prober.provider_ids) == ["groq", "ollama"]
# ---------------------------------------------------------------------------
# tick_once — the per-cycle behaviour
# ---------------------------------------------------------------------------
class TestTickOnce:
    """Behaviour of a single probe cycle (``HealthProbe.tick_once``)."""

    @pytest.mark.asyncio
    async def test_healthy_probe_marks_healthy(self) -> None:
        """A probe returning True brings an unhealthy provider back."""
        health = ProviderHealthCache(failure_threshold=1)
        # Start from an unhealthy state so that the recovery is observable.
        health.mark_unhealthy("ollama", "stale")
        prober = HealthProbe(health, {"ollama": make_probe(returns=True)})

        await prober.tick_once()

        assert health.is_healthy("ollama") is True

    @pytest.mark.asyncio
    async def test_returning_false_marks_unhealthy(self) -> None:
        """A probe returning False counts as a failure and is recorded."""
        health = ProviderHealthCache(failure_threshold=1)
        prober = HealthProbe(health, {"ollama": make_probe(returns=False)})

        await prober.tick_once()

        assert health.is_healthy("ollama") is False
        recorded = health.get_state("ollama")
        assert recorded is not None
        assert "false" in (recorded.last_error or "")

    @pytest.mark.asyncio
    async def test_raising_marks_unhealthy_with_exc_info(self) -> None:
        """A raising probe is a failure whose error names the exception type."""
        health = ProviderHealthCache(failure_threshold=1)
        probe_fns = {"ollama": make_probe(raises=ConnectionError)}
        prober = HealthProbe(health, probe_fns)

        await prober.tick_once()

        assert health.is_healthy("ollama") is False
        recorded = health.get_state("ollama")
        assert recorded is not None
        assert "ConnectionError" in (recorded.last_error or "")

    @pytest.mark.asyncio
    async def test_timeout_marks_unhealthy(self) -> None:
        """A probe slower than the timeout is a failure mentioning a timeout."""
        health = ProviderHealthCache(failure_threshold=1)
        # The probe sleeps for 1s, far longer than the 50ms budget.
        probe_fns = {"ollama": make_slow_probe(delay=1.0)}
        prober = HealthProbe(health, probe_fns, timeout=0.05)

        await prober.tick_once()

        assert health.is_healthy("ollama") is False
        recorded = health.get_state("ollama")
        assert recorded is not None
        assert "timeout" in (recorded.last_error or "").lower()

    @pytest.mark.asyncio
    async def test_one_bad_probe_does_not_sink_others(self) -> None:
        """A raising probe must not prevent its siblings from being evaluated.

        Bug shape being guarded against: an unhandled exception inside the
        gather of probes takes the whole tick down, so 'groq' would never be
        marked.
        """
        health = ProviderHealthCache(failure_threshold=1)
        probe_fns = {
            "ollama": make_probe(raises=RuntimeError),
            "groq": make_probe(returns=True),
        }
        prober = HealthProbe(health, probe_fns)

        await prober.tick_once()

        assert health.is_healthy("ollama") is False
        assert health.is_healthy("groq") is True

    @pytest.mark.asyncio
    async def test_concurrent_probes(self) -> None:
        """Probes in one tick overlap instead of running back to back.

        Five 100ms probes take about 500ms if run serially, so finishing in
        under 300ms shows they ran in parallel.
        """
        import time

        probe_fns = {f"p{i}": make_slow_probe(delay=0.1) for i in range(5)}
        prober = HealthProbe(ProviderHealthCache(), probe_fns, timeout=1.0)

        started = time.perf_counter()
        await prober.tick_once()
        elapsed = time.perf_counter() - started

        assert elapsed < 0.3, f"probes ran serially? elapsed={elapsed:.3f}s"

    @pytest.mark.asyncio
    async def test_empty_probes_is_noop(self) -> None:
        """With no probes registered a tick neither raises nor touches state."""
        health = ProviderHealthCache()
        prober = HealthProbe(health, {})

        await prober.tick_once()

        assert health.snapshot() == {}
# ---------------------------------------------------------------------------
# start / stop lifecycle
# ---------------------------------------------------------------------------
class TestLifecycle:
    """Start/stop behaviour of the background probe loop.

    Every test that starts the loop stops it in a ``finally`` block, so a
    failing assertion cannot leave a background task running past the test.
    """

    @pytest.mark.asyncio
    async def test_start_runs_initial_tick_immediately(self) -> None:
        """The first tick happens right after start(), not after one interval."""
        cache = ProviderHealthCache(failure_threshold=1)
        cache.mark_unhealthy("ollama", "stale")
        probe = HealthProbe(cache, {"ollama": make_probe(returns=True)}, interval=10.0)
        await probe.start()
        try:
            # Give the loop a moment to run the initial tick; with a 10s
            # interval, any recovery seen here can only come from that tick.
            await asyncio.sleep(0.01)
            assert cache.is_healthy("ollama") is True
        finally:
            await probe.stop()

    @pytest.mark.asyncio
    async def test_stop_cancels_cleanly(self) -> None:
        """``running`` is True after start() and False again after stop()."""
        cache = ProviderHealthCache()
        probe = HealthProbe(
            cache, {"ollama": make_probe()}, interval=10.0, timeout=1.0
        )
        await probe.start()
        try:
            assert probe.running is True
        finally:
            await probe.stop()
        assert probe.running is False

    @pytest.mark.asyncio
    async def test_start_is_idempotent(self) -> None:
        """Calling start() twice must not spawn a second loop task."""
        cache = ProviderHealthCache()
        fn, count = make_call_counter()
        probe = HealthProbe(cache, {"ollama": fn}, interval=10.0)
        await probe.start()
        try:
            await probe.start()  # must not spawn a second task
            assert probe.running is True
            # With a 10s interval only initial ticks can happen in this
            # window. Assumes a duplicate loop task would run its own initial
            # tick and so push the count to 2.
            await asyncio.sleep(0.01)
            assert count() == 1
        finally:
            await probe.stop()

    @pytest.mark.asyncio
    async def test_stop_without_start_is_safe(self) -> None:
        """stop() on a probe that was never started does not raise."""
        cache = ProviderHealthCache()
        probe = HealthProbe(cache, {})
        await probe.stop()  # idempotent / safe pre-start

    @pytest.mark.asyncio
    async def test_loop_keeps_running_after_tick_error(self) -> None:
        """A probe that raises on every tick must not stop the loop.

        The 'exploding' probe fails on each cycle; the 'counter' probe next
        to it records how many cycles actually ran.
        """
        cache = ProviderHealthCache(failure_threshold=1)
        fn, count = make_call_counter()
        probe = HealthProbe(
            cache,
            {"exploding": make_probe(raises=RuntimeError), "counter": fn},
            interval=0.05,  # short interval for test
            timeout=1.0,
        )
        await probe.start()
        try:
            # Poll until enough ticks have run rather than sleeping a fixed
            # time, so a slow machine cannot make the test flaky. The
            # deadline keeps a genuinely dead loop from hanging the test.
            loop = asyncio.get_running_loop()
            deadline = loop.time() + 2.0
            while count() < 3 and loop.time() < deadline:
                await asyncio.sleep(0.01)
        finally:
            await probe.stop()
        # Initial tick + at least 2 interval ticks, each with a raising probe.
        assert count() >= 3
        assert cache.is_healthy("exploding") is False