mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-23 07:06:41 +02:00
feat(mana-llm): M4 — observability, debug endpoints, SIGHUP reload
- `X-Mana-LLM-Resolved: <provider>/<model>` header on non-streaming
responses. Streaming clients read the same info from each chunk's
`model` field (SSE headers go out before the chain is walked).
- Three new Prometheus metrics: `mana_llm_alias_resolved_total{alias,
target}` (which concrete model an alias resolved to per request),
`mana_llm_fallback_total{from_model, to_model, reason}` (each
fallback transition), `mana_llm_provider_healthy{provider}` (gauge,
mirrors the circuit-breaker).
- New debug endpoints: `GET /v1/aliases` (registry inspection — chain
+ description per alias, useful for confirming SIGHUP reloads),
`GET /v1/health` (full per-provider liveness snapshot — failure
counter, last error, unhealthy-until backoff).
- `kill -HUP <pid>` reloads `aliases.yaml`. Parse errors leave the
previous good state in memory and log the rejection.
- `ProviderHealthCache.add_listener()` for cache→metrics decoupling:
the gauge is updated via a transition-only listener wired in main.py
rather than the cache importing prometheus_client itself.
- Request-side metrics now use the requested model string; success-side
metrics use the resolved one. So `mana_llm_llm_requests_total{provider="ollama",
model="gemma3:12b"}` reflects actual upstream load even when callers
used `mana/long-form` aliases.
16 new observability tests (test_m4_observability.py): listener
fire-on-transition semantics, exception-isolation, multi-listener,
counter increments, gauge writes, end-to-end alias→metric flow,
v1/aliases + v1/health endpoint shape, response.model carries the
resolved target after fallback. Total suite: 115/115 in 1.6s.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3046da3b19
commit
8a49e3ffd5
6 changed files with 749 additions and 29 deletions
|
|
@@ -4,7 +4,7 @@ import time
|
|||
from collections.abc import Callable
|
||||
|
||||
from fastapi import Request, Response
|
||||
from prometheus_client import Counter, Histogram, generate_latest
|
||||
from prometheus_client import Counter, Gauge, Histogram, generate_latest
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
|
||||
# Request metrics
|
||||
|
|
@@ -47,6 +47,35 @@ LLM_ERRORS = Counter(
|
|||
["provider", "model", "error_type"],
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Alias / fallback / health metrics — added in M4 of llm-fallback-aliases.md.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Counter keyed by (alias, target): incremented once per request to record
# which concrete chain entry an alias ended up resolving to.
ALIAS_RESOLVED = Counter(
    name="mana_llm_alias_resolved_total",
    documentation=(
        "How often an alias resolved to a concrete provider/model. The `target` "
        "label is the chain entry that actually served the request — useful for "
        "spotting cases where the primary always falls through to a cloud entry."
    ),
    labelnames=["alias", "target"],
)
|
||||
|
||||
# Counter keyed by (from_model, to_model, reason): one increment per fallback
# transition between chain entries.
FALLBACK_TRIGGERED = Counter(
    name="mana_llm_fallback_total",
    documentation=(
        "Fallback transitions: a chain entry failed (or was skipped via cache) "
        "and the router moved to the next entry. `reason` is the exception class "
        "name or `cache-unhealthy` / `unconfigured`. `from_model` is the entry "
        "that didn't serve, `to_model` is empty when no further entries existed."
    ),
    labelnames=["from_model", "to_model", "reason"],
)
|
||||
|
||||
# Per-provider health gauge: 1 = healthy, 0 = in backoff.
# NOTE(review): the help text says the gauge is refreshed on every probe tick,
# while the accompanying change description talks about a transition-only
# listener — confirm which wording matches the actual update cadence.
PROVIDER_HEALTHY = Gauge(
    name="mana_llm_provider_healthy",
    documentation=(
        "1 when the provider is currently considered healthy by the cache, "
        "0 when in backoff. Refreshed on every probe tick and on every router "
        "call-site state transition."
    ),
    labelnames=["provider"],
)
|
||||
|
||||
|
||||
def get_metrics() -> bytes:
|
||||
"""Generate Prometheus metrics output."""
|
||||
|
|
@@ -107,3 +136,23 @@ def record_llm_request(
|
|||
def record_llm_error(provider: str, model: str, error_type: str) -> None:
    """Increment the LLM error counter for one failed call.

    Args:
        provider: Value for the ``provider`` label.
        model: Value for the ``model`` label.
        error_type: Value for the ``error_type`` label.
    """
    error_series = LLM_ERRORS.labels(
        provider=provider,
        model=model,
        error_type=error_type,
    )
    error_series.inc()
|
||||
|
||||
|
||||
def record_alias_resolved(alias: str, target: str) -> None:
    """Count one resolution of ``alias`` to the concrete model ``target``.

    Args:
        alias: Value for the ``alias`` label.
        target: Value for the ``target`` label.
    """
    resolved_series = ALIAS_RESOLVED.labels(alias=alias, target=target)
    resolved_series.inc()
|
||||
|
||||
|
||||
def record_fallback(from_model: str, to_model: str, reason: str) -> None:
    """Count one fallback transition.

    Args:
        from_model: Value for the ``from_model`` label.
        to_model: Value for the ``to_model`` label; empty when the chain ran
            out (i.e. NoHealthyProviderError).
        reason: Value for the ``reason`` label.
    """
    label_values = {
        "from_model": from_model,
        "to_model": to_model,
        "reason": reason,
    }
    FALLBACK_TRIGGERED.labels(**label_values).inc()
|
||||
|
||||
|
||||
def set_provider_healthy(provider: str, healthy: bool) -> None:
    """Write ``ProviderHealthCache`` state for one provider into the gauge.

    Args:
        provider: Value for the ``provider`` label.
        healthy: Truthy sets the gauge to 1.0, falsy sets it to 0.0.
    """
    if healthy:
        gauge_value = 1.0
    else:
        gauge_value = 0.0
    provider_series = PROVIDER_HEALTHY.labels(provider=provider)
    provider_series.set(gauge_value)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue