fix(mana-llm): surface Gemini finish_reason errors instead of returning ""

The google provider called response.text after a chat completion and
passed the resulting string downstream unchanged. When Gemini's content
filter or recitation guard fired, or the response hit the max_tokens
ceiling, response.text quietly returned "" — which the planner then
reported as "no JSON block found", masking the real cause. Empirically
this failed in 45 ms on a simple Quiz mission.

Introduces providers/errors.py with a small ProviderError hierarchy
(Blocked / Truncated / Auth / RateLimit / Capability). google.py now
inspects response.candidates[0].finish_reason and raises the matching
structured error; the non-streaming path maps it to 422/502/429 via a
new except-branch in main.py, and the streaming path surfaces the kind
as the SSE error type. Capability is wired but not yet used — it lands
with the tool-schema passthrough in the next commit.
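
For illustration, a blocked stream now ends with an explicit error
event before the [DONE] marker instead of an empty "stop" (message
text is illustrative; framing assumes sse-starlette-style data events):

    data: {"error": {"message": "Provider blocked the response (SAFETY)", "type": "blocked"}}
    data: [DONE]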

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Till JS 2026-04-20 15:15:37 +02:00
parent c34175afab
commit 4b8fede7fc
4 changed files with 207 additions and 12 deletions

main.py

@@ -21,6 +21,7 @@ from src.models import (
    ModelsResponse,
)
from src.providers import ProviderRouter
from src.providers.errors import ProviderError
from src.streaming import stream_chat_completion
from src.utils.cache import close_redis
from src.utils.metrics import get_metrics, record_llm_error, record_llm_request
@@ -180,6 +181,15 @@ async def chat_completions(
        logger.error(f"Invalid request: {e}")
        record_llm_error(provider, model, "invalid_request")
        raise HTTPException(status_code=400, detail=str(e))
    except ProviderError as e:
        logger.warning(
            f"Provider error on {provider}/{model}: kind={e.kind} detail={e}"
        )
        record_llm_error(provider, model, e.kind)
        raise HTTPException(
            status_code=e.http_status,
            detail={"kind": e.kind, "message": str(e)},
        )
    except Exception as e:
        logger.error(f"Chat completion failed: {e}")
        record_llm_error(provider, model, "server_error")
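
For reference, a blocked completion on the non-streaming path now surfaces as a 422 whose body follows the detail dict above (message text is illustrative; the top-level "detail" key assumes FastAPI's default HTTPException serialization):

    HTTP/1.1 422 Unprocessable Entity

    {"detail": {"kind": "blocked", "message": "Provider blocked the response (SAFETY)"}}

Clients can switch on detail.kind without parsing the human-readable message.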

src/providers/errors.py

@@ -0,0 +1,78 @@
"""Structured provider errors.

These map to distinct HTTP status codes and metric labels so callers can
distinguish between "model refused to answer" (blocked), "hit the token
budget" (truncated), "auth is broken", "we were rate-limited", and
"model doesn't support what we asked" (capability). The old behaviour of
returning an empty-string response on content-filter hits silently
corrupts downstream pipelines (e.g. the planner sees "" and reports a
misleading "no JSON block found").
"""


class ProviderError(Exception):
    """Base class for structured provider errors."""

    # Short stable identifier used in metrics and API responses.
    kind: str = "unknown"
    # Suggested HTTP status when this error bubbles out of an endpoint.
    http_status: int = 502


class ProviderBlockedError(ProviderError):
    """The provider refused to return content (safety, recitation, …).

    The model produced nothing usable because a content filter or policy
    guardrail fired. Retrying with the same inputs will fail again; the
    caller needs to adjust the prompt or give the user a clear message.
    """

    kind = "blocked"
    http_status = 422

    def __init__(self, reason: str, detail: str | None = None):
        self.reason = reason  # e.g. "SAFETY", "RECITATION"
        self.detail = detail
        msg = f"Provider blocked the response ({reason})"
        if detail:
            msg += f": {detail}"
        super().__init__(msg)


class ProviderTruncatedError(ProviderError):
    """The response was cut off before completion (hit max_tokens)."""

    kind = "truncated"
    http_status = 502

    def __init__(self, partial_text: str | None = None):
        self.partial_text = partial_text
        super().__init__(
            "Provider truncated the response (max_tokens reached) — "
            "re-run with a higher max_tokens or smaller input"
        )


class ProviderAuthError(ProviderError):
    """Authentication against the upstream provider failed."""

    kind = "auth"
    http_status = 502  # not 401 — the client's auth is fine, *ours* isn't


class ProviderRateLimitError(ProviderError):
    """The upstream provider rate-limited us."""

    kind = "rate_limit"
    http_status = 429


class ProviderCapabilityError(ProviderError):
    """The requested feature is not supported by the chosen model.

    Typically raised when a request asks for native tool-calling against
    a model that does not support it. We refuse to silently degrade.
    """

    kind = "capability"
    http_status = 400
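
As a usage sketch (the wrapper below is hypothetical, not part of this commit), callers can branch on the stable kind label rather than matching every exception class:

    from src.providers.errors import ProviderBlockedError, ProviderError

    def run_step(call_model):  # hypothetical wrapper around any provider call
        try:
            return call_model()
        except ProviderBlockedError as e:
            # e.reason carries the raw finish_reason name, e.g. "SAFETY"
            return f"Model declined ({e.reason}); adjust the prompt and retry."
        except ProviderError as e:
            # e.kind is the same stable label used for metrics and API bodies
            return f"Upstream error ({e.kind}): {e}"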

src/providers/google.py

@@ -24,9 +24,86 @@ from src.models import (
)
from .base import LLMProvider
from .errors import (
    ProviderAuthError,
    ProviderBlockedError,
    ProviderError,
    ProviderRateLimitError,
    ProviderTruncatedError,
)
logger = logging.getLogger(__name__)
def _unwrap_gemini_response(response: Any, gemini_model: str) -> str:
    """Validate a non-streaming Gemini response and return its text.

    Raises a structured ProviderError if the response was blocked,
    truncated, or otherwise produced no usable text. The SDK's
    ``response.text`` accessor silently returns an empty string in all
    of those cases, which downstream consumers (e.g. the planner
    parser) cannot distinguish from a well-formed empty completion.
    """
    candidates = getattr(response, "candidates", None) or []
    candidate = candidates[0] if candidates else None
    finish_reason = getattr(candidate, "finish_reason", None)
    # SDK sometimes exposes the enum name on `.name`, sometimes it's a string.
    finish_name = getattr(finish_reason, "name", None) or (
        str(finish_reason) if finish_reason is not None else None
    )
    # Strip the leading enum prefix if present (e.g. "FinishReason.SAFETY").
    if finish_name and "." in finish_name:
        finish_name = finish_name.rsplit(".", 1)[-1]
    text = response.text or ""
    if finish_name in {"SAFETY", "RECITATION", "PROHIBITED_CONTENT", "SPII", "BLOCKLIST"}:
        # Collect the safety categories that actually blocked, if any.
        ratings = getattr(candidate, "safety_ratings", None) or []
        blocked = [
            getattr(r, "category", None)
            for r in ratings
            if getattr(r, "blocked", False)
        ]
        detail = ", ".join(str(c) for c in blocked if c) or None
        logger.warning(
            "Gemini response blocked (model=%s, reason=%s, detail=%s)",
            gemini_model,
            finish_name,
            detail,
        )
        raise ProviderBlockedError(reason=finish_name, detail=detail)
    if finish_name == "MAX_TOKENS":
        logger.warning(
            "Gemini response truncated at max_tokens (model=%s)", gemini_model
        )
        raise ProviderTruncatedError(partial_text=text or None)
    if not text and finish_name not in (None, "STOP"):
        # Unknown finish reason and empty text — surface it instead of a silent "".
        raise ProviderError(
            f"Gemini returned no content (finish_reason={finish_name})"
        )
    return text
def _wrap_gemini_call_error(err: Exception, gemini_model: str) -> ProviderError:
    """Translate a raw Google SDK exception into a structured ProviderError.

    The SDK uses google.genai.errors.*, but we avoid importing them at
    top level to keep the provider optional. String-match the class
    name instead.
    """
    cls_name = type(err).__name__
    msg = str(err) or cls_name
    if "Auth" in cls_name or "PermissionDenied" in cls_name or "Unauthenticated" in cls_name:
        return ProviderAuthError(f"Gemini auth failed for {gemini_model}: {msg}")
    if "ResourceExhausted" in cls_name or "RateLimit" in cls_name or "429" in msg:
        return ProviderRateLimitError(f"Gemini rate-limited for {gemini_model}: {msg}")
    return ProviderError(f"Gemini call failed for {gemini_model}: {msg}")
# Model mapping: Ollama model → Google Gemini equivalent
OLLAMA_TO_GEMINI: dict[str, str] = {
    "gemma3:4b": "gemini-2.5-flash",
@@ -129,13 +206,18 @@ class GoogleProvider(LLMProvider):
        logger.debug(f"Google Gemini request: {gemini_model}, messages: {len(contents)}")
-        response = await self.client.aio.models.generate_content(
-            model=gemini_model,
-            contents=contents,
-            config=gen_config,
-        )
+        try:
+            response = await self.client.aio.models.generate_content(
+                model=gemini_model,
+                contents=contents,
+                config=gen_config,
+            )
+        except ProviderError:
+            raise
+        except Exception as err:
+            raise _wrap_gemini_call_error(err, gemini_model) from err
-        content = response.text or ""
+        content = _unwrap_gemini_response(response, gemini_model)
        usage_meta = response.usage_metadata
        return ChatCompletionResponse(
@@ -187,13 +269,22 @@ class GoogleProvider(LLMProvider):
            ],
        )
-        async for chunk in await self.client.aio.models.generate_content_stream(
-            model=gemini_model,
-            contents=contents,
-            config=gen_config,
-        ):
+        last_chunk: Any = None
+        emitted_any_text = False
+        try:
+            stream = await self.client.aio.models.generate_content_stream(
+                model=gemini_model,
+                contents=contents,
+                config=gen_config,
+            )
+        except Exception as err:
+            raise _wrap_gemini_call_error(err, gemini_model) from err
+        async for chunk in stream:
+            last_chunk = chunk
            text = chunk.text
            if text:
+                emitted_any_text = True
                yield ChatCompletionStreamResponse(
                    model=f"google/{gemini_model}",
                    choices=[
@@ -204,6 +295,12 @@ class GoogleProvider(LLMProvider):
                    ],
                )
        # Post-stream check: if the stream ended without emitting any text,
        # surface the structured reason instead of quietly closing with an
        # empty "stop". Matches _unwrap_gemini_response semantics.
        if not emitted_any_text and last_chunk is not None:
            _unwrap_gemini_response(last_chunk, gemini_model)
        # Final chunk
        yield ChatCompletionStreamResponse(
            model=f"google/{gemini_model}",
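
The helpers above are easy to sanity-check without the real SDK by feeding them stand-in objects. A minimal pytest sketch (SimpleNamespace mimics the SDK response shape, the local ResourceExhausted class is a stand-in for google.genai's, and the import paths are assumptions based on this diff):

    from types import SimpleNamespace

    import pytest

    from src.providers.errors import ProviderBlockedError, ProviderRateLimitError
    from src.providers.google import _unwrap_gemini_response, _wrap_gemini_call_error


    def test_safety_block_raises_instead_of_returning_empty_string():
        candidate = SimpleNamespace(finish_reason="SAFETY", safety_ratings=[])
        response = SimpleNamespace(candidates=[candidate], text="")
        with pytest.raises(ProviderBlockedError):
            _unwrap_gemini_response(response, "gemini-2.5-flash")


    def test_resource_exhausted_maps_to_rate_limit():
        class ResourceExhausted(Exception):  # stand-in, not google.genai's class
            pass

        err = _wrap_gemini_call_error(ResourceExhausted("quota"), "gemini-2.5-flash")
        assert isinstance(err, ProviderRateLimitError)
        assert err.http_status == 429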

src/streaming.py

@@ -6,6 +6,7 @@ from collections.abc import AsyncIterator
from src.models import ChatCompletionRequest, ChatCompletionStreamResponse
from src.providers import ProviderRouter
from src.providers.errors import ProviderError
logger = logging.getLogger(__name__)
@@ -29,9 +30,18 @@ async def stream_chat_completion(
        # Send final [DONE] marker
        yield {"data": "[DONE]"}
    except ProviderError as e:
        logger.warning(f"Streaming provider error: kind={e.kind} detail={e}")
        error_data = {
            "error": {
                "message": str(e),
                "type": e.kind,
            }
        }
        yield {"data": json.dumps(error_data)}
        yield {"data": "[DONE]"}
    except Exception as e:
        logger.error(f"Streaming error: {e}")
        # Send error as SSE event
        error_data = {
            "error": {
                "message": str(e),