From 4b8fede7fc7acc2e589ca5a984b7c43173d0e2df Mon Sep 17 00:00:00 2001
From: Till JS
Date: Mon, 20 Apr 2026 15:15:37 +0200
Subject: [PATCH] fix(mana-llm): surface Gemini finish_reason errors instead of returning ""
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The google provider called response.text after a chat completion and
passed the resulting string downstream unchanged. When Gemini's content
filter, recitation guard, or max_tokens ceiling fired, response.text
quietly returned "" — which the planner then reported as "no JSON block
found", masking the real cause. Empirically this failed in 45 ms on a
simple Quiz mission.

Introduces providers/errors.py with a small ProviderError hierarchy
(Blocked / Truncated / Auth / RateLimit / Capability). google.py now
inspects response.candidates[0].finish_reason and raises the matching
structured error; the non-streaming path maps it to 422/502/429 via a
new except-branch in main.py, and the streaming path surfaces the kind
as the SSE error type. Capability is wired but not yet used — it lands
with the tool-schema passthrough in the next commit.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 services/mana-llm/src/main.py             |  10 ++
 services/mana-llm/src/providers/errors.py |  78 ++++++++++++++
 services/mana-llm/src/providers/google.py | 119 ++++++++++++++++++++--
 services/mana-llm/src/streaming/sse.py    |  12 ++-
 4 files changed, 207 insertions(+), 12 deletions(-)
 create mode 100644 services/mana-llm/src/providers/errors.py

diff --git a/services/mana-llm/src/main.py b/services/mana-llm/src/main.py
index b13519241..f04c4aaea 100644
--- a/services/mana-llm/src/main.py
+++ b/services/mana-llm/src/main.py
@@ -21,6 +21,7 @@ from src.models import (
     ModelsResponse,
 )
 from src.providers import ProviderRouter
+from src.providers.errors import ProviderError
 from src.streaming import stream_chat_completion
 from src.utils.cache import close_redis
 from src.utils.metrics import get_metrics, record_llm_error, record_llm_request
@@ -180,6 +181,15 @@ async def chat_completions(
         logger.error(f"Invalid request: {e}")
         record_llm_error(provider, model, "invalid_request")
         raise HTTPException(status_code=400, detail=str(e))
+    except ProviderError as e:
+        logger.warning(
+            f"Provider error on {provider}/{model}: kind={e.kind} detail={e}"
+        )
+        record_llm_error(provider, model, e.kind)
+        raise HTTPException(
+            status_code=e.http_status,
+            detail={"kind": e.kind, "message": str(e)},
+        )
     except Exception as e:
         logger.error(f"Chat completion failed: {e}")
         record_llm_error(provider, model, "server_error")
diff --git a/services/mana-llm/src/providers/errors.py b/services/mana-llm/src/providers/errors.py
new file mode 100644
index 000000000..f63efa17a
--- /dev/null
+++ b/services/mana-llm/src/providers/errors.py
@@ -0,0 +1,78 @@
+"""Structured provider errors.
+
+These map to distinct HTTP status codes and metric labels so callers can
+distinguish between "model refused to answer" (blocked), "hit the token
+budget" (truncated), "auth is broken", "we were rate-limited", and
+"model doesn't support what we asked" (capability). The old behaviour of
+returning an empty-string response on content-filter hits silently
+corrupts downstream pipelines (e.g. the planner sees "" and reports
+"no JSON block found" — misleading).
+"""
+
+
+class ProviderError(Exception):
+    """Base class for structured provider errors."""
+
+    # Short stable identifier used in metrics and API responses.
+    kind: str = "unknown"
+    # Suggested HTTP status when this error bubbles out of an endpoint.
+    http_status: int = 502
+
+
+class ProviderBlockedError(ProviderError):
+    """The provider refused to return content (safety, recitation, …).
+
+    The model produced nothing usable because a content filter or policy
+    guardrail fired. Retry with the same inputs will fail again — the
+    caller needs to adjust the prompt or give the user a clear message.
+    """
+
+    kind = "blocked"
+    http_status = 422
+
+    def __init__(self, reason: str, detail: str | None = None):
+        self.reason = reason  # e.g. "SAFETY", "RECITATION"
+        self.detail = detail
+        msg = f"Provider blocked the response ({reason})"
+        if detail:
+            msg += f": {detail}"
+        super().__init__(msg)
+
+
+class ProviderTruncatedError(ProviderError):
+    """The response was cut off before completion (hit max_tokens)."""
+
+    kind = "truncated"
+    http_status = 502
+
+    def __init__(self, partial_text: str | None = None):
+        self.partial_text = partial_text
+        super().__init__(
+            "Provider truncated the response (max_tokens reached) — "
+            "re-run with a higher max_tokens or smaller input"
+        )
+
+
+class ProviderAuthError(ProviderError):
+    """Authentication against the upstream provider failed."""
+
+    kind = "auth"
+    http_status = 502  # not 401 — the client's auth is fine, *ours* isn't
+
+
+class ProviderRateLimitError(ProviderError):
+    """The upstream provider rate-limited us."""
+
+    kind = "rate_limit"
+    http_status = 429
+
+
+class ProviderCapabilityError(ProviderError):
+    """The requested feature is not supported by the chosen model.
+
+    Typically raised when a request asks for native tool-calling against
+    a model that does not support it. We refuse to silently degrade.
+    """
+
+    kind = "capability"
+    http_status = 400
diff --git a/services/mana-llm/src/providers/google.py b/services/mana-llm/src/providers/google.py
index 43acfc695..1b204b2ee 100644
--- a/services/mana-llm/src/providers/google.py
+++ b/services/mana-llm/src/providers/google.py
@@ -24,9 +24,86 @@ from src.models import (
 )
 
 from .base import LLMProvider
+from .errors import (
+    ProviderAuthError,
+    ProviderBlockedError,
+    ProviderError,
+    ProviderRateLimitError,
+    ProviderTruncatedError,
+)
 
 logger = logging.getLogger(__name__)
 
+
+def _unwrap_gemini_response(response: Any, gemini_model: str) -> str:
+    """Validate a non-streaming Gemini response and return its text.
+
+    Raises a structured ProviderError if the response was blocked,
+    truncated, or otherwise produced no usable text. The SDK's
+    ``response.text`` accessor silently returns an empty string in all
+    of those cases, which downstream consumers (e.g. the planner
+    parser) cannot distinguish from a well-formed empty completion.
+    """
+    candidates = getattr(response, "candidates", None) or []
+    candidate = candidates[0] if candidates else None
+    finish_reason = getattr(candidate, "finish_reason", None)
+    # SDK sometimes exposes the enum name on `.name`, sometimes it's a string.
+    finish_name = getattr(finish_reason, "name", None) or (
+        str(finish_reason) if finish_reason is not None else None
+    )
+    # Strip the leading enum prefix if present (e.g. "FinishReason.SAFETY").
+    if finish_name and "." in finish_name:
+        finish_name = finish_name.rsplit(".", 1)[-1]
+
+    text = response.text or ""
+
+    if finish_name in {"SAFETY", "RECITATION", "PROHIBITED_CONTENT", "SPII", "BLOCKLIST"}:
+        # Collect the safety categories that actually blocked, if any.
+        ratings = getattr(candidate, "safety_ratings", None) or []
+        blocked = [
+            getattr(r, "category", None)
+            for r in ratings
+            if getattr(r, "blocked", False)
+        ]
+        detail = ", ".join(str(c) for c in blocked if c) or None
+        logger.warning(
+            "Gemini response blocked (model=%s, reason=%s, detail=%s)",
+            gemini_model,
+            finish_name,
+            detail,
+        )
+        raise ProviderBlockedError(reason=finish_name, detail=detail)
+
+    if finish_name == "MAX_TOKENS":
+        logger.warning(
+            "Gemini response truncated at max_tokens (model=%s)", gemini_model
+        )
+        raise ProviderTruncatedError(partial_text=text or None)
+
+    if not text and finish_name not in (None, "STOP"):
+        # Unknown finish reason, empty text — surface instead of silent "".
+        raise ProviderError(
+            f"Gemini returned no content (finish_reason={finish_name})"
+        )
+
+    return text
+
+
+def _wrap_gemini_call_error(err: Exception, gemini_model: str) -> ProviderError:
+    """Translate a raw Google SDK exception into a structured ProviderError.
+
+    The SDK uses google.genai.errors.* but we avoid importing them at
+    top level to keep the provider optional. String-match the class
+    name instead.
+    """
+    cls_name = type(err).__name__
+    msg = str(err) or cls_name
+    if "Auth" in cls_name or "PermissionDenied" in cls_name or "Unauthenticated" in cls_name:
+        return ProviderAuthError(f"Gemini auth failed for {gemini_model}: {msg}")
+    if "ResourceExhausted" in cls_name or "RateLimit" in cls_name or "429" in msg:
+        return ProviderRateLimitError(f"Gemini rate-limited for {gemini_model}: {msg}")
+    return ProviderError(f"Gemini call failed for {gemini_model}: {msg}")
+
+
 # Model mapping: Ollama model → Google Gemini equivalent
 OLLAMA_TO_GEMINI: dict[str, str] = {
     "gemma3:4b": "gemini-2.5-flash",
@@ -129,13 +206,18 @@ class GoogleProvider(LLMProvider):
 
         logger.debug(f"Google Gemini request: {gemini_model}, messages: {len(contents)}")
 
-        response = await self.client.aio.models.generate_content(
-            model=gemini_model,
-            contents=contents,
-            config=gen_config,
-        )
+        try:
+            response = await self.client.aio.models.generate_content(
+                model=gemini_model,
+                contents=contents,
+                config=gen_config,
+            )
+        except ProviderError:
+            raise
+        except Exception as err:
+            raise _wrap_gemini_call_error(err, gemini_model) from err
 
-        content = response.text or ""
+        content = _unwrap_gemini_response(response, gemini_model)
         usage_meta = response.usage_metadata
 
         return ChatCompletionResponse(
@@ -187,13 +269,22 @@ class GoogleProvider(LLMProvider):
             ],
         )
 
-        async for chunk in await self.client.aio.models.generate_content_stream(
-            model=gemini_model,
-            contents=contents,
-            config=gen_config,
-        ):
+        last_chunk: Any = None
+        emitted_any_text = False
+        try:
+            stream = await self.client.aio.models.generate_content_stream(
+                model=gemini_model,
+                contents=contents,
+                config=gen_config,
+            )
+        except Exception as err:
+            raise _wrap_gemini_call_error(err, gemini_model) from err
+
+        async for chunk in stream:
+            last_chunk = chunk
             text = chunk.text
             if text:
+                emitted_any_text = True
                 yield ChatCompletionStreamResponse(
                     model=f"google/{gemini_model}",
                     choices=[
@@ -204,6 +295,12 @@
                     ],
                 )
 
+        # Post-stream check: if the stream ended without emitting any text,
+        # surface the structured reason instead of quietly closing with an
+        # empty "stop". Matches _unwrap_gemini_response semantics.
+        if not emitted_any_text and last_chunk is not None:
+            _unwrap_gemini_response(last_chunk, gemini_model)
+
         # Final chunk
         yield ChatCompletionStreamResponse(
             model=f"google/{gemini_model}",
diff --git a/services/mana-llm/src/streaming/sse.py b/services/mana-llm/src/streaming/sse.py
index 2bdc1177b..f87e04c64 100644
--- a/services/mana-llm/src/streaming/sse.py
+++ b/services/mana-llm/src/streaming/sse.py
@@ -6,6 +6,7 @@ from collections.abc import AsyncIterator
 
 from src.models import ChatCompletionRequest, ChatCompletionStreamResponse
 from src.providers import ProviderRouter
+from src.providers.errors import ProviderError
 
 logger = logging.getLogger(__name__)
 
@@ -29,9 +30,18 @@ async def stream_chat_completion(
 
         # Send final [DONE] marker
         yield {"data": "[DONE]"}
+    except ProviderError as e:
+        logger.warning(f"Streaming provider error: kind={e.kind} detail={e}")
+        error_data = {
+            "error": {
+                "message": str(e),
+                "type": e.kind,
+            }
+        }
+        yield {"data": json.dumps(error_data)}
+        yield {"data": "[DONE]"}
     except Exception as e:
         logger.error(f"Streaming error: {e}")
-        # Send error as SSE event
         error_data = {
             "error": {
                 "message": str(e),
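
Note (illustrative, not part of the patch): with the {"kind", "message"} detail that
main.py now attaches to the HTTPException, a downstream caller can branch on the error
kind instead of guessing from an empty completion. The sketch below is only that, a
sketch; the httpx client, the http://mana-llm:8000 address, the /v1/chat/completions
route, and the OpenAI-style response shape are assumptions that do not appear in this
diff.

    import httpx

    BASE_URL = "http://mana-llm:8000"  # assumed service address
    ENDPOINT = f"{BASE_URL}/v1/chat/completions"  # assumed route for chat_completions

    def complete(payload: dict) -> str:
        """Call the service and branch on the structured error kinds from this patch."""
        resp = httpx.post(ENDPOINT, json=payload, timeout=60.0)
        if resp.status_code in (400, 422, 429, 502):
            detail = resp.json().get("detail")
            # detail is a dict for ProviderError branches, a plain string otherwise.
            kind = detail.get("kind", "unknown") if isinstance(detail, dict) else "unknown"
            if kind == "blocked":
                # Safety/recitation block: retrying the same prompt will fail again.
                raise RuntimeError(f"prompt blocked upstream: {detail.get('message')}")
            if kind in ("rate_limit", "truncated"):
                # Retry-with-adjustment cases: back off, raise max_tokens, or shrink input.
                raise TimeoutError(f"retryable provider error ({kind})")
            raise RuntimeError(f"provider error ({kind}): {detail}")
        resp.raise_for_status()
        # Assumes an OpenAI-compatible completion body.
        return resp.json()["choices"][0]["message"]["content"]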