fix(mana-llm): surface Gemini finish_reason errors instead of returning ""

The google provider called response.text after a chat completion and
passed the resulting string downstream unchanged. When Gemini's content
filter or recitation guard fired, or the response hit the max_tokens
ceiling, response.text quietly returned "" — which the planner then
reported as "no JSON block found", masking the real cause. Empirically
this failed in 45 ms on a simple Quiz mission.

Introduces providers/errors.py with a small ProviderError hierarchy
(Blocked / Truncated / Auth / RateLimit / Capability). google.py now
inspects response.candidates[0].finish_reason and raises the matching
structured error; the non-streaming path maps it to 422/502/429 via a
new except-branch in main.py, and the streaming path surfaces the kind
as the SSE error type. Capability is wired but not yet used — it lands
with the tool-schema passthrough in the next commit.
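
For illustration, a blocked stream now ends with an explicit error
event before the [DONE] marker instead of an empty "stop" (message
text is illustrative; framing assumes sse-starlette-style data events):

    data: {"error": {"message": "Provider blocked the response (SAFETY)", "type": "blocked"}}
    data: [DONE]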

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Till JS 2026-04-20 15:15:37 +02:00
parent c34175afab
commit 4b8fede7fc
4 changed files with 207 additions and 12 deletions

main.py

@@ -21,6 +21,7 @@ from src.models import (
    ModelsResponse,
)
from src.providers import ProviderRouter
from src.providers.errors import ProviderError
from src.streaming import stream_chat_completion
from src.utils.cache import close_redis
from src.utils.metrics import get_metrics, record_llm_error, record_llm_request
@@ -180,6 +181,15 @@ async def chat_completions(
        logger.error(f"Invalid request: {e}")
        record_llm_error(provider, model, "invalid_request")
        raise HTTPException(status_code=400, detail=str(e))
    except ProviderError as e:
        logger.warning(
            f"Provider error on {provider}/{model}: kind={e.kind} detail={e}"
        )
        record_llm_error(provider, model, e.kind)
        raise HTTPException(
            status_code=e.http_status,
            detail={"kind": e.kind, "message": str(e)},
        )
    except Exception as e:
        logger.error(f"Chat completion failed: {e}")
        record_llm_error(provider, model, "server_error")
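
For reference, a blocked completion on the non-streaming path now surfaces as a 422 whose body follows the detail dict above (message text is illustrative; the top-level "detail" key assumes FastAPI's default HTTPException serialization):

    HTTP/1.1 422 Unprocessable Entity

    {"detail": {"kind": "blocked", "message": "Provider blocked the response (SAFETY)"}}

Clients can switch on detail.kind without parsing the human-readable message.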

src/providers/errors.py

@@ -0,0 +1,78 @@
"""Structured provider errors.

These map to distinct HTTP status codes and metric labels so callers can
distinguish between "model refused to answer" (blocked), "hit the token
budget" (truncated), "auth is broken", "we were rate-limited", and
"model doesn't support what we asked" (capability). The old behaviour of
returning an empty-string response on content-filter hits silently
corrupts downstream pipelines (e.g. the planner sees "" and reports a
misleading "no JSON block found").
"""


class ProviderError(Exception):
    """Base class for structured provider errors."""

    # Short stable identifier used in metrics and API responses.
    kind: str = "unknown"
    # Suggested HTTP status when this error bubbles out of an endpoint.
    http_status: int = 502


class ProviderBlockedError(ProviderError):
    """The provider refused to return content (safety, recitation, …).

    The model produced nothing usable because a content filter or policy
    guardrail fired. Retrying with the same inputs will fail again; the
    caller needs to adjust the prompt or give the user a clear message.
    """

    kind = "blocked"
    http_status = 422

    def __init__(self, reason: str, detail: str | None = None):
        self.reason = reason  # e.g. "SAFETY", "RECITATION"
        self.detail = detail
        msg = f"Provider blocked the response ({reason})"
        if detail:
            msg += f": {detail}"
        super().__init__(msg)


class ProviderTruncatedError(ProviderError):
    """The response was cut off before completion (hit max_tokens)."""

    kind = "truncated"
    http_status = 502

    def __init__(self, partial_text: str | None = None):
        self.partial_text = partial_text
        super().__init__(
            "Provider truncated the response (max_tokens reached) — "
            "re-run with a higher max_tokens or smaller input"
        )


class ProviderAuthError(ProviderError):
    """Authentication against the upstream provider failed."""

    kind = "auth"
    http_status = 502  # not 401 — the client's auth is fine, *ours* isn't


class ProviderRateLimitError(ProviderError):
    """The upstream provider rate-limited us."""

    kind = "rate_limit"
    http_status = 429


class ProviderCapabilityError(ProviderError):
    """The requested feature is not supported by the chosen model.

    Typically raised when a request asks for native tool-calling against
    a model that does not support it. We refuse to silently degrade.
    """

    kind = "capability"
    http_status = 400
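
As a usage sketch (the wrapper below is hypothetical, not part of this commit), callers can branch on the stable kind label rather than matching every exception class:

    from src.providers.errors import ProviderBlockedError, ProviderError

    def run_step(call_model):  # hypothetical wrapper around any provider call
        try:
            return call_model()
        except ProviderBlockedError as e:
            # e.reason carries the raw finish_reason name, e.g. "SAFETY"
            return f"Model declined ({e.reason}); adjust the prompt and retry."
        except ProviderError as e:
            # e.kind is the same stable label used for metrics and API bodies
            return f"Upstream error ({e.kind}): {e}"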

src/providers/google.py

@@ -24,9 +24,86 @@ from src.models import (
)
from .base import LLMProvider
from .errors import (
    ProviderAuthError,
    ProviderBlockedError,
    ProviderError,
    ProviderRateLimitError,
    ProviderTruncatedError,
)
logger = logging.getLogger(__name__)
def _unwrap_gemini_response(response: Any, gemini_model: str) -> str:
    """Validate a non-streaming Gemini response and return its text.

    Raises a structured ProviderError if the response was blocked,
    truncated, or otherwise produced no usable text. The SDK's
    ``response.text`` accessor silently returns an empty string in all
    of those cases, which downstream consumers (e.g. the planner
    parser) cannot distinguish from a well-formed empty completion.
    """
    candidates = getattr(response, "candidates", None) or []
    candidate = candidates[0] if candidates else None
    finish_reason = getattr(candidate, "finish_reason", None)
    # SDK sometimes exposes the enum name on `.name`, sometimes it's a string.
    finish_name = getattr(finish_reason, "name", None) or (
        str(finish_reason) if finish_reason is not None else None
    )
    # Strip the leading enum prefix if present (e.g. "FinishReason.SAFETY").
    if finish_name and "." in finish_name:
        finish_name = finish_name.rsplit(".", 1)[-1]
    text = response.text or ""
    if finish_name in {"SAFETY", "RECITATION", "PROHIBITED_CONTENT", "SPII", "BLOCKLIST"}:
        # Collect the safety categories that actually blocked, if any.
        ratings = getattr(candidate, "safety_ratings", None) or []
        blocked = [
            getattr(r, "category", None)
            for r in ratings
            if getattr(r, "blocked", False)
        ]
        detail = ", ".join(str(c) for c in blocked if c) or None
        logger.warning(
            "Gemini response blocked (model=%s, reason=%s, detail=%s)",
            gemini_model,
            finish_name,
            detail,
        )
        raise ProviderBlockedError(reason=finish_name, detail=detail)
    if finish_name == "MAX_TOKENS":
        logger.warning(
            "Gemini response truncated at max_tokens (model=%s)", gemini_model
        )
        raise ProviderTruncatedError(partial_text=text or None)
    if not text and finish_name not in (None, "STOP"):
        # Unknown finish reason and empty text — surface it instead of a silent "".
        raise ProviderError(
            f"Gemini returned no content (finish_reason={finish_name})"
        )
    return text
def _wrap_gemini_call_error(err: Exception, gemini_model: str) -> ProviderError:
    """Translate a raw Google SDK exception into a structured ProviderError.

    The SDK uses google.genai.errors.*, but we avoid importing them at
    top level to keep the provider optional. String-match the class
    name instead.
    """
    cls_name = type(err).__name__
    msg = str(err) or cls_name
    if "Auth" in cls_name or "PermissionDenied" in cls_name or "Unauthenticated" in cls_name:
        return ProviderAuthError(f"Gemini auth failed for {gemini_model}: {msg}")
    if "ResourceExhausted" in cls_name or "RateLimit" in cls_name or "429" in msg:
        return ProviderRateLimitError(f"Gemini rate-limited for {gemini_model}: {msg}")
    return ProviderError(f"Gemini call failed for {gemini_model}: {msg}")
# Model mapping: Ollama model → Google Gemini equivalent
OLLAMA_TO_GEMINI: dict[str, str] = {
    "gemma3:4b": "gemini-2.5-flash",
@@ -129,13 +206,18 @@ class GoogleProvider(LLMProvider):
        logger.debug(f"Google Gemini request: {gemini_model}, messages: {len(contents)}")
-        response = await self.client.aio.models.generate_content(
-            model=gemini_model,
-            contents=contents,
-            config=gen_config,
-        )
+        try:
+            response = await self.client.aio.models.generate_content(
+                model=gemini_model,
+                contents=contents,
+                config=gen_config,
+            )
+        except ProviderError:
+            raise
+        except Exception as err:
+            raise _wrap_gemini_call_error(err, gemini_model) from err
-        content = response.text or ""
+        content = _unwrap_gemini_response(response, gemini_model)
        usage_meta = response.usage_metadata
        return ChatCompletionResponse(
@@ -187,13 +269,22 @@ class GoogleProvider(LLMProvider):
            ],
        )
-        async for chunk in await self.client.aio.models.generate_content_stream(
-            model=gemini_model,
-            contents=contents,
-            config=gen_config,
-        ):
+        last_chunk: Any = None
+        emitted_any_text = False
+        try:
+            stream = await self.client.aio.models.generate_content_stream(
+                model=gemini_model,
+                contents=contents,
+                config=gen_config,
+            )
+        except Exception as err:
+            raise _wrap_gemini_call_error(err, gemini_model) from err
+        async for chunk in stream:
+            last_chunk = chunk
            text = chunk.text
            if text:
+                emitted_any_text = True
                yield ChatCompletionStreamResponse(
                    model=f"google/{gemini_model}",
                    choices=[
@@ -204,6 +295,12 @@ class GoogleProvider(LLMProvider):
                    ],
                )
        # Post-stream check: if the stream ended without emitting any text,
        # surface the structured reason instead of quietly closing with an
        # empty "stop". Matches _unwrap_gemini_response semantics.
        if not emitted_any_text and last_chunk is not None:
            _unwrap_gemini_response(last_chunk, gemini_model)
        # Final chunk
        yield ChatCompletionStreamResponse(
            model=f"google/{gemini_model}",
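
The helpers above are easy to sanity-check without the real SDK by feeding them stand-in objects. A minimal pytest sketch (SimpleNamespace mimics the SDK response shape, the local ResourceExhausted class is a stand-in for google.genai's, and the import paths are assumptions based on this diff):

    from types import SimpleNamespace

    import pytest

    from src.providers.errors import ProviderBlockedError, ProviderRateLimitError
    from src.providers.google import _unwrap_gemini_response, _wrap_gemini_call_error


    def test_safety_block_raises_instead_of_returning_empty_string():
        candidate = SimpleNamespace(finish_reason="SAFETY", safety_ratings=[])
        response = SimpleNamespace(candidates=[candidate], text="")
        with pytest.raises(ProviderBlockedError):
            _unwrap_gemini_response(response, "gemini-2.5-flash")


    def test_resource_exhausted_maps_to_rate_limit():
        class ResourceExhausted(Exception):  # stand-in, not google.genai's class
            pass

        err = _wrap_gemini_call_error(ResourceExhausted("quota"), "gemini-2.5-flash")
        assert isinstance(err, ProviderRateLimitError)
        assert err.http_status == 429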

src/streaming.py

@@ -6,6 +6,7 @@ from collections.abc import AsyncIterator
from src.models import ChatCompletionRequest, ChatCompletionStreamResponse
from src.providers import ProviderRouter
from src.providers.errors import ProviderError
logger = logging.getLogger(__name__)
@@ -29,9 +30,18 @@ async def stream_chat_completion(
        # Send final [DONE] marker
        yield {"data": "[DONE]"}
    except ProviderError as e:
        logger.warning(f"Streaming provider error: kind={e.kind} detail={e}")
        error_data = {
            "error": {
                "message": str(e),
                "type": e.kind,
            }
        }
        yield {"data": json.dumps(error_data)}
        yield {"data": "[DONE]"}
    except Exception as e:
        logger.error(f"Streaming error: {e}")
        # Send error as SSE event
        error_data = {
            "error": {
                "message": str(e),