From 4b8fede7fc7acc2e589ca5a984b7c43173d0e2df Mon Sep 17 00:00:00 2001
From: Till JS
Date: Mon, 20 Apr 2026 15:15:37 +0200
Subject: [PATCH] fix(mana-llm): surface Gemini finish_reason errors instead of returning ""
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The google provider called response.text after a chat completion and
passed the resulting string downstream unchanged. When Gemini's content
filter, recitation guard, or max_tokens ceiling fired, response.text
quietly returned "" — which the planner then reported as "no JSON block
found", masking the real cause. Empirically this failed in 45 ms on a
simple Quiz mission.

Introduces providers/errors.py with a small ProviderError hierarchy
(Blocked / Truncated / Auth / RateLimit / Capability). google.py now
inspects response.candidates[0].finish_reason and raises the matching
structured error; the non-streaming path maps it to 422/502/429 via a
new except-branch in main.py, and the streaming path surfaces the kind
as the SSE error type. Capability is wired but not yet used — it lands
with the tool-schema passthrough in the next commit.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 services/mana-llm/src/main.py             |  10 ++
 services/mana-llm/src/providers/errors.py |  78 ++++++++++++++
 services/mana-llm/src/providers/google.py | 119 ++++++++++++++++++++--
 services/mana-llm/src/streaming/sse.py    |  12 ++-
 4 files changed, 207 insertions(+), 12 deletions(-)
 create mode 100644 services/mana-llm/src/providers/errors.py

diff --git a/services/mana-llm/src/main.py b/services/mana-llm/src/main.py
index b13519241..f04c4aaea 100644
--- a/services/mana-llm/src/main.py
+++ b/services/mana-llm/src/main.py
@@ -21,6 +21,7 @@ from src.models import (
     ModelsResponse,
 )
 from src.providers import ProviderRouter
+from src.providers.errors import ProviderError
 from src.streaming import stream_chat_completion
 from src.utils.cache import close_redis
 from src.utils.metrics import get_metrics, record_llm_error, record_llm_request
@@ -180,6 +181,15 @@ async def chat_completions(
         logger.error(f"Invalid request: {e}")
         record_llm_error(provider, model, "invalid_request")
         raise HTTPException(status_code=400, detail=str(e))
+    except ProviderError as e:
+        logger.warning(
+            f"Provider error on {provider}/{model}: kind={e.kind} detail={e}"
+        )
+        record_llm_error(provider, model, e.kind)
+        raise HTTPException(
+            status_code=e.http_status,
+            detail={"kind": e.kind, "message": str(e)},
+        )
     except Exception as e:
         logger.error(f"Chat completion failed: {e}")
         record_llm_error(provider, model, "server_error")
diff --git a/services/mana-llm/src/providers/errors.py b/services/mana-llm/src/providers/errors.py
new file mode 100644
index 000000000..f63efa17a
--- /dev/null
+++ b/services/mana-llm/src/providers/errors.py
@@ -0,0 +1,78 @@
+"""Structured provider errors.
+
+These map to distinct HTTP status codes and metric labels so callers can
+distinguish between "model refused to answer" (blocked), "hit the token
+budget" (truncated), "auth is broken", "we were rate-limited", and
+"model doesn't support what we asked" (capability). The old behaviour of
+returning an empty-string response on content-filter hits silently
+corrupts downstream pipelines (e.g. the planner sees "" and reports
+"no JSON block found" — misleading).
+"""
+
+
+class ProviderError(Exception):
+    """Base class for structured provider errors."""
+
+    # Short stable identifier used in metrics and API responses.
+    kind: str = "unknown"
+    # Suggested HTTP status when this error bubbles out of an endpoint.
+    http_status: int = 502
+
+
+class ProviderBlockedError(ProviderError):
+    """The provider refused to return content (safety, recitation, …).
+
+    The model produced nothing usable because a content filter or policy
+    guardrail fired. Retry with the same inputs will fail again — the
+    caller needs to adjust the prompt or give the user a clear message.
+    """
+
+    kind = "blocked"
+    http_status = 422
+
+    def __init__(self, reason: str, detail: str | None = None):
+        self.reason = reason  # e.g. "SAFETY", "RECITATION"
+        self.detail = detail
+        msg = f"Provider blocked the response ({reason})"
+        if detail:
+            msg += f": {detail}"
+        super().__init__(msg)
+
+
+class ProviderTruncatedError(ProviderError):
+    """The response was cut off before completion (hit max_tokens)."""
+
+    kind = "truncated"
+    http_status = 502
+
+    def __init__(self, partial_text: str | None = None):
+        self.partial_text = partial_text
+        super().__init__(
+            "Provider truncated the response (max_tokens reached) — "
+            "re-run with a higher max_tokens or smaller input"
+        )
+
+
+class ProviderAuthError(ProviderError):
+    """Authentication against the upstream provider failed."""
+
+    kind = "auth"
+    http_status = 502  # not 401 — the client's auth is fine, *ours* isn't
+
+
+class ProviderRateLimitError(ProviderError):
+    """The upstream provider rate-limited us."""
+
+    kind = "rate_limit"
+    http_status = 429
+
+
+class ProviderCapabilityError(ProviderError):
+    """The requested feature is not supported by the chosen model.
+
+    Typically raised when a request asks for native tool-calling against
+    a model that does not support it. We refuse to silently degrade.
+    """
+
+    kind = "capability"
+    http_status = 400
diff --git a/services/mana-llm/src/providers/google.py b/services/mana-llm/src/providers/google.py
index 43acfc695..1b204b2ee 100644
--- a/services/mana-llm/src/providers/google.py
+++ b/services/mana-llm/src/providers/google.py
@@ -24,9 +24,86 @@ from src.models import (
 )
 
 from .base import LLMProvider
+from .errors import (
+    ProviderAuthError,
+    ProviderBlockedError,
+    ProviderError,
+    ProviderRateLimitError,
+    ProviderTruncatedError,
+)
 
 logger = logging.getLogger(__name__)
 
+
+def _unwrap_gemini_response(response: Any, gemini_model: str) -> str:
+    """Validate a non-streaming Gemini response and return its text.
+
+    Raises a structured ProviderError if the response was blocked,
+    truncated, or otherwise produced no usable text. The SDK's
+    ``response.text`` accessor silently returns an empty string in all
+    of those cases, which downstream consumers (e.g. the planner
+    parser) cannot distinguish from a well-formed empty completion.
+    """
+    candidates = getattr(response, "candidates", None) or []
+    candidate = candidates[0] if candidates else None
+    finish_reason = getattr(candidate, "finish_reason", None)
+    # SDK sometimes exposes the enum name on `.name`, sometimes it's a string.
+    finish_name = getattr(finish_reason, "name", None) or (
+        str(finish_reason) if finish_reason is not None else None
+    )
+    # Strip the leading enum prefix if present (e.g. "FinishReason.SAFETY").
+    if finish_name and "." in finish_name:
+        finish_name = finish_name.rsplit(".", 1)[-1]
+
+    text = response.text or ""
+
+    if finish_name in {"SAFETY", "RECITATION", "PROHIBITED_CONTENT", "SPII", "BLOCKLIST"}:
+        # Collect the safety categories that actually blocked, if any.
+        ratings = getattr(candidate, "safety_ratings", None) or []
+        blocked = [
+            getattr(r, "category", None)
+            for r in ratings
+            if getattr(r, "blocked", False)
+        ]
+        detail = ", ".join(str(c) for c in blocked if c) or None
+        logger.warning(
+            "Gemini response blocked (model=%s, reason=%s, detail=%s)",
+            gemini_model,
+            finish_name,
+            detail,
+        )
+        raise ProviderBlockedError(reason=finish_name, detail=detail)
+
+    if finish_name == "MAX_TOKENS":
+        logger.warning(
+            "Gemini response truncated at max_tokens (model=%s)", gemini_model
+        )
+        raise ProviderTruncatedError(partial_text=text or None)
+
+    if not text and finish_name not in (None, "STOP"):
+        # Unknown finish reason, empty text — surface instead of silent "".
+        raise ProviderError(
+            f"Gemini returned no content (finish_reason={finish_name})"
+        )
+
+    return text
+
+
+def _wrap_gemini_call_error(err: Exception, gemini_model: str) -> ProviderError:
+    """Translate a raw Google SDK exception into a structured ProviderError.
+
+    The SDK uses google.genai.errors.* but we avoid importing them at
+    top level to keep the provider optional. String-match the class
+    name instead.
+    """
+    cls_name = type(err).__name__
+    msg = str(err) or cls_name
+    if "Auth" in cls_name or "PermissionDenied" in cls_name or "Unauthenticated" in cls_name:
+        return ProviderAuthError(f"Gemini auth failed for {gemini_model}: {msg}")
+    if "ResourceExhausted" in cls_name or "RateLimit" in cls_name or "429" in msg:
+        return ProviderRateLimitError(f"Gemini rate-limited for {gemini_model}: {msg}")
+    return ProviderError(f"Gemini call failed for {gemini_model}: {msg}")
+
+
 # Model mapping: Ollama model → Google Gemini equivalent
 OLLAMA_TO_GEMINI: dict[str, str] = {
     "gemma3:4b": "gemini-2.5-flash",
@@ -129,13 +206,18 @@ class GoogleProvider(LLMProvider):
 
         logger.debug(f"Google Gemini request: {gemini_model}, messages: {len(contents)}")
 
-        response = await self.client.aio.models.generate_content(
-            model=gemini_model,
-            contents=contents,
-            config=gen_config,
-        )
+        try:
+            response = await self.client.aio.models.generate_content(
+                model=gemini_model,
+                contents=contents,
+                config=gen_config,
+            )
+        except ProviderError:
+            raise
+        except Exception as err:
+            raise _wrap_gemini_call_error(err, gemini_model) from err
 
-        content = response.text or ""
+        content = _unwrap_gemini_response(response, gemini_model)
         usage_meta = response.usage_metadata
 
         return ChatCompletionResponse(
@@ -187,13 +269,22 @@ class GoogleProvider(LLMProvider):
             ],
         )
 
-        async for chunk in await self.client.aio.models.generate_content_stream(
-            model=gemini_model,
-            contents=contents,
-            config=gen_config,
-        ):
+        last_chunk: Any = None
+        emitted_any_text = False
+        try:
+            stream = await self.client.aio.models.generate_content_stream(
+                model=gemini_model,
+                contents=contents,
+                config=gen_config,
+            )
+        except Exception as err:
+            raise _wrap_gemini_call_error(err, gemini_model) from err
+
+        async for chunk in stream:
+            last_chunk = chunk
             text = chunk.text
             if text:
+                emitted_any_text = True
                 yield ChatCompletionStreamResponse(
                     model=f"google/{gemini_model}",
                     choices=[
@@ -204,6 +295,12 @@
                     ],
                 )
 
+        # Post-stream check: if the stream ended without emitting any text,
+        # surface the structured reason instead of quietly closing with an
+        # empty "stop". Matches _unwrap_gemini_response semantics.
+        if not emitted_any_text and last_chunk is not None:
+            _unwrap_gemini_response(last_chunk, gemini_model)
+
         # Final chunk
         yield ChatCompletionStreamResponse(
             model=f"google/{gemini_model}",
diff --git a/services/mana-llm/src/streaming/sse.py b/services/mana-llm/src/streaming/sse.py
index 2bdc1177b..f87e04c64 100644
--- a/services/mana-llm/src/streaming/sse.py
+++ b/services/mana-llm/src/streaming/sse.py
@@ -6,6 +6,7 @@ from collections.abc import AsyncIterator
 
 from src.models import ChatCompletionRequest, ChatCompletionStreamResponse
 from src.providers import ProviderRouter
+from src.providers.errors import ProviderError
 
 logger = logging.getLogger(__name__)
 
@@ -29,9 +30,18 @@ async def stream_chat_completion(
 
         # Send final [DONE] marker
         yield {"data": "[DONE]"}
+    except ProviderError as e:
+        logger.warning(f"Streaming provider error: kind={e.kind} detail={e}")
+        error_data = {
+            "error": {
+                "message": str(e),
+                "type": e.kind,
+            }
+        }
+        yield {"data": json.dumps(error_data)}
+        yield {"data": "[DONE]"}
     except Exception as e:
         logger.error(f"Streaming error: {e}")
-        # Send error as SSE event
         error_data = {
             "error": {
                 "message": str(e),
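
Note (illustrative, not part of the patch): with the {"kind", "message"} detail that
main.py now attaches to the HTTPException, a downstream caller can branch on the error
kind instead of guessing from an empty completion. The sketch below is only that, a
sketch; the httpx client, the http://mana-llm:8000 address, the /v1/chat/completions
route, and the OpenAI-style response shape are assumptions that do not appear in this
diff.

    import httpx

    BASE_URL = "http://mana-llm:8000"  # assumed service address
    ENDPOINT = f"{BASE_URL}/v1/chat/completions"  # assumed route for chat_completions

    def complete(payload: dict) -> str:
        """Call the service and branch on the structured error kinds from this patch."""
        resp = httpx.post(ENDPOINT, json=payload, timeout=60.0)
        if resp.status_code in (400, 422, 429, 502):
            detail = resp.json().get("detail")
            # detail is a dict for ProviderError branches, a plain string otherwise.
            kind = detail.get("kind", "unknown") if isinstance(detail, dict) else "unknown"
            if kind == "blocked":
                # Safety/recitation block: retrying the same prompt will fail again.
                raise RuntimeError(f"prompt blocked upstream: {detail.get('message')}")
            if kind in ("rate_limit", "truncated"):
                # Retry-with-adjustment cases: back off, raise max_tokens, or shrink input.
                raise TimeoutError(f"retryable provider error ({kind})")
            raise RuntimeError(f"provider error ({kind}): {detail}")
        resp.raise_for_status()
        # Assumes an OpenAI-compatible completion body.
        return resp.json()["choices"][0]["message"]["content"]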