mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 21:01:08 +02:00
fix(mana-llm): surface Gemini finish_reason errors instead of returning ""
The google provider called response.text after a chat completion and passed the
resulting string downstream unchanged. When Gemini's content filter, recitation
guard, or max_tokens ceiling fired, response.text quietly returned "", which the
planner then reported as "no JSON block found", masking the real cause.
Empirically this failed in 45 ms on a simple Quiz mission.

Introduces providers/errors.py with a small ProviderError hierarchy (Blocked /
Truncated / Auth / RateLimit / Capability). google.py now inspects
response.candidates[0].finish_reason and raises the matching structured error;
the non-streaming path maps it to 422/502/429 via a new except-branch in
main.py, and the streaming path surfaces the kind as the SSE error type.
Capability is wired but not yet used; it lands with the tool-schema passthrough
in the next commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent c34175afab
commit 4b8fede7fc
4 changed files with 207 additions and 12 deletions
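A caller-side sketch of what this change buys. The endpoint path, port, and
httpx usage below are illustrative assumptions, not part of this diff; the
error body shape follows the new except-branch in main.py.

# Hypothetical client; FastAPI wraps the dict passed to
# HTTPException(detail=...) under a top-level "detail" key.
import httpx

resp = httpx.post(
    "http://localhost:8000/v1/chat/completions",  # illustrative URL
    json={"model": "gemma3:4b", "messages": [{"role": "user", "content": "hi"}]},
)
if resp.status_code in (422, 429, 502):
    err = resp.json()["detail"]
    # Previously this case surfaced as "" and a bogus "no JSON block found";
    # now the real cause is explicit: blocked / truncated / auth / rate_limit.
    print(f"provider error: {err['kind']}: {err['message']}")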
services/mana-llm/src/main.py
@@ -21,6 +21,7 @@ from src.models import (
     ModelsResponse,
 )
 from src.providers import ProviderRouter
+from src.providers.errors import ProviderError
 from src.streaming import stream_chat_completion
 from src.utils.cache import close_redis
 from src.utils.metrics import get_metrics, record_llm_error, record_llm_request
@@ -180,6 +181,15 @@ async def chat_completions(
         logger.error(f"Invalid request: {e}")
         record_llm_error(provider, model, "invalid_request")
         raise HTTPException(status_code=400, detail=str(e))
+    except ProviderError as e:
+        logger.warning(
+            f"Provider error on {provider}/{model}: kind={e.kind} detail={e}"
+        )
+        record_llm_error(provider, model, e.kind)
+        raise HTTPException(
+            status_code=e.http_status,
+            detail={"kind": e.kind, "message": str(e)},
+        )
     except Exception as e:
         logger.error(f"Chat completion failed: {e}")
         record_llm_error(provider, model, "server_error")
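For reference, a blocked completion now reaches HTTP clients as a 422 whose
body follows FastAPI's serialization of a dict detail (values are examples,
not from this diff):

# HTTP 422
{
    "detail": {
        "kind": "blocked",
        "message": "Provider blocked the response (SAFETY)",
    }
}
# truncated and auth map to 502, rate_limit to 429, via e.http_status.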
services/mana-llm/src/providers/errors.py (new file, 78 lines)
@@ -0,0 +1,78 @@
+"""Structured provider errors.
+
+These map to distinct HTTP status codes and metric labels so callers can
+distinguish between "model refused to answer" (blocked), "hit the token
+budget" (truncated), "auth is broken", "we were rate-limited", and
+"model doesn't support what we asked" (capability). The old behaviour of
+returning an empty-string response on content-filter hits silently
+corrupts downstream pipelines (e.g. the planner sees "" and reports
+"no JSON block found" — misleading).
+"""
+
+
+class ProviderError(Exception):
+    """Base class for structured provider errors."""
+
+    # Short stable identifier used in metrics and API responses.
+    kind: str = "unknown"
+    # Suggested HTTP status when this error bubbles out of an endpoint.
+    http_status: int = 502
+
+
+class ProviderBlockedError(ProviderError):
+    """The provider refused to return content (safety, recitation, …).
+
+    The model produced nothing usable because a content filter or policy
+    guardrail fired. Retry with the same inputs will fail again — the
+    caller needs to adjust the prompt or give the user a clear message.
+    """
+
+    kind = "blocked"
+    http_status = 422
+
+    def __init__(self, reason: str, detail: str | None = None):
+        self.reason = reason  # e.g. "SAFETY", "RECITATION"
+        self.detail = detail
+        msg = f"Provider blocked the response ({reason})"
+        if detail:
+            msg += f": {detail}"
+        super().__init__(msg)
+
+
+class ProviderTruncatedError(ProviderError):
+    """The response was cut off before completion (hit max_tokens)."""
+
+    kind = "truncated"
+    http_status = 502
+
+    def __init__(self, partial_text: str | None = None):
+        self.partial_text = partial_text
+        super().__init__(
+            "Provider truncated the response (max_tokens reached) — "
+            "re-run with a higher max_tokens or smaller input"
+        )
+
+
+class ProviderAuthError(ProviderError):
+    """Authentication against the upstream provider failed."""

+    kind = "auth"
+    http_status = 502  # not 401 — the client's auth is fine, *ours* isn't
+
+
+class ProviderRateLimitError(ProviderError):
+    """The upstream provider rate-limited us."""
+
+    kind = "rate_limit"
+    http_status = 429
+
+
+class ProviderCapabilityError(ProviderError):
+    """The requested feature is not supported by the chosen model.
+
+    Typically raised when a request asks for native tool-calling against
+    a model that does not support it. We refuse to silently degrade.
+    """
+
+    kind = "capability"
+    http_status = 400
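A small usage sketch of the hierarchy above (not part of this commit):
subclasses only override class attributes, so generic handlers can branch on
kind and http_status without isinstance chains.

from src.providers.errors import ProviderBlockedError, ProviderError

try:
    raise ProviderBlockedError(reason="SAFETY", detail="HARM_CATEGORY_HARASSMENT")
except ProviderError as e:
    # e.kind / e.http_status come from the subclass; str(e) carries the detail.
    assert (e.kind, e.http_status) == ("blocked", 422)
    print(f"{e.kind} ({e.http_status}): {e}")
    # -> blocked (422): Provider blocked the response (SAFETY): HARM_CATEGORY_HARASSMENT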
services/mana-llm/src/providers/google.py
@@ -24,9 +24,86 @@ from src.models import (
 )
 
 from .base import LLMProvider
+from .errors import (
+    ProviderAuthError,
+    ProviderBlockedError,
+    ProviderError,
+    ProviderRateLimitError,
+    ProviderTruncatedError,
+)
 
 logger = logging.getLogger(__name__)
 
+
+def _unwrap_gemini_response(response: Any, gemini_model: str) -> str:
+    """Validate a non-streaming Gemini response and return its text.
+
+    Raises a structured ProviderError if the response was blocked,
+    truncated, or otherwise produced no usable text. The SDK's
+    ``response.text`` accessor silently returns an empty string in all
+    of those cases, which downstream consumers (e.g. the planner
+    parser) cannot distinguish from a well-formed empty completion.
+    """
+    candidates = getattr(response, "candidates", None) or []
+    candidate = candidates[0] if candidates else None
+    finish_reason = getattr(candidate, "finish_reason", None)
+    # SDK sometimes exposes the enum name on `.name`, sometimes it's a string.
+    finish_name = getattr(finish_reason, "name", None) or (
+        str(finish_reason) if finish_reason is not None else None
+    )
+    # Strip the leading enum prefix if present (e.g. "FinishReason.SAFETY").
+    if finish_name and "." in finish_name:
+        finish_name = finish_name.rsplit(".", 1)[-1]
+
+    text = response.text or ""
+
+    if finish_name in {"SAFETY", "RECITATION", "PROHIBITED_CONTENT", "SPII", "BLOCKLIST"}:
+        # Pull the first safety rating that actually blocked if present.
+        ratings = getattr(candidate, "safety_ratings", None) or []
+        blocked = [
+            getattr(r, "category", None)
+            for r in ratings
+            if getattr(r, "blocked", False)
+        ]
+        detail = ", ".join(str(c) for c in blocked if c) or None
+        logger.warning(
+            "Gemini response blocked (model=%s, reason=%s, detail=%s)",
+            gemini_model,
+            finish_name,
+            detail,
+        )
+        raise ProviderBlockedError(reason=finish_name, detail=detail)
+
+    if finish_name == "MAX_TOKENS":
+        logger.warning(
+            "Gemini response truncated at max_tokens (model=%s)", gemini_model
+        )
+        raise ProviderTruncatedError(partial_text=text or None)
+
+    if not text and finish_name not in (None, "STOP"):
+        # Unknown finish reason, empty text — surface instead of silent "".
+        raise ProviderError(
+            f"Gemini returned no content (finish_reason={finish_name})"
+        )
+
+    return text
+
+
+def _wrap_gemini_call_error(err: Exception, gemini_model: str) -> ProviderError:
+    """Translate a raw Google SDK exception into a structured ProviderError.
+
+    The SDK uses google.genai.errors.* but we avoid importing them at
+    top level to keep the provider optional. String-match the class
+    name instead.
+    """
+    cls_name = type(err).__name__
+    msg = str(err) or cls_name
+    if "Auth" in cls_name or "PermissionDenied" in cls_name or "Unauthenticated" in cls_name:
+        return ProviderAuthError(f"Gemini auth failed for {gemini_model}: {msg}")
+    if "ResourceExhausted" in cls_name or "RateLimit" in cls_name or "429" in msg:
+        return ProviderRateLimitError(f"Gemini rate-limited for {gemini_model}: {msg}")
+    return ProviderError(f"Gemini call failed for {gemini_model}: {msg}")
+
 # Model mapping: Ollama model → Google Gemini equivalent
 OLLAMA_TO_GEMINI: dict[str, str] = {
     "gemma3:4b": "gemini-2.5-flash",
@@ -129,13 +206,18 @@ class GoogleProvider(LLMProvider):
 
         logger.debug(f"Google Gemini request: {gemini_model}, messages: {len(contents)}")
 
-        response = await self.client.aio.models.generate_content(
-            model=gemini_model,
-            contents=contents,
-            config=gen_config,
-        )
+        try:
+            response = await self.client.aio.models.generate_content(
+                model=gemini_model,
+                contents=contents,
+                config=gen_config,
+            )
+        except ProviderError:
+            raise
+        except Exception as err:
+            raise _wrap_gemini_call_error(err, gemini_model) from err
 
-        content = response.text or ""
+        content = _unwrap_gemini_response(response, gemini_model)
         usage_meta = response.usage_metadata
 
         return ChatCompletionResponse(
@@ -187,13 +269,22 @@ class GoogleProvider(LLMProvider):
             ],
         )
 
-        async for chunk in await self.client.aio.models.generate_content_stream(
-            model=gemini_model,
-            contents=contents,
-            config=gen_config,
-        ):
+        last_chunk: Any = None
+        emitted_any_text = False
+        try:
+            stream = await self.client.aio.models.generate_content_stream(
+                model=gemini_model,
+                contents=contents,
+                config=gen_config,
+            )
+        except Exception as err:
+            raise _wrap_gemini_call_error(err, gemini_model) from err
+
+        async for chunk in stream:
+            last_chunk = chunk
             text = chunk.text
             if text:
+                emitted_any_text = True
                 yield ChatCompletionStreamResponse(
                     model=f"google/{gemini_model}",
                     choices=[
@@ -204,6 +295,12 @@ class GoogleProvider(LLMProvider):
                 ],
             )
 
+        # Post-stream check: if the stream ended without emitting any text,
+        # surface the structured reason instead of quietly closing with an
+        # empty "stop". Matches _unwrap_gemini_response semantics.
+        if not emitted_any_text and last_chunk is not None:
+            _unwrap_gemini_response(last_chunk, gemini_model)
+
         # Final chunk
         yield ChatCompletionStreamResponse(
             model=f"google/{gemini_model}",
services/mana-llm/src/streaming.py
@@ -6,6 +6,7 @@ from collections.abc import AsyncIterator
 
 from src.models import ChatCompletionRequest, ChatCompletionStreamResponse
 from src.providers import ProviderRouter
+from src.providers.errors import ProviderError
 
 logger = logging.getLogger(__name__)
 
@@ -29,9 +30,18 @@ async def stream_chat_completion(
         # Send final [DONE] marker
         yield {"data": "[DONE]"}
-
+    except ProviderError as e:
+        logger.warning(f"Streaming provider error: kind={e.kind} detail={e}")
+        error_data = {
+            "error": {
+                "message": str(e),
+                "type": e.kind,
+            }
+        }
+        yield {"data": json.dumps(error_data)}
+        yield {"data": "[DONE]"}
     except Exception as e:
         logger.error(f"Streaming error: {e}")
         # Send error as SSE event
         error_data = {
             "error": {
                 "message": str(e),