Mirror of https://github.com/Memo-2023/mana-monorepo.git, synced 2026-05-17 12:29:40 +02:00
Reasoning-style models (Gemma 4 E4B is the first one we use, but DeepSeek R1, Gemini 2.5 thinking, etc. behave the same way) split their output into two fields:

- message.content — the final answer
- message.reasoning — the chain-of-thought leading up to it

When the model is given too few max_tokens to finish reasoning AND emit content, the response comes back with content="" and reasoning populated with the half-finished thought. Verified empirically with gemma4:e4b and `max_tokens: 10` on a "Sage Hi auf Deutsch in einem Wort" prompt — content was "" while reasoning had "Here's a thinking process to..." (cut off mid-thought).

For the title task this rarely matters because the system prompt is directive enough to skip the thinking phase (verified: the same gemma4:e4b returns clean 7-token titles like "Sonnenstrahlen genießen heute" with the standard system prompt + max_tokens 32). But it's a real failure mode for any future task that uses a less-directive prompt or hits a longer reasoning chain.

Defensive fix: prefer message.content first, fall back to message.reasoning if content is empty. The fallback is a string-or-nothing operation, no semantic interpretation — if the reasoning field happens to contain a usable answer fragment, the caller's cleanup chain (e.g. generateTitleTask's strip-quotes-and-dots pipeline) will normalize it. If it's truly a half-finished thought, the caller's runRules fallback still kicks in via the existing empty-result detection.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
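For illustration, a minimal sketch of the failure shape described above and the empty-aware pick. The sample values are invented; only the `message.content` / `message.reasoning` field names come from the actual payload.

```ts
// Sketch only — sample values are invented; the field names mirror the
// OpenAI-compatible `choices[0].message` payload described above.
interface ReasoningMessage {
  content?: string;
  reasoning?: string;
}

// What a too-small max_tokens run looks like: no final answer, only a
// cut-off chain-of-thought.
const truncated: ReasoningMessage = {
  content: '',
  reasoning: "Here's a thinking process to...",
};

// `??` would keep the empty string; `||` falls through to reasoning,
// which the caller's cleanup / runRules fallback then deals with.
const picked = truncated.content || truncated.reasoning || '';
console.log(picked); // "Here's a thinking process to..."
```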
152 lines
5.7 KiB
TypeScript
/**
 * Shared HTTP transport for the mana-server and cloud backends.
 *
 * Both tiers POST to the same OpenAI-compatible endpoint on
 * services/mana-llm — they only differ in the `model:` string they
 * send (which selects which provider mana-llm internally routes to).
 *
 * The endpoint is `/v1/chat/completions` and the wire format is
 * straight OpenAI SSE: `data: {…}\n\n` lines, terminated by
 * `data: [DONE]`. The hand-rolled parser is the same shape as the
 * existing playground client (apps/mana/apps/web/src/lib/modules/
 * playground/llm.ts) so the two consumers stay aligned and can be
 * unified later if we want.
 */

import { BackendUnreachableError, ProviderBlockedError } from '../errors';
import type { LlmTier } from '../tiers';
import type { GenerateResult, LlmTaskRequest } from '../types';

const DEFAULT_LLM_URL = 'http://localhost:3025';

/** Resolve the mana-llm base URL from the window-injected env, falling
 * back to localhost. Mirrors the playground client pattern. */
export function resolveLlmBaseUrl(): string {
  if (typeof window !== 'undefined') {
    const fromWindow = (window as unknown as { __PUBLIC_MANA_LLM_URL__?: string })
      .__PUBLIC_MANA_LLM_URL__;
    if (fromWindow) return fromWindow.replace(/\/$/, '');
  }
  return DEFAULT_LLM_URL;
}

/**
 * Send a chat completion to mana-llm and return the result.
 *
 * Implementation notes:
 *
 * - We use the NON-streaming endpoint (`stream: false`). Curl tests
 *   from the same hostname showed that mana-llm's streaming endpoint
 *   works perfectly when called from outside the browser, but the
 *   browser receives `totalFrames=0` (an empty response body) for
 *   reasons that almost certainly trace back to CORS + credentials
 *   + streaming-body interactions. Non-streaming is a single JSON
 *   response, much friendlier to the browser fetch API.
 *
 * - We do NOT pass `credentials: 'include'`. The mana-llm service
 *   doesn't require user auth (the API key middleware accepts
 *   anonymous requests), and `credentials: 'include'` plus
 *   `Access-Control-Allow-Origin: *` is one of the patterns that
 *   silently breaks the response body in browsers. Verified by
 *   comparing curl-from-server (no creds, works) vs browser fetch
 *   (with creds, empty body).
 *
 * - For tasks that registered an `onToken` callback (legacy chat-
 *   style streaming UX), we fire it ONCE with the full content at
 *   the end. That's a degraded streaming experience, but no current
 *   shared-llm caller actually consumes the per-token stream — the
 *   queue + watcher model only cares about the final result. The
 *   playground module uses its own client (apps/.../modules/
 *   playground/llm.ts) which keeps real streaming for live UX.
 *
 * `tier` is only used for error tagging — both 'mana-server' and
 * 'cloud' call the same endpoint with different model strings.
 */
export async function callManaLlmStreaming(
  tier: Exclude<LlmTier, 'none' | 'browser'>,
  model: string,
  req: LlmTaskRequest
): Promise<GenerateResult> {
  const url = `${resolveLlmBaseUrl()}/v1/chat/completions`;
  const start = performance.now();

  let res: Response;
  try {
    res = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model,
        messages: req.messages,
        temperature: req.temperature ?? 0.7,
        max_tokens: req.maxTokens ?? 1024,
        stream: false,
      }),
    });
  } catch (err) {
    // Network failure — DNS, refused connection, CORS preflight, etc.
    throw new BackendUnreachableError(
      tier,
      undefined,
      err instanceof Error ? err.message : String(err)
    );
  }

  if (!res.ok) {
    const text = await res.text().catch(() => '');
    // 451 = upstream blocked content (we use this convention; Gemini
    // safety blocks are mapped to 451 in mana-llm's google provider).
    if (res.status === 451 || /safety|blocked|filter/i.test(text)) {
      throw new ProviderBlockedError(tier, text || `HTTP ${res.status}`);
    }
    throw new BackendUnreachableError(tier, res.status, text);
  }

  let json: {
    choices?: Array<{
      message?: { content?: string; reasoning?: string };
      text?: string;
    }>;
    usage?: { prompt_tokens?: number; completion_tokens?: number };
  };
  try {
    json = await res.json();
  } catch (err) {
    console.warn(`[shared-llm:${tier}] failed to parse response JSON`, err);
    throw new BackendUnreachableError(tier, res.status, 'invalid JSON response');
  }

  // Field ordering: prefer the canonical OpenAI `message.content` first.
  // If that's empty AND `message.reasoning` is set, fall back to it —
  // reasoning models like Gemma 4 emit their thought process there
  // when given too few tokens to also produce a final answer (we hit
  // this with max_tokens=10 / no system prompt: content was "" while
  // reasoning had the half-finished thought). For our title task this
  // rarely happens because the system prompt is directive, but the
  // fallback is cheap and protects against future tasks that might
  // trigger longer reasoning chains.
  const choice = json.choices?.[0];
  // `||` (not `??`) so an empty-string content actually falls through
  // to the reasoning field, matching the behaviour described above.
  const content = choice?.message?.content || choice?.message?.reasoning || choice?.text || '';

  if (!content) {
    console.warn(`[shared-llm:${tier}] empty completion content`, { model, json });
  }

  // One-shot "streaming" for any caller that registered onToken: emit
  // the whole content as a single chunk at the end. The current
  // orchestrator + queue model never reads tokens incrementally for
  // remote tiers anyway.
  if (content && req.onToken) {
    req.onToken(content);
  }

  return {
    content,
    usage: {
      promptTokens: json.usage?.prompt_tokens ?? 0,
      completionTokens: json.usage?.completion_tokens ?? 0,
      totalTokens: (json.usage?.prompt_tokens ?? 0) + (json.usage?.completion_tokens ?? 0),
    },
    latencyMs: Math.round(performance.now() - start),
  };
}
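A hypothetical caller, for orientation only. The tier and model strings are the ones mentioned above; the `LlmTaskRequest` message shape is assumed to be the standard OpenAI role/content form, since the transport forwards `req.messages` unchanged to `/v1/chat/completions`.

```ts
// Hypothetical usage — not part of the file above.
const result = await callManaLlmStreaming('mana-server', 'gemma4:e4b', {
  messages: [
    { role: 'system', content: 'Reply with a short title, nothing else.' },
    { role: 'user', content: 'Sunny walk by the lake with the kids' },
  ],
  maxTokens: 32, // temperature omitted — defaults to 0.7 in the transport
});

console.log(result.content);           // the generated title (or fallback text)
console.log(result.usage.totalTokens); // prompt + completion tokens
console.log(`${result.latencyMs}ms`);  // round-trip latency
```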