mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-16 20:59:41 +02:00
Diagnosis from the user's last test pinpointed the bug: mana-llm
returns totalFrames=0 (no SSE frames at all) when called from the
browser, but works perfectly when called via curl from the same host
with the same payload. Two compounding causes:
1. credentials: 'include' in our fetch combined with mana-llm's
CORS headers silently breaks the response body. This is the
classic "Access-Control-Allow-Origin: * + Access-Control-Allow-Credentials:
true" mismatch — browsers reject the response per spec but report
it as a 0-byte success rather than an error.
2. Streaming over CORS adds a second layer of fragility. Even if
credentials weren't an issue, the browser fetch API's response
body for SSE under CORS depends on a specific combination of
server headers we evidently don't have.
Fix: drop both the streaming AND the credentials.
- stream: false in the request body. Single JSON response per call,
much friendlier to the browser fetch API.
- No `credentials` field at all (default 'same-origin' for cross-
origin requests = don't send cookies). mana-llm's API key
middleware accepts anonymous requests, so we don't need to send
any auth context.
- Parse the response as `await res.json()` instead of streaming
SSE chunks. Pull `choice.message.content` (or fall back to
`choice.text` for legacy completions API responses).
- Backwards-compatibility shim for `req.onToken`: if a caller
registered a token callback (legacy chat-style streaming UX),
fire it ONCE with the full content at the end. The current
orchestrator + queue model never consumes per-token streams for
remote tiers, so this is a degraded-but-equivalent path. The
playground module uses its own client and isn't affected.
Verified manually with curl:
$ curl -X POST https://llm.mana.how/v1/chat/completions \
-H 'Content-Type: application/json' \
-d '{"model":"gemma3:4b","messages":[{"role":"user","content":"Hi"}],"max_tokens":50,"stream":false}'
→ returns clean JSON with `choices[0].message.content` populated.
Same call with `stream: true` from the same host also works (full
SSE frames come back). The bug really is browser+credentials
specific, not a service bug.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
143 lines
5.1 KiB
TypeScript
/**
 * Shared HTTP transport for the mana-server and cloud backends.
 *
 * Both tiers POST to the same OpenAI-compatible endpoint on
 * services/mana-llm — they only differ in the `model:` string they
 * send (which selects which provider mana-llm internally routes to).
 *
 * The endpoint is `/v1/chat/completions`, called with `stream: false`:
 * each request yields a single OpenAI-shaped JSON response rather than
 * an SSE stream. (The playground client at apps/mana/apps/web/src/lib/
 * modules/playground/llm.ts keeps real streaming for its live UX; the
 * two consumers can be unified later if we want.)
 */
import { BackendUnreachableError, ProviderBlockedError } from '../errors';
|
|
import type { LlmTier } from '../tiers';
|
|
import type { GenerateResult, LlmTaskRequest } from '../types';
|
|
|
|
const DEFAULT_LLM_URL = 'http://localhost:3025';
|
|
|
|
/** Resolve the mana-llm base URL from the window-injected env, falling
|
|
* back to localhost. Mirrors the playground client pattern. */
|
|
export function resolveLlmBaseUrl(): string {
|
|
if (typeof window !== 'undefined') {
|
|
const fromWindow = (window as unknown as { __PUBLIC_MANA_LLM_URL__?: string })
|
|
.__PUBLIC_MANA_LLM_URL__;
|
|
if (fromWindow) return fromWindow.replace(/\/$/, '');
|
|
}
|
|
return DEFAULT_LLM_URL;
|
|
}
|
|
|
|
/**
|
|
* Send a chat completion to mana-llm and return the result.
|
|
*
|
|
* Implementation notes:
|
|
*
|
|
* - We use the NON-streaming endpoint (`stream: false`). Curl tests
|
|
* from the same hostname showed that mana-llm's streaming endpoint
|
|
* works perfectly when called from outside the browser, but the
|
|
* browser receives `totalFrames=0` (an empty response body) for
|
|
* reasons that almost certainly trace back to CORS + credentials
|
|
* + streaming-body interactions. Non-streaming is a single JSON
|
|
* response, much friendlier to the browser fetch API.
|
|
*
|
|
* - We do NOT pass `credentials: 'include'`. The mana-llm service
|
|
* doesn't require user auth (the API key middleware accepts
|
|
* anonymous requests), and `credentials: 'include'` plus
|
|
* `Access-Control-Allow-Origin: *` is one of the patterns that
|
|
* silently breaks the response body in browsers. Verified by
|
|
* comparing curl-from-server (no creds, works) vs browser fetch
|
|
* (with creds, empty body).
|
|
*
|
|
* - For tasks that registered an `onToken` callback (legacy chat-
|
|
* style streaming UX), we fire it ONCE with the full content at
|
|
* the end. That's a degraded streaming experience, but no current
|
|
* shared-llm caller actually consumes the per-token stream — the
|
|
* queue + watcher model only cares about the final result. The
|
|
* playground module uses its own client (apps/.../modules/
|
|
* playground/llm.ts) which keeps real streaming for live UX.
|
|
*
|
|
* `tier` is only used for error tagging — both 'mana-server' and
|
|
* 'cloud' call the same endpoint with different model strings.
|
|
*/
|
|
export async function callManaLlmStreaming(
|
|
tier: Exclude<LlmTier, 'none' | 'browser'>,
|
|
model: string,
|
|
req: LlmTaskRequest
|
|
): Promise<GenerateResult> {
|
|
const url = `${resolveLlmBaseUrl()}/v1/chat/completions`;
|
|
const start = performance.now();
|
|
|
|
let res: Response;
|
|
try {
|
|
res = await fetch(url, {
|
|
method: 'POST',
|
|
headers: { 'Content-Type': 'application/json' },
|
|
body: JSON.stringify({
|
|
model,
|
|
messages: req.messages,
|
|
temperature: req.temperature ?? 0.7,
|
|
max_tokens: req.maxTokens ?? 1024,
|
|
stream: false,
|
|
}),
|
|
});
|
|
} catch (err) {
|
|
// Network failure — DNS, refused connection, CORS preflight, etc.
|
|
throw new BackendUnreachableError(
|
|
tier,
|
|
undefined,
|
|
err instanceof Error ? err.message : String(err)
|
|
);
|
|
}
|
|
|
|
if (!res.ok) {
|
|
const text = await res.text().catch(() => '');
|
|
// 451 = upstream blocked content (we use this convention; Gemini
|
|
// safety blocks are mapped to 451 in mana-llm's google provider).
|
|
if (res.status === 451 || /safety|blocked|filter/i.test(text)) {
|
|
throw new ProviderBlockedError(tier, text || `HTTP ${res.status}`);
|
|
}
|
|
throw new BackendUnreachableError(tier, res.status, text);
|
|
}
|
|
|
|
let json: {
|
|
choices?: Array<{
|
|
message?: { content?: string };
|
|
text?: string;
|
|
}>;
|
|
usage?: { prompt_tokens?: number; completion_tokens?: number };
|
|
};
|
|
try {
|
|
json = await res.json();
|
|
} catch (err) {
|
|
console.warn(`[shared-llm:${tier}] failed to parse response JSON`, err);
|
|
throw new BackendUnreachableError(tier, res.status, 'invalid JSON response');
|
|
}
|
|
|
|
const choice = json.choices?.[0];
|
|
const content = choice?.message?.content ?? choice?.text ?? '';
|
|
|
|
if (!content) {
|
|
console.warn(`[shared-llm:${tier}] empty completion content`, { model, json });
|
|
}
|
|
|
|
// One-shot "streaming" for any caller that registered onToken: emit
|
|
// the whole content as a single chunk at the end. The current
|
|
// orchestrator + queue model never reads tokens incrementally for
|
|
// remote tiers anyway.
|
|
if (content && req.onToken) {
|
|
req.onToken(content);
|
|
}
|
|
|
|
return {
|
|
content,
|
|
usage: {
|
|
promptTokens: json.usage?.prompt_tokens ?? 0,
|
|
completionTokens: json.usage?.completion_tokens ?? 0,
|
|
totalTokens: (json.usage?.prompt_tokens ?? 0) + (json.usage?.completion_tokens ?? 0),
|
|
},
|
|
latencyMs: Math.round(performance.now() - start),
|
|
};
|
|
}
|