mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-16 08:39:40 +02:00
Three improvements to the unified LLM infrastructure:

1. Ollama memory optimization (scripts/mac-mini/configure-ollama.sh):
   - OLLAMA_KEEP_ALIVE=5m → models unload after 5min idle (saves 3-16GB RAM)
   - OLLAMA_NUM_PARALLEL=1 → predictable memory usage
   - OLLAMA_MAX_LOADED_MODELS=1 → max 1 model in RAM at a time

2. Request-level metrics in @manacore/shared-llm:
   - LlmRequestMetrics interface (model, latency, tokens, fallback detection)
   - LlmMetricsCollector class with summary stats (for health endpoints)
   - Optional onMetrics callback in LlmModuleOptions
   - Automatic metrics emission in chatMessages() (success + error)

3. Chat streaming (token-by-token SSE):
   - Backend: POST /chat/completions/stream SSE endpoint
   - OllamaService.createStreamingCompletion() via llm.chatStreamMessages()
   - ChatService.createStreamingCompletion() with upfront credit consumption
   - Web: chatApi.createStreamingCompletion() SSE consumer
   - Chat store: sendMessage() now streams tokens into assistant message
   - UI updates reactively as each token arrives

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
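For context, a minimal sketch of how a non-NestJS consumer might wire the new onMetrics callback and the token streaming described above. The package entry point, URL, model names, and option field names below are illustrative assumptions; the authoritative option shape is ResolvedLlmOptions imported by client.ts.

// Sketch only - assumes LlmClient and LlmRequestMetrics are exported from the package root.
import { LlmClient } from '@manacore/shared-llm';
import type { LlmRequestMetrics } from '@manacore/shared-llm';

// Placeholder values; in the NestJS apps these are resolved by the LLM module.
const client = new LlmClient({
  manaLlmUrl: 'http://localhost:8000',
  defaultModel: 'llama3.1:8b',
  defaultVisionModel: 'llava:13b',
  timeout: 60_000,
  maxRetries: 2,
  debug: false,
  // Request-level metrics, e.g. to feed an LlmMetricsCollector or a health endpoint.
  onMetrics: (m: LlmRequestMetrics) => {
    console.log(`[llm] ${m.type} ${m.actualModel} ${m.latencyMs}ms fallback=${m.wasFallback}`);
  },
});

// Token-by-token streaming (the backend SSE endpoint builds on the same iterator).
for await (const token of client.chatStream('Hello!')) {
  process.stdout.write(token);
}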
392 lines
11 KiB
TypeScript
/**
 * Framework-agnostic LLM client that communicates with the mana-llm service.
 *
 * This is the core implementation shared between the NestJS LlmClientService
 * and the standalone LlmClient export (for non-NestJS consumers like bot-services).
 */

import type { ResolvedLlmOptions } from './interfaces/llm-options.interface';
import type {
  ChatMessage,
  ChatOptions,
  ChatResult,
  JsonOptions,
  JsonResult,
  VisionOptions,
  TokenUsage,
  ModelInfo,
  HealthStatus,
} from './types/chat.types';
import type {
  ChatCompletionRequest,
  ChatCompletionResponse,
  EmbeddingResponse,
} from './types/openai-compat.types';
import type { LlmRequestMetrics } from './utils/metrics';
import { extractJson } from './utils/json-extractor';
import { retryFetch } from './utils/retry';

function createTimeoutSignal(ms: number): any {
  const controller = new AbortController();
  setTimeout(() => controller.abort(), ms);
  return controller.signal;
}

export class LlmClient {
  private readonly baseUrl: string;
  private readonly options: ResolvedLlmOptions;

  constructor(options: ResolvedLlmOptions) {
    this.options = options;
    this.baseUrl = options.manaLlmUrl.replace(/\/+$/, '');
  }

  // ---------------------------------------------------------------------------
  // Text Chat
  // ---------------------------------------------------------------------------

  /** Simple chat with a single prompt string. */
  async chat(prompt: string, opts?: ChatOptions): Promise<ChatResult> {
    const messages = this.buildMessages(prompt, opts?.systemPrompt);
    return this.chatMessages(messages, opts);
  }

  /** Chat with full message history. */
  async chatMessages(messages: ChatMessage[], opts?: ChatOptions): Promise<ChatResult> {
    const requestedModel = opts?.model ?? this.options.defaultModel;
    const body = this.buildRequest(messages, opts, false);
    const start = Date.now();

    try {
      const response = await this.fetchCompletion(body, opts?.timeout);
      const latencyMs = Date.now() - start;
      const usage = response.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };

      this.emitMetrics({
        model: requestedModel,
        actualModel: response.model,
        type: 'chat',
        latencyMs,
        promptTokens: usage.prompt_tokens,
        completionTokens: usage.completion_tokens,
        totalTokens: usage.total_tokens,
        wasFallback: response.model !== requestedModel && !response.model.endsWith(requestedModel),
        success: true,
      });

      return {
        content: response.choices[0]?.message?.content ?? '',
        model: response.model,
        usage,
        latencyMs,
      };
    } catch (error) {
      this.emitMetrics({
        model: requestedModel,
        actualModel: requestedModel,
        type: 'chat',
        latencyMs: Date.now() - start,
        promptTokens: 0,
        completionTokens: 0,
        totalTokens: 0,
        wasFallback: false,
        success: false,
        error: error instanceof Error ? error.message : String(error),
      });
      throw error;
    }
  }
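
  /*
   * Usage sketch (illustrative only; assumes an already-constructed
   * `client: LlmClient` and example option values):
   *
   *   const reply = await client.chat('Summarize this ticket', {
   *     systemPrompt: 'You are a concise assistant.',
   *     temperature: 0.2,
   *   });
   *
   *   const followUp = await client.chatMessages(
   *     [
   *       { role: 'user', content: 'Summarize this ticket' },
   *       { role: 'assistant', content: reply.content },
   *       { role: 'user', content: 'Now list the action items.' },
   *     ],
   *     { maxTokens: 256 }
   *   );
   */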

  // ---------------------------------------------------------------------------
  // Streaming
  // ---------------------------------------------------------------------------

  /** Streaming chat - returns an async iterable of text tokens. */
  async *chatStream(prompt: string, opts?: ChatOptions): AsyncIterable<string> {
    const messages = this.buildMessages(prompt, opts?.systemPrompt);
    yield* this.chatStreamMessages(messages, opts);
  }

  /** Streaming chat with full message history. */
  async *chatStreamMessages(messages: ChatMessage[], opts?: ChatOptions): AsyncIterable<string> {
    const body = this.buildRequest(messages, opts, true);
    const timeout = opts?.timeout ?? this.options.timeout;

    const response = await retryFetch(
      `${this.baseUrl}/v1/chat/completions`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(body),
        signal: createTimeoutSignal(timeout),
      },
      { maxRetries: this.options.maxRetries }
    );

    if (!response.ok) {
      const text = await response.text().catch(() => '');
      throw new Error(`mana-llm stream error ${response.status}: ${text}`);
    }

    if (!response.body) {
      throw new Error('mana-llm returned no response body for stream');
    }

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        buffer += decoder.decode(value, { stream: true });
        const lines = buffer.split('\n');
        buffer = lines.pop() ?? '';

        for (const line of lines) {
          const trimmed = line.trim();
          if (!trimmed || !trimmed.startsWith('data: ')) continue;

          const data = trimmed.slice(6);
          if (data === '[DONE]') return;

          try {
            const chunk = JSON.parse(data);
            const content = chunk.choices?.[0]?.delta?.content;
            if (content) yield content;
          } catch {
            // Skip unparseable chunks
          }
        }
      }
    } finally {
      reader.releaseLock();
    }
  }
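
  /*
   * Usage sketch (illustrative only; assumes an existing `client: LlmClient`
   * and a writable output such as process.stdout outside this class):
   *
   *   for await (const token of client.chatStream('Tell me a short story')) {
   *     process.stdout.write(token);
   *   }
   */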

  // ---------------------------------------------------------------------------
  // Structured JSON Output
  // ---------------------------------------------------------------------------

  /** Chat that extracts and parses JSON from the response. */
  async json<T = unknown>(prompt: string, opts?: JsonOptions<T>): Promise<JsonResult<T>> {
    const messages = this.buildMessages(prompt, opts?.systemPrompt);
    return this.jsonMessages<T>(messages, opts);
  }

  /** JSON extraction from full message history. */
  async jsonMessages<T = unknown>(
    messages: ChatMessage[],
    opts?: JsonOptions<T>
  ): Promise<JsonResult<T>> {
    const maxAttempts = (opts?.jsonRetries ?? 1) + 1;
    let lastError: Error | undefined;

    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      const result = await this.chatMessages(messages, opts);

      try {
        const data = extractJson<T>(result.content, opts?.validate);
        return { ...result, data };
      } catch (error) {
        lastError = error instanceof Error ? error : new Error(String(error));
        if (this.options.debug) {
          console.warn(
            `[shared-llm] JSON extraction attempt ${attempt + 1}/${maxAttempts} failed:`,
            lastError.message
          );
        }
      }
    }

    throw lastError ?? new Error('JSON extraction failed');
  }
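
  /*
   * Usage sketch (illustrative only; TicketSummary is an example shape, and
   * the optional validate callback is omitted here - see JsonOptions for its
   * exact signature):
   *
   *   interface TicketSummary { title: string; priority: string }
   *
   *   const { data, model, usage } = await client.json<TicketSummary>(
   *     'Summarize this ticket as JSON with "title" and "priority" fields.',
   *     { jsonRetries: 2 }
   *   );
   */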

  // ---------------------------------------------------------------------------
  // Vision
  // ---------------------------------------------------------------------------

  /** Analyze an image with a text prompt. */
  async vision(
    prompt: string,
    imageBase64: string,
    mimeType?: string,
    opts?: VisionOptions
  ): Promise<ChatResult> {
    const messages = this.buildVisionMessages(prompt, imageBase64, mimeType, opts?.systemPrompt);
    const model = opts?.visionModel ?? this.options.defaultVisionModel;
    return this.chatMessages(messages, { ...opts, model });
  }

  /** Vision + JSON extraction. */
  async visionJson<T = unknown>(
    prompt: string,
    imageBase64: string,
    mimeType?: string,
    opts?: VisionOptions & JsonOptions<T>
  ): Promise<JsonResult<T>> {
    const messages = this.buildVisionMessages(prompt, imageBase64, mimeType, opts?.systemPrompt);
    const model = opts?.visionModel ?? this.options.defaultVisionModel;
    return this.jsonMessages<T>(messages, { ...opts, model });
  }
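
  /*
   * Usage sketch (illustrative only; the file path is an example and the image
   * is loaded outside this class):
   *
   *   import { readFileSync } from 'node:fs';
   *
   *   const imageBase64 = readFileSync('./receipt.jpg').toString('base64');
   *   const result = await client.vision(
   *     'What is the total amount on this receipt?',
   *     imageBase64,
   *     'image/jpeg'
   *   );
   */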

  // ---------------------------------------------------------------------------
  // Embeddings
  // ---------------------------------------------------------------------------

  /** Generate embeddings for text input. */
  async embed(
    input: string | string[],
    model?: string
  ): Promise<{ embeddings: number[][]; usage: TokenUsage }> {
    const response = await retryFetch(
      `${this.baseUrl}/v1/embeddings`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          model: model ?? this.options.defaultModel,
          input,
        }),
        signal: createTimeoutSignal(this.options.timeout),
      },
      { maxRetries: this.options.maxRetries }
    );

    if (!response.ok) {
      const text = await response.text().catch(() => '');
      throw new Error(`mana-llm embeddings error ${response.status}: ${text}`);
    }

    const data = (await response.json()) as EmbeddingResponse;
    return {
      embeddings: data.data.map((d) => d.embedding),
      usage: data.usage,
    };
  }
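
  /*
   * Usage sketch (illustrative only; the model name is an example and depends
   * on what the mana-llm service actually exposes):
   *
   *   const { embeddings } = await client.embed(
   *     ['first document', 'second document'],
   *     'nomic-embed-text'
   *   );
   *   // embeddings[0] and embeddings[1] are number[] vectors of equal length
   */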

  // ---------------------------------------------------------------------------
  // Health & Models
  // ---------------------------------------------------------------------------

  /** Check mana-llm health and provider status. */
  async health(): Promise<HealthStatus> {
    try {
      const response = await fetch(`${this.baseUrl}/health`, {
        signal: createTimeoutSignal(5_000),
      });
      if (!response.ok) {
        return { status: 'unhealthy', providers: {} };
      }
      return (await response.json()) as HealthStatus;
    } catch {
      return { status: 'unhealthy', providers: {} };
    }
  }

  /** List available models from all providers. */
  async listModels(): Promise<ModelInfo[]> {
    const response = await fetch(`${this.baseUrl}/v1/models`, {
      signal: createTimeoutSignal(10_000),
    });

    if (!response.ok) {
      throw new Error(`mana-llm models error ${response.status}`);
    }

    const data = (await response.json()) as { data: ModelInfo[] };
    return data.data ?? [];
  }

  // ---------------------------------------------------------------------------
  // Private helpers
  // ---------------------------------------------------------------------------

  private buildMessages(prompt: string, systemPrompt?: string): ChatMessage[] {
    const messages: ChatMessage[] = [];
    if (systemPrompt) {
      messages.push({ role: 'system', content: systemPrompt });
    }
    messages.push({ role: 'user', content: prompt });
    return messages;
  }

  private buildVisionMessages(
    prompt: string,
    imageBase64: string,
    mimeType?: string,
    systemPrompt?: string
  ): ChatMessage[] {
    const mime = mimeType ?? 'image/jpeg';
    const dataUrl = imageBase64.startsWith('data:')
      ? imageBase64
      : `data:${mime};base64,${imageBase64}`;

    const messages: ChatMessage[] = [];
    if (systemPrompt) {
      messages.push({ role: 'system', content: systemPrompt });
    }
    messages.push({
      role: 'user',
      content: [
        { type: 'text', text: prompt },
        { type: 'image_url', image_url: { url: dataUrl } },
      ],
    });
    return messages;
  }

  private buildRequest(
    messages: ChatMessage[],
    opts: ChatOptions | undefined,
    stream: boolean
  ): ChatCompletionRequest {
    const request: ChatCompletionRequest = {
      model: opts?.model ?? this.options.defaultModel,
      messages,
      stream,
    };

    if (opts?.temperature !== undefined) request.temperature = opts.temperature;
    if (opts?.maxTokens !== undefined) request.max_tokens = opts.maxTokens;

    return request;
  }

  private async fetchCompletion(
    body: ChatCompletionRequest,
    timeoutOverride?: number
  ): Promise<ChatCompletionResponse> {
    const timeout = timeoutOverride ?? this.options.timeout;

    const response = await retryFetch(
      `${this.baseUrl}/v1/chat/completions`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(body),
        signal: createTimeoutSignal(timeout),
      },
      { maxRetries: this.options.maxRetries }
    );

    if (!response.ok) {
      const text = await response.text().catch(() => '');
      throw new Error(`mana-llm error ${response.status}: ${text}`);
    }

    return (await response.json()) as ChatCompletionResponse;
  }

  private emitMetrics(metrics: LlmRequestMetrics): void {
    if (this.options.onMetrics) {
      try {
        this.options.onMetrics(metrics);
      } catch {
        // Never let metrics callback break the request
      }
    }
  }
}