mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-20 01:01:25 +02:00
Three improvements to the unified LLM infrastructure:

1. Ollama memory optimization (scripts/mac-mini/configure-ollama.sh):
   - OLLAMA_KEEP_ALIVE=5m → models unload after 5min idle (saves 3-16GB RAM)
   - OLLAMA_NUM_PARALLEL=1 → predictable memory usage
   - OLLAMA_MAX_LOADED_MODELS=1 → max 1 model in RAM at a time

2. Request-level metrics in @manacore/shared-llm:
   - LlmRequestMetrics interface (model, latency, tokens, fallback detection)
   - LlmMetricsCollector class with summary stats (for health endpoints)
   - Optional onMetrics callback in LlmModuleOptions
   - Automatic metrics emission in chatMessages() (success + error)

3. Chat streaming (token-by-token SSE):
   - Backend: POST /chat/completions/stream SSE endpoint
   - OllamaService.createStreamingCompletion() via llm.chatStreamMessages()
   - ChatService.createStreamingCompletion() with upfront credit consumption
   - Web: chatApi.createStreamingCompletion() SSE consumer
   - Chat store: sendMessage() now streams tokens into assistant message
   - UI updates reactively as each token arrives

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
52 lines
1.7 KiB
TypeScript
52 lines
1.7 KiB
TypeScript
import type { ModuleMetadata, Type } from '@nestjs/common';
|
|
import type { MetricsCallback } from '../utils/metrics';
|
|
|
|
/**
 * Caller-supplied configuration for the LLM module.
 *
 * Every field is optional; `resolveOptions` substitutes the default noted on
 * each field for anything that is left unset.
 */
export interface LlmModuleOptions {
  /** mana-llm service URL (default: http://localhost:3025) */
  manaLlmUrl?: string;

  /** Default text model (default: ollama/gemma3:4b) */
  defaultModel?: string;

  /** Default vision model (default: ollama/llava:7b) */
  defaultVisionModel?: string;

  /** Request timeout in ms (default: 120000) */
  timeout?: number;

  /** Max retries on transient failures (default: 2) */
  maxRetries?: number;

  /** Enable debug logging (default: false) */
  debug?: boolean;

  /** Optional callback invoked after every LLM request with metrics */
  onMetrics?: MetricsCallback;
}
|
|
|
|
/**
 * Options for configuring the LLM module when the configuration is not
 * available synchronously.
 *
 * Carries the `imports` field of NestJS `ModuleMetadata` plus three
 * alternative ways of naming an options source. The shape mirrors the common
 * NestJS async-module pattern; how each field is consumed is decided by the
 * module that accepts this object, which is not shown here.
 */
export interface LlmModuleAsyncOptions extends Pick<ModuleMetadata, 'imports'> {
  /**
   * Class type of an `LlmOptionsFactory`.
   * NOTE(review): presumably looked up among already-registered providers — confirm against the module.
   */
  useExisting?: Type<LlmOptionsFactory>;

  /**
   * Class type of an `LlmOptionsFactory`.
   * NOTE(review): presumably instantiated by the module itself — confirm against the module.
   */
  useClass?: Type<LlmOptionsFactory>;

  /**
   * Factory that returns the options either directly or as a promise.
   *
   * `any[]` is kept on purpose: a factory with concretely typed parameters
   * would not be assignable to `(...args: unknown[]) => …` under
   * `strictFunctionTypes`.
   */
  useFactory?: (...args: any[]) => Promise<LlmModuleOptions> | LlmModuleOptions;

  /**
   * Values associated with the factory's arguments.
   * NOTE(review): presumably injection tokens resolved in order — confirm against the module.
   */
  inject?: any[];
}
|
|
|
|
/**
 * Contract for an object that can produce `LlmModuleOptions`.
 *
 * Referenced by the `useExisting` and `useClass` members of
 * `LlmModuleAsyncOptions`.
 */
export interface LlmOptionsFactory {
  /**
   * Builds the module options.
   *
   * @returns The options, either directly or wrapped in a promise.
   */
  createLlmOptions(): Promise<LlmModuleOptions> | LlmModuleOptions;
}
|
|
|
|
/**
 * Fully defaulted form of `LlmModuleOptions`, as returned by `resolveOptions`.
 *
 * Every setting is guaranteed to be present except `onMetrics`, which stays
 * optional because it has no default.
 */
export interface ResolvedLlmOptions {
  /** mana-llm service URL. */
  manaLlmUrl: string;

  /** Default text model. */
  defaultModel: string;

  /** Default vision model. */
  defaultVisionModel: string;

  /** Request timeout in ms. */
  timeout: number;

  /** Max retries on transient failures. */
  maxRetries: number;

  /** Whether debug logging is enabled. */
  debug: boolean;

  /** Metrics callback, if the caller supplied one. */
  onMetrics?: MetricsCallback;
}
|
|
|
|
/**
 * Fills in defaults for every option the caller left unset.
 *
 * Beyond missing (`undefined`/`null`) values, settings that can never be
 * usable also fall back to their default: blank strings for the URL and model
 * names, and `NaN` or negative numbers for `timeout` and `maxRetries`. Such
 * values typically come from unset environment variables (for example
 * `Number(process.env.X)`), not from a deliberate choice. A value of `0` is
 * valid and is preserved.
 *
 * @param options - Caller-supplied module options.
 * @returns A new object with every setting populated; `onMetrics` is passed
 *          through unchanged and may be `undefined`.
 */
export function resolveOptions(options: LlmModuleOptions): ResolvedLlmOptions {
  return {
    manaLlmUrl: pickNonBlankString(options.manaLlmUrl, 'http://localhost:3025'),
    defaultModel: pickNonBlankString(options.defaultModel, 'ollama/gemma3:4b'),
    defaultVisionModel: pickNonBlankString(options.defaultVisionModel, 'ollama/llava:7b'),
    timeout: pickNonNegativeNumber(options.timeout, 120_000),
    maxRetries: pickNonNegativeNumber(options.maxRetries, 2),
    // `??` (not `||`) so an explicit `false` is kept as-is.
    debug: options.debug ?? false,
    onMetrics: options.onMetrics,
  };
}

/** Returns `value` unchanged when it is a string with non-whitespace content; otherwise `fallback`. */
function pickNonBlankString(value: string | null | undefined, fallback: string): string {
  return typeof value === 'string' && value.trim().length > 0 ? value : fallback;
}

/** Returns `value` when it is a number >= 0 (which excludes `NaN`); otherwise `fallback`. */
function pickNonNegativeNumber(value: number | null | undefined, fallback: number): number {
  return typeof value === 'number' && value >= 0 ? value : fallback;
}
|