mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-18 07:49:41 +02:00
Three improvements to the unified LLM infrastructure:

1. Ollama memory optimization (scripts/mac-mini/configure-ollama.sh):
   - OLLAMA_KEEP_ALIVE=5m → models unload after 5min idle (saves 3-16GB RAM)
   - OLLAMA_NUM_PARALLEL=1 → predictable memory usage
   - OLLAMA_MAX_LOADED_MODELS=1 → max 1 model in RAM at a time

2. Request-level metrics in @manacore/shared-llm:
   - LlmRequestMetrics interface (model, latency, tokens, fallback detection)
   - LlmMetricsCollector class with summary stats (for health endpoints)
   - Optional onMetrics callback in LlmModuleOptions
   - Automatic metrics emission in chatMessages() (success + error)

3. Chat streaming (token-by-token SSE):
   - Backend: POST /chat/completions/stream SSE endpoint
   - OllamaService.createStreamingCompletion() via llm.chatStreamMessages()
   - ChatService.createStreamingCompletion() with upfront credit consumption
   - Web: chatApi.createStreamingCompletion() SSE consumer
   - Chat store: sendMessage() now streams tokens into assistant message
   - UI updates reactively as each token arrives

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
88 lines
2.6 KiB
TypeScript
88 lines
2.6 KiB
TypeScript
/**
 * Request-level metrics for LLM calls.
 *
 * Provides an optional callback system that backends can hook into
 * for monitoring, logging, or forwarding to Prometheus/Grafana.
 */

/**
 * Metrics record describing a single LLM request.
 *
 * One record is handed to a `MetricsCallback` per request, for both
 * successful and failed calls (see `success` / `error`).
 */
export interface LlmRequestMetrics {
	/** Model requested (e.g. "ollama/gemma3:4b") */
	model: string;
	/** Model actually used (may differ if fallback occurred) */
	actualModel: string;
	/** Request type */
	type: 'chat' | 'json' | 'vision' | 'visionJson' | 'embed' | 'stream';
	/** Total request duration in ms */
	latencyMs: number;
	/** Token usage: prompt tokens */
	promptTokens: number;
	/** Token usage: completion tokens */
	completionTokens: number;
	/** Token usage: total tokens for the request */
	totalTokens: number;
	/** Whether this request was a fallback (model differs from requested) */
	wasFallback: boolean;
	/** Whether the request succeeded */
	success: boolean;
	/** Error message if failed */
	error?: string;
}

/**
 * Callback invoked with the metrics record of one LLM request.
 * The return value is ignored.
 */
export type MetricsCallback = (metrics: LlmRequestMetrics) => void;

/**
 * Simple in-memory metrics aggregator.
 * Useful for health endpoints and debugging.
 *
 * Pass `collector.collect` wherever a `MetricsCallback` is expected; it is an
 * arrow-function property, so it keeps its `this` binding when detached from
 * the instance. Counters accumulate until `reset()` is called.
 */
export class LlmMetricsCollector {
	private _totalRequests = 0;
	private _totalErrors = 0;
	private _totalFallbacks = 0;
	private _totalTokens = 0;
	private _totalLatencyMs = 0;
	/** Per-model counters, keyed by the model that actually served the request. */
	private readonly _byModel: Map<string, { requests: number; tokens: number; errors: number }> =
		new Map();

	/** Returns `value` when it is a finite number, otherwise 0. */
	private static finiteOrZero(value: number): number {
		return Number.isFinite(value) ? value : 0;
	}

	/**
	 * Use as MetricsCallback.
	 *
	 * Folds one request record into the global totals and into the per-model
	 * bucket for `metrics.actualModel`.
	 */
	readonly collect = (metrics: LlmRequestMetrics): void => {
		// A single non-finite value (NaN/Infinity) would otherwise poison the
		// running totals until reset(); count such values as 0 instead.
		const tokens = LlmMetricsCollector.finiteOrZero(metrics.totalTokens);
		const latencyMs = LlmMetricsCollector.finiteOrZero(metrics.latencyMs);

		this._totalRequests++;
		this._totalLatencyMs += latencyMs;
		this._totalTokens += tokens;

		if (!metrics.success) this._totalErrors++;
		if (metrics.wasFallback) this._totalFallbacks++;

		const modelKey = metrics.actualModel;
		const existing = this._byModel.get(modelKey) ?? { requests: 0, tokens: 0, errors: 0 };
		existing.requests++;
		existing.tokens += tokens;
		if (!metrics.success) existing.errors++;
		this._byModel.set(modelKey, existing);
	};

	/**
	 * Get summary stats for health endpoints / dashboards.
	 *
	 * `averageLatencyMs` is rounded to whole milliseconds; `fallbackRate` and
	 * `errorRate` are whole-number percentages (0-100). All three are 0 when no
	 * request has been collected. The returned object is a detached snapshot:
	 * it is not affected by later `collect()` calls, and mutating it does not
	 * affect the collector.
	 */
	getSummary() {
		const total = this._totalRequests;
		const percentOfTotal = (count: number): number =>
			total > 0 ? Math.round((count / total) * 100) : 0;

		return {
			totalRequests: total,
			totalErrors: this._totalErrors,
			totalFallbacks: this._totalFallbacks,
			totalTokens: this._totalTokens,
			averageLatencyMs: total > 0 ? Math.round(this._totalLatencyMs / total) : 0,
			fallbackRate: percentOfTotal(this._totalFallbacks),
			errorRate: percentOfTotal(this._totalErrors),
			// Copy each bucket so callers never hold a reference to internal state.
			byModel: Object.fromEntries(
				[...this._byModel].map(([model, stats]) => [model, { ...stats }]),
			),
		};
	}

	/** Reset all counters */
	reset(): void {
		this._totalRequests = 0;
		this._totalErrors = 0;
		this._totalFallbacks = 0;
		this._totalTokens = 0;
		this._totalLatencyMs = 0;
		this._byModel.clear();
	}
}