managarten/packages/shared-llm/src/utils/metrics.ts
Till JS 56ffcbac39 feat: add Ollama memory optimization, LLM metrics, and chat streaming
Three improvements to the unified LLM infrastructure:

1. Ollama memory optimization (scripts/mac-mini/configure-ollama.sh):
   - OLLAMA_KEEP_ALIVE=5m → models unload after 5min idle (saves 3-16GB RAM)
   - OLLAMA_NUM_PARALLEL=1 → predictable memory usage
   - OLLAMA_MAX_LOADED_MODELS=1 → max 1 model in RAM at a time

2. Request-level metrics in @manacore/shared-llm:
   - LlmRequestMetrics interface (model, latency, tokens, fallback detection)
   - LlmMetricsCollector class with summary stats (for health endpoints)
   - Optional onMetrics callback in LlmModuleOptions
   - Automatic metrics emission in chatMessages() (success + error)

3. Chat streaming (token-by-token SSE):
   - Backend: POST /chat/completions/stream SSE endpoint
   - OllamaService.createStreamingCompletion() via llm.chatStreamMessages()
   - ChatService.createStreamingCompletion() with upfront credit consumption
   - Web: chatApi.createStreamingCompletion() SSE consumer
   - Chat store: sendMessage() now streams tokens into assistant message
   - UI updates reactively as each token arrives

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 09:41:33 +01:00

88 lines
2.6 KiB
TypeScript

/**
 * Metrics record describing a single LLM request.
 *
 * One record is produced per call and handed to an optional callback, so that
 * backends can use it for monitoring, logging, or forwarding to
 * Prometheus/Grafana.
 */
export interface LlmRequestMetrics {
  /** The model the caller asked for, e.g. "ollama/gemma3:4b". */
  model: string;

  /** The model that actually served the request; differs from `model` after a fallback. */
  actualModel: string;

  /** Which kind of request this was. */
  type:
    | 'chat'
    | 'json'
    | 'vision'
    | 'visionJson'
    | 'embed'
    | 'stream';

  /** Total duration of the request, in milliseconds. */
  latencyMs: number;

  /** Token usage: tokens in the prompt. */
  promptTokens: number;

  /** Token usage: tokens in the completion. */
  completionTokens: number;

  /** Token usage: overall token count for the request. */
  totalTokens: number;

  /** True when the request was served by a fallback (model differs from the one requested). */
  wasFallback: boolean;

  /** True when the request succeeded. */
  success: boolean;

  /** Error message, present when the request failed. */
  error?: string;
}
/**
 * Signature of a metrics sink: a function that receives one
 * {@link LlmRequestMetrics} record and whose return value is ignored.
 *
 * `LlmMetricsCollector.collect` is written to be usable as a value of this type.
 */
export type MetricsCallback = (metrics: LlmRequestMetrics) => void;
/** Per-model counters tracked by the collector. */
interface LlmModelStats {
  requests: number;
  tokens: number;
  errors: number;
}

/**
 * Simple in-memory metrics aggregator.
 * Useful for health endpoints and debugging.
 *
 * Feed it request metrics through {@link LlmMetricsCollector.collect} and read
 * the aggregated view with {@link LlmMetricsCollector.getSummary}. All state
 * lives in this instance and is cleared by {@link LlmMetricsCollector.reset}.
 */
export class LlmMetricsCollector {
  private _totalRequests = 0;
  private _totalErrors = 0;
  private _totalFallbacks = 0;
  private _totalTokens = 0;
  private _totalLatencyMs = 0;
  /** Counters keyed by the model that actually served the request. */
  private readonly _byModel = new Map<string, LlmModelStats>();

  /**
   * Returns `value` when it is a finite number, otherwise 0.
   *
   * Guards the running totals: adding a single `NaN`/`Infinity` would
   * otherwise poison every later summary until `reset()` is called.
   */
  private static finiteOrZero(value: number): number {
    return Number.isFinite(value) ? value : 0;
  }

  /**
   * Record one request. Use as MetricsCallback.
   *
   * Defined as an arrow-function property so it stays bound to this instance
   * when passed around as a bare callback.
   *
   * @param metrics - The request to fold into the totals. Per-model counters
   *   are keyed by `metrics.actualModel`. Non-finite `latencyMs` or
   *   `totalTokens` values are counted as 0.
   */
  readonly collect = (metrics: LlmRequestMetrics): void => {
    const tokens = LlmMetricsCollector.finiteOrZero(metrics.totalTokens);
    const latencyMs = LlmMetricsCollector.finiteOrZero(metrics.latencyMs);
    const failed = !metrics.success;

    this._totalRequests++;
    this._totalLatencyMs += latencyMs;
    this._totalTokens += tokens;
    if (failed) this._totalErrors++;
    if (metrics.wasFallback) this._totalFallbacks++;

    let stats = this._byModel.get(metrics.actualModel);
    if (stats === undefined) {
      stats = { requests: 0, tokens: 0, errors: 0 };
      this._byModel.set(metrics.actualModel, stats);
    }
    stats.requests++;
    stats.tokens += tokens;
    if (failed) stats.errors++;
  };

  /**
   * Get summary stats for health endpoints / dashboards.
   *
   * @returns A point-in-time snapshot. `averageLatencyMs` is rounded to the
   *   nearest integer and averaged over all recorded requests; `fallbackRate`
   *   and `errorRate` are whole-number percentages of all recorded requests.
   *   All three are 0 when nothing has been recorded. `byModel` contains
   *   copies of the per-model counters, so the snapshot does not change when
   *   more requests are collected and mutating it does not affect the
   *   collector.
   */
  getSummary() {
    const total = this._totalRequests;
    const percentOfTotal = (count: number): number =>
      total > 0 ? Math.round((count / total) * 100) : 0;

    return {
      totalRequests: total,
      totalErrors: this._totalErrors,
      totalFallbacks: this._totalFallbacks,
      totalTokens: this._totalTokens,
      averageLatencyMs: total > 0 ? Math.round(this._totalLatencyMs / total) : 0,
      fallbackRate: percentOfTotal(this._totalFallbacks),
      errorRate: percentOfTotal(this._totalErrors),
      // Copy each entry: handing out the live objects would let later
      // collect() calls (or the caller) silently rewrite this snapshot.
      byModel: Object.fromEntries(
        Array.from(
          this._byModel,
          ([model, stats]): [string, LlmModelStats] => [model, { ...stats }],
        ),
      ),
    };
  }

  /** Reset all counters, including the per-model breakdown. */
  reset(): void {
    this._totalRequests = 0;
    this._totalErrors = 0;
    this._totalFallbacks = 0;
    this._totalTokens = 0;
    this._totalLatencyMs = 0;
    this._byModel.clear();
  }
}