feat: add Ollama memory optimization, LLM metrics, and chat streaming

Three improvements to the unified LLM infrastructure: 1. Ollama memory optimization (scripts/mac-mini/configure-ollama.sh): - OLLAMA_KEEP_ALIVE=5m → models unload after 5min idle (saves 3-16GB RAM) - OLLAMA_NUM_PARALLEL=1 → predictable memory usage - OLLAMA_MAX_LOADED_MODELS=1 → max 1 model in RAM at a time 2. Request-level metrics in @manacore/shared-llm: - LlmRequestMetrics interface (model, latency, tokens, fallback detection) - LlmMetricsCollector class with summary stats (for health endpoints) - Optional onMetrics callback in LlmModuleOptions - Automatic metrics emission in chatMessages() (success + error) 3. Chat streaming (token-by-token SSE): - Backend: POST /chat/completions/stream SSE endpoint - OllamaService.createStreamingCompletion() via llm.chatStreamMessages() - ChatService.createStreamingCompletion() with upfront credit consumption - Web: chatApi.createStreamingCompletion() SSE consumer - Chat store: sendMessage() now streams tokens into assistant message - UI updates reactively as each token arrives Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-23 15:26:42 +02:00 · 2026-03-24 09:41:33 +01:00 · 2026-03-24 09:41:33 +01:00 · 56ffcbac39
commit 56ffcbac39
parent ecda4535d8
13 changed files with 462 additions and 29 deletions
--- a/packages/shared-llm/src/utils/index.ts
+++ b/packages/shared-llm/src/utils/index.ts
@ -1,3 +1,5 @@
 export { extractJson } from './json-extractor';
 export { retryFetch } from './retry';
 export type { RetryOptions } from './retry';
+export { LlmMetricsCollector } from './metrics';
+export type { LlmRequestMetrics, MetricsCallback } from './metrics';
--- a/packages/shared-llm/src/utils/metrics.ts
+++ b/packages/shared-llm/src/utils/metrics.ts
@ -0,0 +1,88 @@
+/**
+ * Request-level metrics for LLM calls.
+ *
+ * Provides an optional callback system that backends can hook into
+ * for monitoring, logging, or forwarding to Prometheus/Grafana.
+ */
+
+export interface LlmRequestMetrics { // One record per LLM request; this is the payload handed to a MetricsCallback.
+	/** Model requested (e.g. "ollama/gemma3:4b") */
+	model: string;
+	/** Model actually used (may differ from `model` if fallback occurred) */
+	actualModel: string;
+	/** Request type: which kind of LLM call produced this record */
+	type: 'chat' | 'json' | 'vision' | 'visionJson' | 'embed' | 'stream';
+	/** Total request duration in ms */
+	latencyMs: number;
+	/** Token usage (covers the next three fields; presumably totalTokens = promptTokens + completionTokens — TODO confirm with the emitter) */
+	promptTokens: number;
+	completionTokens: number;
+	totalTokens: number;
+	/** Whether this request was a fallback (model differs from requested) */
+	wasFallback: boolean;
+	/** Whether the request succeeded */
+	success: boolean;
+	/** Error message if failed; optional, so it may be absent */
+	error?: string;
+}
+
+export type MetricsCallback = (metrics: LlmRequestMetrics) => void; // Receives one LlmRequestMetrics record; any return value is ignored (void).
+
+/**
+ * Simple in-memory metrics aggregator: keeps running totals plus per-model counters keyed by `actualModel`.
+ * Useful for health endpoints and debugging. Feed records in via collect(), read via getSummary(), clear via reset().
+ */
+export class LlmMetricsCollector {
+	private _totalRequests = 0; // number of records passed to collect(), successful or not
+	private _totalErrors = 0; // records with success === false
+	private _totalFallbacks = 0; // records with wasFallback === true
+	private _totalTokens = 0; // sum of totalTokens over all records
+	private _totalLatencyMs = 0; // sum of latencyMs over all records; averaged in getSummary()
+	private _byModel: Map<string, { requests: number; tokens: number; errors: number }> = new Map(); // key: metrics.actualModel
+
+	/** Use as MetricsCallback. Declared as an arrow-function property, so `this` stays bound when the function is passed around detached. */
+	readonly collect = (metrics: LlmRequestMetrics): void => {
+		this._totalRequests++; // failed requests are counted too; their latency and tokens are added below as well
+		this._totalLatencyMs += metrics.latencyMs;
+		this._totalTokens += metrics.totalTokens;
+
+		if (!metrics.success) this._totalErrors++;
+		if (metrics.wasFallback) this._totalFallbacks++;
+
+		const modelKey = metrics.actualModel; // bucket by the model actually used, not the one requested
+		const existing = this._byModel.get(modelKey) ?? { requests: 0, tokens: 0, errors: 0 }; // first record for a model starts from zeroed counters
+		existing.requests++;
+		existing.tokens += metrics.totalTokens;
+		if (!metrics.success) existing.errors++;
+		this._byModel.set(modelKey, existing); // stores a newly created entry; re-sets the same object otherwise
+	};
+
+	/** Get summary stats for health endpoints / dashboards. Average latency is rounded to whole ms; rates are whole-number percentages; all three are 0 when nothing has been collected. */
+	getSummary() {
+		return {
+			totalRequests: this._totalRequests,
+			totalErrors: this._totalErrors,
+			totalFallbacks: this._totalFallbacks,
+			totalTokens: this._totalTokens,
+			averageLatencyMs:
+				this._totalRequests > 0 ? Math.round(this._totalLatencyMs / this._totalRequests) : 0,
+			fallbackRate:
+				this._totalRequests > 0
+					? Math.round((this._totalFallbacks / this._totalRequests) * 100)
+					: 0,
+			errorRate:
+				this._totalRequests > 0 ? Math.round((this._totalErrors / this._totalRequests) * 100) : 0,
+			byModel: Object.fromEntries(this._byModel), // NOTE(review): values are the live internal counter objects, not copies — a later collect() for the same model mutates an already-returned summary
+		};
+	}
+
+	/** Reset all counters to zero and drop every per-model entry */
+	reset(): void {
+		this._totalRequests = 0;
+		this._totalErrors = 0;
+		this._totalFallbacks = 0;
+		this._totalTokens = 0;
+		this._totalLatencyMs = 0;
+		this._byModel.clear();
+	}
+}