From 56ffcbac39c33ba2566facdb9404b465f1c2740c Mon Sep 17 00:00:00 2001 From: Till JS Date: Tue, 24 Mar 2026 09:41:33 +0100 Subject: [PATCH] feat: add Ollama memory optimization, LLM metrics, and chat streaming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three improvements to the unified LLM infrastructure: 1. Ollama memory optimization (scripts/mac-mini/configure-ollama.sh): - OLLAMA_KEEP_ALIVE=5m → models unload after 5min idle (saves 3-16GB RAM) - OLLAMA_NUM_PARALLEL=1 → predictable memory usage - OLLAMA_MAX_LOADED_MODELS=1 → max 1 model in RAM at a time 2. Request-level metrics in @manacore/shared-llm: - LlmRequestMetrics interface (model, latency, tokens, fallback detection) - LlmMetricsCollector class with summary stats (for health endpoints) - Optional onMetrics callback in LlmModuleOptions - Automatic metrics emission in chatMessages() (success + error) 3. Chat streaming (token-by-token SSE): - Backend: POST /chat/completions/stream SSE endpoint - OllamaService.createStreamingCompletion() via llm.chatStreamMessages() - ChatService.createStreamingCompletion() with upfront credit consumption - Web: chatApi.createStreamingCompletion() SSE consumer - Chat store: sendMessage() now streams tokens into assistant message - UI updates reactively as each token arrives Co-Authored-By: Claude Opus 4.6 (1M context) --- .../apps/backend/src/chat/chat.controller.ts | 29 +++++- .../apps/backend/src/chat/chat.service.ts | 62 +++++++++++++ .../apps/backend/src/chat/ollama.service.ts | 16 ++++ apps/chat/apps/web/src/lib/services/api.ts | 64 +++++++++++++ apps/chat/apps/web/src/lib/services/chat.ts | 15 +++- .../apps/web/src/lib/stores/chat.svelte.ts | 53 +++++++---- docs/MAC_MINI_SERVER.md | 5 ++ packages/shared-llm/src/index.ts | 4 + .../src/interfaces/llm-options.interface.ts | 5 ++ packages/shared-llm/src/llm-client.ts | 58 ++++++++++-- packages/shared-llm/src/utils/index.ts | 2 + packages/shared-llm/src/utils/metrics.ts | 88 ++++++++++++++++++ scripts/mac-mini/configure-ollama.sh | 90 +++++++++++++++++++ 13 files changed, 462 insertions(+), 29 deletions(-) create mode 100644 packages/shared-llm/src/utils/metrics.ts create mode 100755 scripts/mac-mini/configure-ollama.sh diff --git a/apps/chat/apps/backend/src/chat/chat.controller.ts b/apps/chat/apps/backend/src/chat/chat.controller.ts index 13b495613..852d0df6c 100644 --- a/apps/chat/apps/backend/src/chat/chat.controller.ts +++ b/apps/chat/apps/backend/src/chat/chat.controller.ts @@ -1,4 +1,5 @@ -import { Body, Controller, Get, Post, UseGuards } from '@nestjs/common'; +import { Body, Controller, Get, Post, Res, UseGuards } from '@nestjs/common'; +import type { Response } from 'express'; import { isOk } from '@manacore/shared-errors'; import { ChatService } from './chat.service'; import { ChatCompletionDto } from './dto/chat-completion.dto'; @@ -24,9 +25,33 @@ export class ChatController { const result = await this.chatService.createCompletion(dto, user.userId); if (!isOk(result)) { - throw result.error; // Caught by AppExceptionFilter + throw result.error; } return result.value; } + + @Post('completions/stream') + async createStreamingCompletion( + @Body() dto: ChatCompletionDto, + @CurrentUser() user: CurrentUserData, + @Res() res: Response + ): Promise { + res.setHeader('Content-Type', 'text/event-stream'); + res.setHeader('Cache-Control', 'no-cache'); + res.setHeader('Connection', 'keep-alive'); + res.setHeader('X-Accel-Buffering', 'no'); + + try { + for await (const token of 
this.chatService.createStreamingCompletion(dto, user.userId)) {
+        res.write(`data: ${JSON.stringify({ token })}\n\n`);
+      }
+      res.write('data: [DONE]\n\n');
+    } catch (error) {
+      const message = error instanceof Error ? error.message : 'Stream failed';
+      res.write(`data: ${JSON.stringify({ error: message })}\n\n`);
+    } finally {
+      res.end();
+    }
+  }
 }
diff --git a/apps/chat/apps/backend/src/chat/chat.service.ts b/apps/chat/apps/backend/src/chat/chat.service.ts
index db22f1be7..5764cff7d 100644
--- a/apps/chat/apps/backend/src/chat/chat.service.ts
+++ b/apps/chat/apps/backend/src/chat/chat.service.ts
@@ -159,6 +159,68 @@ export class ChatService {
     return params?.model || model.provider;
   }
 
+  /**
+   * Create a streaming completion. Yields text tokens as they arrive.
+   * Credits are consumed upfront (estimated cost) since we don't know final token count.
+   */
+  async *createStreamingCompletion(dto: ChatCompletionDto, userId?: string): AsyncIterable<string> {
+    const model = await this.getModelById(dto.modelId);
+    if (!model) {
+      throw new Error(`Model ${dto.modelId} not found`);
+    }
+
+    // Consume credits upfront for streaming
+    if (userId) {
+      const creditOperation = this.getCreditOperationForModel(model);
+      const creditCost = CREDIT_COSTS[creditOperation];
+
+      const validation = await this.creditClient.validateCredits(
+        userId,
+        creditOperation,
+        creditCost
+      );
+      if (!validation.hasCredits) {
+        throw new Error(
+          `Insufficient credits: need ${creditCost}, have ${validation.availableCredits}`
+        );
+      }
+
+      await this.creditClient.consumeCredits(
+        userId,
+        creditOperation,
+        creditCost,
+        `Chat stream with ${this.getModelDisplayName(model)}`,
+        { modelId: dto.modelId, provider: model.provider, streaming: true }
+      );
+    }
+
+    const params = model.parameters as {
+      model?: string;
+      temperature?: number;
+      max_tokens?: number;
+    } | null;
+    const modelName = params?.model || 'gemma3:4b';
+    const prefixedModel =
+      model.provider === 'openrouter'
+        ? modelName.includes('/')
+          ? `openrouter/${modelName}`
+          : modelName
+        : modelName;
+
+    const temperature = dto.temperature ?? params?.temperature ?? 0.7;
+    const maxTokens = dto.maxTokens ?? params?.max_tokens ?? 4096;
+
+    yield* this.ollamaService.createStreamingCompletion(
+      prefixedModel,
+      dto.messages.map((msg) => ({
+        role: msg.role as 'system' | 'user' | 'assistant',
+        content: msg.content,
+      })),
+      temperature,
+      maxTokens
+    );
+  }
+
   private async createOllamaCompletion(
     model: Model,
     dto: ChatCompletionDto
diff --git a/apps/chat/apps/backend/src/chat/ollama.service.ts b/apps/chat/apps/backend/src/chat/ollama.service.ts
index 0e6a4483c..a10e27a17 100644
--- a/apps/chat/apps/backend/src/chat/ollama.service.ts
+++ b/apps/chat/apps/backend/src/chat/ollama.service.ts
@@ -89,6 +89,22 @@ export class OllamaService {
     }
   }
 
+  async *createStreamingCompletion(
+    modelName: string,
+    messages: ChatMessage[],
+    temperature?: number,
+    maxTokens?: number
+  ): AsyncIterable<string> {
+    const normalizedModel = modelName.includes('/') ? 
modelName : `ollama/${modelName}`;
+    this.logger.log(`Streaming request to mana-llm model: ${normalizedModel}`);
+
+    yield* this.llm.chatStreamMessages(messages, {
+      model: normalizedModel,
+      temperature,
+      maxTokens,
+    });
+  }
+
   async listModels(): Promise {
     try {
       const models = await this.llm.listModels();
diff --git a/apps/chat/apps/web/src/lib/services/api.ts b/apps/chat/apps/web/src/lib/services/api.ts
index 813268b33..94f10b5be 100644
--- a/apps/chat/apps/web/src/lib/services/api.ts
+++ b/apps/chat/apps/web/src/lib/services/api.ts
@@ -595,4 +595,68 @@ export const chatApi = {
     }
     return data;
   },
+
+  /**
+   * Create a streaming completion. Returns an async generator of text tokens.
+   * Uses Server-Sent Events (SSE) for real-time token delivery.
+   */
+  async *createStreamingCompletion(options: {
+    messages: ChatMessage[];
+    modelId: string;
+    temperature?: number;
+    maxTokens?: number;
+  }): AsyncGenerator<string> {
+    const authToken = await authStore.getValidToken();
+    if (!authToken) throw new Error('No authentication token');
+
+    const response = await fetch(`${API_BASE}/api/v1/chat/completions/stream`, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        Authorization: `Bearer ${authToken}`,
+      },
+      body: JSON.stringify(options),
+    });
+
+    if (!response.ok) {
+      throw new Error(`Stream error: ${response.status}`);
+    }
+
+    if (!response.body) {
+      throw new Error('No response body for stream');
+    }
+
+    const reader = response.body.getReader();
+    const decoder = new TextDecoder();
+    let buffer = '';
+
+    try {
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split('\n');
+        buffer = lines.pop() ?? '';
+
+        for (const line of lines) {
+          const trimmed = line.trim();
+          if (!trimmed || !trimmed.startsWith('data: ')) continue;
+
+          const data = trimmed.slice(6);
+          if (data === '[DONE]') return;
+
+          try {
+            const parsed = JSON.parse(data);
+            if (parsed.error) throw new Error(parsed.error);
+            if (parsed.token) yield parsed.token;
+          } catch (e) {
+            if (e instanceof Error && e.message !== data) throw e;
+          }
+        }
+      }
+    } finally {
+      reader.releaseLock();
+    }
+  },
 };
diff --git a/apps/chat/apps/web/src/lib/services/chat.ts b/apps/chat/apps/web/src/lib/services/chat.ts
index 04227ba07..0ec7bca6c 100644
--- a/apps/chat/apps/web/src/lib/services/chat.ts
+++ b/apps/chat/apps/web/src/lib/services/chat.ts
@@ -28,7 +28,7 @@ export const chatService = {
   },
 
   /**
-   * Send chat completion request
+   * Send chat completion request (non-streaming)
    */
   async createCompletion(request: ChatCompletionRequest): Promise {
     return chatApi.createCompletion({
@@ -38,4 +38,17 @@
       maxTokens: request.maxTokens ?? 1000,
     });
   },
+
+  /**
+   * Send streaming chat completion request.
+   * Returns an async generator that yields text tokens as they arrive.
+   */
+  async *createStreamingCompletion(request: ChatCompletionRequest): AsyncGenerator<string> {
+    yield* chatApi.createStreamingCompletion({
+      messages: request.messages,
+      modelId: request.modelId,
+      temperature: request.temperature ?? 0.7,
+      maxTokens: request.maxTokens ?? 
1000, + }); + }, }; diff --git a/apps/chat/apps/web/src/lib/stores/chat.svelte.ts b/apps/chat/apps/web/src/lib/stores/chat.svelte.ts index bdbbb054f..e7d9be4c5 100644 --- a/apps/chat/apps/web/src/lib/stores/chat.svelte.ts +++ b/apps/chat/apps/web/src/lib/stores/chat.svelte.ts @@ -80,36 +80,53 @@ export const chatStore = { }; messages = [...messages, userMessage]; + // Add placeholder assistant message for streaming + const assistantId = `temp-${++messageCounter}`; + const assistantMessage: Message = { + id: assistantId, + conversationId: '', + sender: 'assistant', + messageText: '', + createdAt: new Date().toISOString(), + }; + messages = [...messages, assistantMessage]; + try { - // Build chat messages for API - const chatMessages: ChatMessage[] = messages.map((m) => ({ - role: m.sender === 'user' ? 'user' : 'assistant', - content: m.messageText, - })); + const chatMessages: ChatMessage[] = messages + .filter((m) => m.id !== assistantId) + .map((m) => ({ + role: m.sender === 'user' ? 'user' : 'assistant', + content: m.messageText, + })); const request: ChatCompletionRequest = { messages: chatMessages, modelId: selectedModelId, }; - const response = await chatService.createCompletion(request); + // Stream tokens into the assistant message + let fullContent = ''; + for await (const token of chatService.createStreamingCompletion(request)) { + fullContent += token; + // Update the assistant message reactively + messages = messages.map((m) => + m.id === assistantId ? { ...m, messageText: fullContent } : m + ); + } - if (response) { - // Add assistant message - const assistantMessage: Message = { - id: `temp-${++messageCounter}`, - conversationId: '', - sender: 'assistant', - messageText: response.content, - createdAt: new Date().toISOString(), - }; - messages = [...messages, assistantMessage]; - ChatEvents.messageSent(selectedModelId); - } else { + if (!fullContent) { error = 'Failed to get response'; + messages = messages.filter((m) => m.id !== assistantId); + } else { + ChatEvents.messageSent(selectedModelId); } } catch (e) { error = e instanceof Error ? 
e.message : 'Failed to send message'; + // Remove empty assistant message on error + const msg = messages.find((m) => m.id === assistantId); + if (msg && !msg.messageText) { + messages = messages.filter((m) => m.id !== assistantId); + } } finally { isSending = false; } diff --git a/docs/MAC_MINI_SERVER.md b/docs/MAC_MINI_SERVER.md index f14a57c57..44ce27d18 100644 --- a/docs/MAC_MINI_SERVER.md +++ b/docs/MAC_MINI_SERVER.md @@ -594,8 +594,13 @@ Systemeinstellungen → Datenschutz & Sicherheit → Voller Festplattenzugriff **LaunchAgent:** `~/Library/LaunchAgents/homebrew.mxcl.ollama.plist` Optimierungen bereits aktiviert: +- `OLLAMA_KEEP_ALIVE=5m` - Modelle nach 5min Inaktivität aus RAM entladen (spart 3-16 GB) - `OLLAMA_FLASH_ATTENTION=1` - Schnellere Attention-Berechnung - `OLLAMA_KV_CACHE_TYPE=q8_0` - Effizienterer KV-Cache +- `OLLAMA_NUM_PARALLEL=1` - Max 1 paralleler Request (vorhersagbarer RAM) +- `OLLAMA_MAX_LOADED_MODELS=1` - Max 1 Modell gleichzeitig im RAM + +Setup-Script: `./scripts/mac-mini/configure-ollama.sh` ### Speicherort diff --git a/packages/shared-llm/src/index.ts b/packages/shared-llm/src/index.ts index d7f9de192..f0a95fe93 100644 --- a/packages/shared-llm/src/index.ts +++ b/packages/shared-llm/src/index.ts @@ -33,3 +33,7 @@ export type { // Utilities export { extractJson } from './utils'; + +// Metrics +export { LlmMetricsCollector } from './utils'; +export type { LlmRequestMetrics, MetricsCallback } from './utils'; diff --git a/packages/shared-llm/src/interfaces/llm-options.interface.ts b/packages/shared-llm/src/interfaces/llm-options.interface.ts index def8b8463..75eb78737 100644 --- a/packages/shared-llm/src/interfaces/llm-options.interface.ts +++ b/packages/shared-llm/src/interfaces/llm-options.interface.ts @@ -1,4 +1,5 @@ import type { ModuleMetadata, Type } from '@nestjs/common'; +import type { MetricsCallback } from '../utils/metrics'; export interface LlmModuleOptions { /** mana-llm service URL (default: http://localhost:3025) */ @@ -13,6 +14,8 @@ export interface LlmModuleOptions { maxRetries?: number; /** Enable debug logging (default: false) */ debug?: boolean; + /** Optional callback invoked after every LLM request with metrics */ + onMetrics?: MetricsCallback; } export interface LlmModuleAsyncOptions extends Pick { @@ -33,6 +36,7 @@ export interface ResolvedLlmOptions { timeout: number; maxRetries: number; debug: boolean; + onMetrics?: MetricsCallback; } export function resolveOptions(options: LlmModuleOptions): ResolvedLlmOptions { @@ -43,5 +47,6 @@ export function resolveOptions(options: LlmModuleOptions): ResolvedLlmOptions { timeout: options.timeout ?? 120_000, maxRetries: options.maxRetries ?? 2, debug: options.debug ?? false, + onMetrics: options.onMetrics, }; } diff --git a/packages/shared-llm/src/llm-client.ts b/packages/shared-llm/src/llm-client.ts index 5ff6f2763..d4f6d794e 100644 --- a/packages/shared-llm/src/llm-client.ts +++ b/packages/shared-llm/src/llm-client.ts @@ -22,6 +22,7 @@ import type { ChatCompletionResponse, EmbeddingResponse, } from './types/openai-compat.types'; +import type { LlmRequestMetrics } from './utils/metrics'; import { extractJson } from './utils/json-extractor'; import { retryFetch } from './utils/retry'; @@ -52,17 +53,48 @@ export class LlmClient { /** Chat with full message history. */ async chatMessages(messages: ChatMessage[], opts?: ChatOptions): Promise { + const requestedModel = opts?.model ?? 
this.options.defaultModel; const body = this.buildRequest(messages, opts, false); const start = Date.now(); - const response = await this.fetchCompletion(body, opts?.timeout); - const latencyMs = Date.now() - start; - return { - content: response.choices[0]?.message?.content ?? '', - model: response.model, - usage: response.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }, - latencyMs, - }; + try { + const response = await this.fetchCompletion(body, opts?.timeout); + const latencyMs = Date.now() - start; + const usage = response.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }; + + this.emitMetrics({ + model: requestedModel, + actualModel: response.model, + type: 'chat', + latencyMs, + promptTokens: usage.prompt_tokens, + completionTokens: usage.completion_tokens, + totalTokens: usage.total_tokens, + wasFallback: response.model !== requestedModel && !response.model.endsWith(requestedModel), + success: true, + }); + + return { + content: response.choices[0]?.message?.content ?? '', + model: response.model, + usage, + latencyMs, + }; + } catch (error) { + this.emitMetrics({ + model: requestedModel, + actualModel: requestedModel, + type: 'chat', + latencyMs: Date.now() - start, + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + wasFallback: false, + success: false, + error: error instanceof Error ? error.message : String(error), + }); + throw error; + } } // --------------------------------------------------------------------------- @@ -347,4 +379,14 @@ export class LlmClient { return (await response.json()) as ChatCompletionResponse; } + + private emitMetrics(metrics: LlmRequestMetrics): void { + if (this.options.onMetrics) { + try { + this.options.onMetrics(metrics); + } catch { + // Never let metrics callback break the request + } + } + } } diff --git a/packages/shared-llm/src/utils/index.ts b/packages/shared-llm/src/utils/index.ts index 1f345070f..1466b2de8 100644 --- a/packages/shared-llm/src/utils/index.ts +++ b/packages/shared-llm/src/utils/index.ts @@ -1,3 +1,5 @@ export { extractJson } from './json-extractor'; export { retryFetch } from './retry'; export type { RetryOptions } from './retry'; +export { LlmMetricsCollector } from './metrics'; +export type { LlmRequestMetrics, MetricsCallback } from './metrics'; diff --git a/packages/shared-llm/src/utils/metrics.ts b/packages/shared-llm/src/utils/metrics.ts new file mode 100644 index 000000000..3751994d3 --- /dev/null +++ b/packages/shared-llm/src/utils/metrics.ts @@ -0,0 +1,88 @@ +/** + * Request-level metrics for LLM calls. + * + * Provides an optional callback system that backends can hook into + * for monitoring, logging, or forwarding to Prometheus/Grafana. + */ + +export interface LlmRequestMetrics { + /** Model requested (e.g. "ollama/gemma3:4b") */ + model: string; + /** Model actually used (may differ if fallback occurred) */ + actualModel: string; + /** Request type */ + type: 'chat' | 'json' | 'vision' | 'visionJson' | 'embed' | 'stream'; + /** Total request duration in ms */ + latencyMs: number; + /** Token usage */ + promptTokens: number; + completionTokens: number; + totalTokens: number; + /** Whether this request was a fallback (model differs from requested) */ + wasFallback: boolean; + /** Whether the request succeeded */ + success: boolean; + /** Error message if failed */ + error?: string; +} + +export type MetricsCallback = (metrics: LlmRequestMetrics) => void; + +/** + * Simple in-memory metrics aggregator. + * Useful for health endpoints and debugging. 
+ */
+export class LlmMetricsCollector {
+  private _totalRequests = 0;
+  private _totalErrors = 0;
+  private _totalFallbacks = 0;
+  private _totalTokens = 0;
+  private _totalLatencyMs = 0;
+  private _byModel: Map<string, { requests: number; tokens: number; errors: number }> = new Map();
+
+  /** Use as MetricsCallback */
+  readonly collect = (metrics: LlmRequestMetrics): void => {
+    this._totalRequests++;
+    this._totalLatencyMs += metrics.latencyMs;
+    this._totalTokens += metrics.totalTokens;
+
+    if (!metrics.success) this._totalErrors++;
+    if (metrics.wasFallback) this._totalFallbacks++;
+
+    const modelKey = metrics.actualModel;
+    const existing = this._byModel.get(modelKey) ?? { requests: 0, tokens: 0, errors: 0 };
+    existing.requests++;
+    existing.tokens += metrics.totalTokens;
+    if (!metrics.success) existing.errors++;
+    this._byModel.set(modelKey, existing);
+  };
+
+  /** Get summary stats for health endpoints / dashboards */
+  getSummary() {
+    return {
+      totalRequests: this._totalRequests,
+      totalErrors: this._totalErrors,
+      totalFallbacks: this._totalFallbacks,
+      totalTokens: this._totalTokens,
+      averageLatencyMs:
+        this._totalRequests > 0 ? Math.round(this._totalLatencyMs / this._totalRequests) : 0,
+      fallbackRate:
+        this._totalRequests > 0
+          ? Math.round((this._totalFallbacks / this._totalRequests) * 100)
+          : 0,
+      errorRate:
+        this._totalRequests > 0 ? Math.round((this._totalErrors / this._totalRequests) * 100) : 0,
+      byModel: Object.fromEntries(this._byModel),
+    };
+  }
+
+  /** Reset all counters */
+  reset(): void {
+    this._totalRequests = 0;
+    this._totalErrors = 0;
+    this._totalFallbacks = 0;
+    this._totalTokens = 0;
+    this._totalLatencyMs = 0;
+    this._byModel.clear();
+  }
+}
diff --git a/scripts/mac-mini/configure-ollama.sh b/scripts/mac-mini/configure-ollama.sh
new file mode 100755
index 000000000..33d12cfcd
--- /dev/null
+++ b/scripts/mac-mini/configure-ollama.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+# Configure Ollama for optimal memory usage on Mac Mini
+#
+# Sets OLLAMA_KEEP_ALIVE=5m so models unload from RAM after 5 minutes
+# of inactivity. This is critical on the 16GB Mac Mini where Ollama
+# models can consume 3-16 GB RAM.
+#
+# Run on the Mac Mini:
+#   ./scripts/mac-mini/configure-ollama.sh
+
+set -e
+
+PLIST_DIR="$HOME/Library/LaunchAgents"
+OLLAMA_PLIST="$PLIST_DIR/homebrew.mxcl.ollama.plist"
+
+echo "=== Ollama Memory Optimization ==="
+echo ""
+
+# Check if Ollama is installed
+if ! command -v ollama &>/dev/null && [ ! -f /opt/homebrew/bin/ollama ]; then
+  echo "ERROR: Ollama not found. 
Install with: brew install ollama"
+  exit 1
+fi
+
+# Create override plist that sets environment variables
+# This is the recommended way to add env vars to a Homebrew service
+OVERRIDE_PLIST="$PLIST_DIR/com.manacore.ollama-env.plist"
+
+cat > "$OVERRIDE_PLIST" << 'PLIST'
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>com.manacore.ollama-env</string>
+    <key>ProgramArguments</key>
+    <array>
+        <string>/bin/bash</string>
+        <string>-c</string>
+        <string>
+      # Set Ollama environment variables system-wide via launchctl
+      launchctl setenv OLLAMA_KEEP_ALIVE 5m
+      launchctl setenv OLLAMA_FLASH_ATTENTION 1
+      launchctl setenv OLLAMA_KV_CACHE_TYPE q8_0
+      launchctl setenv OLLAMA_NUM_PARALLEL 1
+      launchctl setenv OLLAMA_MAX_LOADED_MODELS 1
+        </string>
+    </array>
+    <key>RunAtLoad</key>
+    <true/>
+</dict>
+</plist>
+PLIST
+
+echo "Created: $OVERRIDE_PLIST"
+
+# Apply immediately (no reboot needed)
+launchctl setenv OLLAMA_KEEP_ALIVE 5m
+launchctl setenv OLLAMA_FLASH_ATTENTION 1
+launchctl setenv OLLAMA_KV_CACHE_TYPE q8_0
+launchctl setenv OLLAMA_NUM_PARALLEL 1
+launchctl setenv OLLAMA_MAX_LOADED_MODELS 1
+
+echo ""
+echo "Environment variables set:"
+echo "  OLLAMA_KEEP_ALIVE=5m         (unload models after 5min idle → saves 3-16GB RAM)"
+echo "  OLLAMA_FLASH_ATTENTION=1     (faster attention computation)"
+echo "  OLLAMA_KV_CACHE_TYPE=q8_0    (efficient KV cache)"
+echo "  OLLAMA_NUM_PARALLEL=1        (max 1 parallel request → predictable memory)"
+echo "  OLLAMA_MAX_LOADED_MODELS=1   (max 1 model in RAM at a time)"
+echo ""
+
+# Restart Ollama to pick up new settings
+echo "Restarting Ollama..."
+/opt/homebrew/bin/brew services restart ollama 2>/dev/null || {
+  echo "Homebrew restart failed, trying launchctl..."
+  launchctl stop homebrew.mxcl.ollama 2>/dev/null
+  sleep 2
+  launchctl start homebrew.mxcl.ollama 2>/dev/null
+}
+
+echo ""
+echo "Done! Verify with:"
+echo "  ollama ps                     # Should show no loaded models (or model with 5m timeout)"
+echo "  curl localhost:11434/api/ps   # Same via API"
+echo ""
+echo "Expected behavior:"
+echo "  - First request: ~2-5s cold start (model loads into RAM)"
+echo "  - Subsequent requests within 5min: instant (model in RAM)"
+echo "  - After 5min idle: model unloads, RAM freed"
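
Usage sketch (illustrative only, not part of the diff above): how a consuming backend might wire the new
metrics collector into the onMetrics option and expose the aggregate stats on a health endpoint. Only
LlmMetricsCollector, LlmRequestMetrics, and the onMetrics callback come from this patch; the registration
site and the llmHealthSnapshot helper below are assumptions, and LlmModuleOptions is assumed to be
re-exported from the package root (the index.ts hunk above elides the existing type exports).

  import { LlmMetricsCollector } from '@manacore/shared-llm';
  import type { LlmModuleOptions } from '@manacore/shared-llm';

  // One collector per process. collect is an arrow-function property on the class,
  // so it can be passed as a callback without extra binding.
  const llmMetrics = new LlmMetricsCollector();

  // Options handed to whatever registers the shared LLM module/client in the app
  // (the exact registration call is outside the scope of this patch).
  const llmOptions: LlmModuleOptions = {
    onMetrics: llmMetrics.collect,
  };

  // Hypothetical health-endpoint payload: totals, error/fallback rates (percent),
  // average latency, and per-model request/token/error counters from getSummary().
  export function llmHealthSnapshot() {
    return llmMetrics.getSummary();
  }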