From 56ffcbac39c33ba2566facdb9404b465f1c2740c Mon Sep 17 00:00:00 2001 From: Till JS Date: Tue, 24 Mar 2026 09:41:33 +0100 Subject: [PATCH] feat: add Ollama memory optimization, LLM metrics, and chat streaming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three improvements to the unified LLM infrastructure: 1. Ollama memory optimization (scripts/mac-mini/configure-ollama.sh): - OLLAMA_KEEP_ALIVE=5m → models unload after 5min idle (saves 3-16GB RAM) - OLLAMA_NUM_PARALLEL=1 → predictable memory usage - OLLAMA_MAX_LOADED_MODELS=1 → max 1 model in RAM at a time 2. Request-level metrics in @manacore/shared-llm: - LlmRequestMetrics interface (model, latency, tokens, fallback detection) - LlmMetricsCollector class with summary stats (for health endpoints) - Optional onMetrics callback in LlmModuleOptions - Automatic metrics emission in chatMessages() (success + error) 3. Chat streaming (token-by-token SSE): - Backend: POST /chat/completions/stream SSE endpoint - OllamaService.createStreamingCompletion() via llm.chatStreamMessages() - ChatService.createStreamingCompletion() with upfront credit consumption - Web: chatApi.createStreamingCompletion() SSE consumer - Chat store: sendMessage() now streams tokens into assistant message - UI updates reactively as each token arrives Co-Authored-By: Claude Opus 4.6 (1M context) --- .../apps/backend/src/chat/chat.controller.ts | 29 +++++- .../apps/backend/src/chat/chat.service.ts | 62 +++++++++++++ .../apps/backend/src/chat/ollama.service.ts | 16 ++++ apps/chat/apps/web/src/lib/services/api.ts | 64 +++++++++++++ apps/chat/apps/web/src/lib/services/chat.ts | 15 +++- .../apps/web/src/lib/stores/chat.svelte.ts | 53 +++++++---- docs/MAC_MINI_SERVER.md | 5 ++ packages/shared-llm/src/index.ts | 4 + .../src/interfaces/llm-options.interface.ts | 5 ++ packages/shared-llm/src/llm-client.ts | 58 ++++++++++-- packages/shared-llm/src/utils/index.ts | 2 + packages/shared-llm/src/utils/metrics.ts | 88 ++++++++++++++++++ scripts/mac-mini/configure-ollama.sh | 90 +++++++++++++++++++ 13 files changed, 462 insertions(+), 29 deletions(-) create mode 100644 packages/shared-llm/src/utils/metrics.ts create mode 100755 scripts/mac-mini/configure-ollama.sh diff --git a/apps/chat/apps/backend/src/chat/chat.controller.ts b/apps/chat/apps/backend/src/chat/chat.controller.ts index 13b495613..852d0df6c 100644 --- a/apps/chat/apps/backend/src/chat/chat.controller.ts +++ b/apps/chat/apps/backend/src/chat/chat.controller.ts @@ -1,4 +1,5 @@ -import { Body, Controller, Get, Post, UseGuards } from '@nestjs/common'; +import { Body, Controller, Get, Post, Res, UseGuards } from '@nestjs/common'; +import type { Response } from 'express'; import { isOk } from '@manacore/shared-errors'; import { ChatService } from './chat.service'; import { ChatCompletionDto } from './dto/chat-completion.dto'; @@ -24,9 +25,33 @@ export class ChatController { const result = await this.chatService.createCompletion(dto, user.userId); if (!isOk(result)) { - throw result.error; // Caught by AppExceptionFilter + throw result.error; } return result.value; } + + @Post('completions/stream') + async createStreamingCompletion( + @Body() dto: ChatCompletionDto, + @CurrentUser() user: CurrentUserData, + @Res() res: Response + ): Promise { + res.setHeader('Content-Type', 'text/event-stream'); + res.setHeader('Cache-Control', 'no-cache'); + res.setHeader('Connection', 'keep-alive'); + res.setHeader('X-Accel-Buffering', 'no'); + + try { + for await (const token of 
this.chatService.createStreamingCompletion(dto, user.userId)) {
+        res.write(`data: ${JSON.stringify({ token })}\n\n`);
+      }
+      res.write('data: [DONE]\n\n');
+    } catch (error) {
+      const message = error instanceof Error ? error.message : 'Stream failed';
+      res.write(`data: ${JSON.stringify({ error: message })}\n\n`);
+    } finally {
+      res.end();
+    }
+  }
 }
diff --git a/apps/chat/apps/backend/src/chat/chat.service.ts b/apps/chat/apps/backend/src/chat/chat.service.ts
index db22f1be7..5764cff7d 100644
--- a/apps/chat/apps/backend/src/chat/chat.service.ts
+++ b/apps/chat/apps/backend/src/chat/chat.service.ts
@@ -159,6 +159,68 @@ export class ChatService {
     return params?.model || model.provider;
   }
 
+  /**
+   * Create a streaming completion. Yields text tokens as they arrive.
+   * Credits are consumed upfront (estimated cost) since we don't know final token count.
+   */
+  async *createStreamingCompletion(dto: ChatCompletionDto, userId?: string): AsyncIterable<string> {
+    const model = await this.getModelById(dto.modelId);
+    if (!model) {
+      throw new Error(`Model ${dto.modelId} not found`);
+    }
+
+    // Consume credits upfront for streaming
+    if (userId) {
+      const creditOperation = this.getCreditOperationForModel(model);
+      const creditCost = CREDIT_COSTS[creditOperation];
+
+      const validation = await this.creditClient.validateCredits(
+        userId,
+        creditOperation,
+        creditCost
+      );
+      if (!validation.hasCredits) {
+        throw new Error(
+          `Insufficient credits: need ${creditCost}, have ${validation.availableCredits}`
+        );
+      }
+
+      await this.creditClient.consumeCredits(
+        userId,
+        creditOperation,
+        creditCost,
+        `Chat stream with ${this.getModelDisplayName(model)}`,
+        { modelId: dto.modelId, provider: model.provider, streaming: true }
+      );
+    }
+
+    const params = model.parameters as {
+      model?: string;
+      temperature?: number;
+      max_tokens?: number;
+    } | null;
+    const modelName = params?.model || 'gemma3:4b';
+    const prefixedModel =
+      model.provider === 'openrouter'
+        ? modelName.includes('/')
+          ? `openrouter/${modelName}`
+          : modelName
+        : modelName;
+
+    const temperature = dto.temperature ?? params?.temperature ?? 0.7;
+    const maxTokens = dto.maxTokens ?? params?.max_tokens ?? 4096;
+
+    yield* this.ollamaService.createStreamingCompletion(
+      prefixedModel,
+      dto.messages.map((msg) => ({
+        role: msg.role as 'system' | 'user' | 'assistant',
+        content: msg.content,
+      })),
+      temperature,
+      maxTokens
+    );
+  }
+
   private async createOllamaCompletion(
     model: Model,
     dto: ChatCompletionDto
diff --git a/apps/chat/apps/backend/src/chat/ollama.service.ts b/apps/chat/apps/backend/src/chat/ollama.service.ts
index 0e6a4483c..a10e27a17 100644
--- a/apps/chat/apps/backend/src/chat/ollama.service.ts
+++ b/apps/chat/apps/backend/src/chat/ollama.service.ts
@@ -89,6 +89,22 @@ export class OllamaService {
     }
   }
 
+  async *createStreamingCompletion(
+    modelName: string,
+    messages: ChatMessage[],
+    temperature?: number,
+    maxTokens?: number
+  ): AsyncIterable<string> {
+    const normalizedModel = modelName.includes('/') ? 
modelName : `ollama/${modelName}`;
+    this.logger.log(`Streaming request to mana-llm model: ${normalizedModel}`);
+
+    yield* this.llm.chatStreamMessages(messages, {
+      model: normalizedModel,
+      temperature,
+      maxTokens,
+    });
+  }
+
   async listModels(): Promise {
     try {
       const models = await this.llm.listModels();
diff --git a/apps/chat/apps/web/src/lib/services/api.ts b/apps/chat/apps/web/src/lib/services/api.ts
index 813268b33..94f10b5be 100644
--- a/apps/chat/apps/web/src/lib/services/api.ts
+++ b/apps/chat/apps/web/src/lib/services/api.ts
@@ -595,4 +595,68 @@ export const chatApi = {
     }
     return data;
   },
+
+  /**
+   * Create a streaming completion. Returns an async generator of text tokens.
+   * Uses Server-Sent Events (SSE) for real-time token delivery.
+   */
+  async *createStreamingCompletion(options: {
+    messages: ChatMessage[];
+    modelId: string;
+    temperature?: number;
+    maxTokens?: number;
+  }): AsyncGenerator<string> {
+    const authToken = await authStore.getValidToken();
+    if (!authToken) throw new Error('No authentication token');
+
+    const response = await fetch(`${API_BASE}/api/v1/chat/completions/stream`, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        Authorization: `Bearer ${authToken}`,
+      },
+      body: JSON.stringify(options),
+    });
+
+    if (!response.ok) {
+      throw new Error(`Stream error: ${response.status}`);
+    }
+
+    if (!response.body) {
+      throw new Error('No response body for stream');
+    }
+
+    const reader = response.body.getReader();
+    const decoder = new TextDecoder();
+    let buffer = '';
+
+    try {
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split('\n');
+        buffer = lines.pop() ?? '';
+
+        for (const line of lines) {
+          const trimmed = line.trim();
+          if (!trimmed || !trimmed.startsWith('data: ')) continue;
+
+          const data = trimmed.slice(6);
+          if (data === '[DONE]') return;
+
+          try {
+            const parsed = JSON.parse(data);
+            if (parsed.error) throw new Error(parsed.error);
+            if (parsed.token) yield parsed.token;
+          } catch (e) {
+            if (e instanceof Error && e.message !== data) throw e;
+          }
+        }
+      }
+    } finally {
+      reader.releaseLock();
+    }
+  },
 };
diff --git a/apps/chat/apps/web/src/lib/services/chat.ts b/apps/chat/apps/web/src/lib/services/chat.ts
index 04227ba07..0ec7bca6c 100644
--- a/apps/chat/apps/web/src/lib/services/chat.ts
+++ b/apps/chat/apps/web/src/lib/services/chat.ts
@@ -28,7 +28,7 @@ export const chatService = {
   },
 
   /**
-   * Send chat completion request
+   * Send chat completion request (non-streaming)
    */
   async createCompletion(request: ChatCompletionRequest): Promise {
     return chatApi.createCompletion({
@@ -38,4 +38,17 @@
       maxTokens: request.maxTokens ?? 1000,
     });
   },
+
+  /**
+   * Send streaming chat completion request.
+   * Returns an async generator that yields text tokens as they arrive.
+   */
+  async *createStreamingCompletion(request: ChatCompletionRequest): AsyncGenerator<string> {
+    yield* chatApi.createStreamingCompletion({
+      messages: request.messages,
+      modelId: request.modelId,
+      temperature: request.temperature ?? 0.7,
+      maxTokens: request.maxTokens ?? 
1000, + }); + }, }; diff --git a/apps/chat/apps/web/src/lib/stores/chat.svelte.ts b/apps/chat/apps/web/src/lib/stores/chat.svelte.ts index bdbbb054f..e7d9be4c5 100644 --- a/apps/chat/apps/web/src/lib/stores/chat.svelte.ts +++ b/apps/chat/apps/web/src/lib/stores/chat.svelte.ts @@ -80,36 +80,53 @@ export const chatStore = { }; messages = [...messages, userMessage]; + // Add placeholder assistant message for streaming + const assistantId = `temp-${++messageCounter}`; + const assistantMessage: Message = { + id: assistantId, + conversationId: '', + sender: 'assistant', + messageText: '', + createdAt: new Date().toISOString(), + }; + messages = [...messages, assistantMessage]; + try { - // Build chat messages for API - const chatMessages: ChatMessage[] = messages.map((m) => ({ - role: m.sender === 'user' ? 'user' : 'assistant', - content: m.messageText, - })); + const chatMessages: ChatMessage[] = messages + .filter((m) => m.id !== assistantId) + .map((m) => ({ + role: m.sender === 'user' ? 'user' : 'assistant', + content: m.messageText, + })); const request: ChatCompletionRequest = { messages: chatMessages, modelId: selectedModelId, }; - const response = await chatService.createCompletion(request); + // Stream tokens into the assistant message + let fullContent = ''; + for await (const token of chatService.createStreamingCompletion(request)) { + fullContent += token; + // Update the assistant message reactively + messages = messages.map((m) => + m.id === assistantId ? { ...m, messageText: fullContent } : m + ); + } - if (response) { - // Add assistant message - const assistantMessage: Message = { - id: `temp-${++messageCounter}`, - conversationId: '', - sender: 'assistant', - messageText: response.content, - createdAt: new Date().toISOString(), - }; - messages = [...messages, assistantMessage]; - ChatEvents.messageSent(selectedModelId); - } else { + if (!fullContent) { error = 'Failed to get response'; + messages = messages.filter((m) => m.id !== assistantId); + } else { + ChatEvents.messageSent(selectedModelId); } } catch (e) { error = e instanceof Error ? 
e.message : 'Failed to send message'; + // Remove empty assistant message on error + const msg = messages.find((m) => m.id === assistantId); + if (msg && !msg.messageText) { + messages = messages.filter((m) => m.id !== assistantId); + } } finally { isSending = false; } diff --git a/docs/MAC_MINI_SERVER.md b/docs/MAC_MINI_SERVER.md index f14a57c57..44ce27d18 100644 --- a/docs/MAC_MINI_SERVER.md +++ b/docs/MAC_MINI_SERVER.md @@ -594,8 +594,13 @@ Systemeinstellungen → Datenschutz & Sicherheit → Voller Festplattenzugriff **LaunchAgent:** `~/Library/LaunchAgents/homebrew.mxcl.ollama.plist` Optimierungen bereits aktiviert: +- `OLLAMA_KEEP_ALIVE=5m` - Modelle nach 5min Inaktivität aus RAM entladen (spart 3-16 GB) - `OLLAMA_FLASH_ATTENTION=1` - Schnellere Attention-Berechnung - `OLLAMA_KV_CACHE_TYPE=q8_0` - Effizienterer KV-Cache +- `OLLAMA_NUM_PARALLEL=1` - Max 1 paralleler Request (vorhersagbarer RAM) +- `OLLAMA_MAX_LOADED_MODELS=1` - Max 1 Modell gleichzeitig im RAM + +Setup-Script: `./scripts/mac-mini/configure-ollama.sh` ### Speicherort diff --git a/packages/shared-llm/src/index.ts b/packages/shared-llm/src/index.ts index d7f9de192..f0a95fe93 100644 --- a/packages/shared-llm/src/index.ts +++ b/packages/shared-llm/src/index.ts @@ -33,3 +33,7 @@ export type { // Utilities export { extractJson } from './utils'; + +// Metrics +export { LlmMetricsCollector } from './utils'; +export type { LlmRequestMetrics, MetricsCallback } from './utils'; diff --git a/packages/shared-llm/src/interfaces/llm-options.interface.ts b/packages/shared-llm/src/interfaces/llm-options.interface.ts index def8b8463..75eb78737 100644 --- a/packages/shared-llm/src/interfaces/llm-options.interface.ts +++ b/packages/shared-llm/src/interfaces/llm-options.interface.ts @@ -1,4 +1,5 @@ import type { ModuleMetadata, Type } from '@nestjs/common'; +import type { MetricsCallback } from '../utils/metrics'; export interface LlmModuleOptions { /** mana-llm service URL (default: http://localhost:3025) */ @@ -13,6 +14,8 @@ export interface LlmModuleOptions { maxRetries?: number; /** Enable debug logging (default: false) */ debug?: boolean; + /** Optional callback invoked after every LLM request with metrics */ + onMetrics?: MetricsCallback; } export interface LlmModuleAsyncOptions extends Pick { @@ -33,6 +36,7 @@ export interface ResolvedLlmOptions { timeout: number; maxRetries: number; debug: boolean; + onMetrics?: MetricsCallback; } export function resolveOptions(options: LlmModuleOptions): ResolvedLlmOptions { @@ -43,5 +47,6 @@ export function resolveOptions(options: LlmModuleOptions): ResolvedLlmOptions { timeout: options.timeout ?? 120_000, maxRetries: options.maxRetries ?? 2, debug: options.debug ?? false, + onMetrics: options.onMetrics, }; } diff --git a/packages/shared-llm/src/llm-client.ts b/packages/shared-llm/src/llm-client.ts index 5ff6f2763..d4f6d794e 100644 --- a/packages/shared-llm/src/llm-client.ts +++ b/packages/shared-llm/src/llm-client.ts @@ -22,6 +22,7 @@ import type { ChatCompletionResponse, EmbeddingResponse, } from './types/openai-compat.types'; +import type { LlmRequestMetrics } from './utils/metrics'; import { extractJson } from './utils/json-extractor'; import { retryFetch } from './utils/retry'; @@ -52,17 +53,48 @@ export class LlmClient { /** Chat with full message history. */ async chatMessages(messages: ChatMessage[], opts?: ChatOptions): Promise { + const requestedModel = opts?.model ?? 
this.options.defaultModel; const body = this.buildRequest(messages, opts, false); const start = Date.now(); - const response = await this.fetchCompletion(body, opts?.timeout); - const latencyMs = Date.now() - start; - return { - content: response.choices[0]?.message?.content ?? '', - model: response.model, - usage: response.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }, - latencyMs, - }; + try { + const response = await this.fetchCompletion(body, opts?.timeout); + const latencyMs = Date.now() - start; + const usage = response.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }; + + this.emitMetrics({ + model: requestedModel, + actualModel: response.model, + type: 'chat', + latencyMs, + promptTokens: usage.prompt_tokens, + completionTokens: usage.completion_tokens, + totalTokens: usage.total_tokens, + wasFallback: response.model !== requestedModel && !response.model.endsWith(requestedModel), + success: true, + }); + + return { + content: response.choices[0]?.message?.content ?? '', + model: response.model, + usage, + latencyMs, + }; + } catch (error) { + this.emitMetrics({ + model: requestedModel, + actualModel: requestedModel, + type: 'chat', + latencyMs: Date.now() - start, + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + wasFallback: false, + success: false, + error: error instanceof Error ? error.message : String(error), + }); + throw error; + } } // --------------------------------------------------------------------------- @@ -347,4 +379,14 @@ export class LlmClient { return (await response.json()) as ChatCompletionResponse; } + + private emitMetrics(metrics: LlmRequestMetrics): void { + if (this.options.onMetrics) { + try { + this.options.onMetrics(metrics); + } catch { + // Never let metrics callback break the request + } + } + } } diff --git a/packages/shared-llm/src/utils/index.ts b/packages/shared-llm/src/utils/index.ts index 1f345070f..1466b2de8 100644 --- a/packages/shared-llm/src/utils/index.ts +++ b/packages/shared-llm/src/utils/index.ts @@ -1,3 +1,5 @@ export { extractJson } from './json-extractor'; export { retryFetch } from './retry'; export type { RetryOptions } from './retry'; +export { LlmMetricsCollector } from './metrics'; +export type { LlmRequestMetrics, MetricsCallback } from './metrics'; diff --git a/packages/shared-llm/src/utils/metrics.ts b/packages/shared-llm/src/utils/metrics.ts new file mode 100644 index 000000000..3751994d3 --- /dev/null +++ b/packages/shared-llm/src/utils/metrics.ts @@ -0,0 +1,88 @@ +/** + * Request-level metrics for LLM calls. + * + * Provides an optional callback system that backends can hook into + * for monitoring, logging, or forwarding to Prometheus/Grafana. + */ + +export interface LlmRequestMetrics { + /** Model requested (e.g. "ollama/gemma3:4b") */ + model: string; + /** Model actually used (may differ if fallback occurred) */ + actualModel: string; + /** Request type */ + type: 'chat' | 'json' | 'vision' | 'visionJson' | 'embed' | 'stream'; + /** Total request duration in ms */ + latencyMs: number; + /** Token usage */ + promptTokens: number; + completionTokens: number; + totalTokens: number; + /** Whether this request was a fallback (model differs from requested) */ + wasFallback: boolean; + /** Whether the request succeeded */ + success: boolean; + /** Error message if failed */ + error?: string; +} + +export type MetricsCallback = (metrics: LlmRequestMetrics) => void; + +/** + * Simple in-memory metrics aggregator. + * Useful for health endpoints and debugging. 
+ */
+export class LlmMetricsCollector {
+  private _totalRequests = 0;
+  private _totalErrors = 0;
+  private _totalFallbacks = 0;
+  private _totalTokens = 0;
+  private _totalLatencyMs = 0;
+  private _byModel: Map<string, { requests: number; tokens: number; errors: number }> = new Map();
+
+  /** Use as MetricsCallback */
+  readonly collect = (metrics: LlmRequestMetrics): void => {
+    this._totalRequests++;
+    this._totalLatencyMs += metrics.latencyMs;
+    this._totalTokens += metrics.totalTokens;
+
+    if (!metrics.success) this._totalErrors++;
+    if (metrics.wasFallback) this._totalFallbacks++;
+
+    const modelKey = metrics.actualModel;
+    const existing = this._byModel.get(modelKey) ?? { requests: 0, tokens: 0, errors: 0 };
+    existing.requests++;
+    existing.tokens += metrics.totalTokens;
+    if (!metrics.success) existing.errors++;
+    this._byModel.set(modelKey, existing);
+  };
+
+  /** Get summary stats for health endpoints / dashboards */
+  getSummary() {
+    return {
+      totalRequests: this._totalRequests,
+      totalErrors: this._totalErrors,
+      totalFallbacks: this._totalFallbacks,
+      totalTokens: this._totalTokens,
+      averageLatencyMs:
+        this._totalRequests > 0 ? Math.round(this._totalLatencyMs / this._totalRequests) : 0,
+      fallbackRate:
+        this._totalRequests > 0
+          ? Math.round((this._totalFallbacks / this._totalRequests) * 100)
+          : 0,
+      errorRate:
+        this._totalRequests > 0 ? Math.round((this._totalErrors / this._totalRequests) * 100) : 0,
+      byModel: Object.fromEntries(this._byModel),
+    };
+  }
+
+  /** Reset all counters */
+  reset(): void {
+    this._totalRequests = 0;
+    this._totalErrors = 0;
+    this._totalFallbacks = 0;
+    this._totalTokens = 0;
+    this._totalLatencyMs = 0;
+    this._byModel.clear();
+  }
+}
diff --git a/scripts/mac-mini/configure-ollama.sh b/scripts/mac-mini/configure-ollama.sh
new file mode 100755
index 000000000..33d12cfcd
--- /dev/null
+++ b/scripts/mac-mini/configure-ollama.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+# Configure Ollama for optimal memory usage on Mac Mini
+#
+# Sets OLLAMA_KEEP_ALIVE=5m so models unload from RAM after 5 minutes
+# of inactivity. This is critical on the 16GB Mac Mini where Ollama
+# models can consume 3-16 GB RAM.
+#
+# Run on the Mac Mini:
+#   ./scripts/mac-mini/configure-ollama.sh
+
+set -e
+
+PLIST_DIR="$HOME/Library/LaunchAgents"
+OLLAMA_PLIST="$PLIST_DIR/homebrew.mxcl.ollama.plist"
+
+echo "=== Ollama Memory Optimization ==="
+echo ""
+
+# Check if Ollama is installed
+if ! command -v ollama &>/dev/null && [ ! -f /opt/homebrew/bin/ollama ]; then
+  echo "ERROR: Ollama not found. 
Install with: brew install ollama"
+  exit 1
+fi
+
+# Create override plist that sets environment variables
+# This is the recommended way to add env vars to a Homebrew service
+OVERRIDE_PLIST="$PLIST_DIR/com.manacore.ollama-env.plist"
+
+cat > "$OVERRIDE_PLIST" << 'PLIST'
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>com.manacore.ollama-env</string>
+    <key>ProgramArguments</key>
+    <array>
+        <string>/bin/bash</string>
+        <string>-c</string>
+        <string>
+      # Set Ollama environment variables system-wide via launchctl
+      launchctl setenv OLLAMA_KEEP_ALIVE 5m
+      launchctl setenv OLLAMA_FLASH_ATTENTION 1
+      launchctl setenv OLLAMA_KV_CACHE_TYPE q8_0
+      launchctl setenv OLLAMA_NUM_PARALLEL 1
+      launchctl setenv OLLAMA_MAX_LOADED_MODELS 1
+        </string>
+    </array>
+    <key>RunAtLoad</key>
+    <true/>
+</dict>
+</plist>
+PLIST
+
+echo "Created: $OVERRIDE_PLIST"
+
+# Apply immediately (no reboot needed)
+launchctl setenv OLLAMA_KEEP_ALIVE 5m
+launchctl setenv OLLAMA_FLASH_ATTENTION 1
+launchctl setenv OLLAMA_KV_CACHE_TYPE q8_0
+launchctl setenv OLLAMA_NUM_PARALLEL 1
+launchctl setenv OLLAMA_MAX_LOADED_MODELS 1
+
+echo ""
+echo "Environment variables set:"
+echo "  OLLAMA_KEEP_ALIVE=5m         (unload models after 5min idle → saves 3-16GB RAM)"
+echo "  OLLAMA_FLASH_ATTENTION=1     (faster attention computation)"
+echo "  OLLAMA_KV_CACHE_TYPE=q8_0    (efficient KV cache)"
+echo "  OLLAMA_NUM_PARALLEL=1        (max 1 parallel request → predictable memory)"
+echo "  OLLAMA_MAX_LOADED_MODELS=1   (max 1 model in RAM at a time)"
+echo ""
+
+# Restart Ollama to pick up new settings
+echo "Restarting Ollama..."
+/opt/homebrew/bin/brew services restart ollama 2>/dev/null || {
+  echo "Homebrew restart failed, trying launchctl..."
+  launchctl stop homebrew.mxcl.ollama 2>/dev/null
+  sleep 2
+  launchctl start homebrew.mxcl.ollama 2>/dev/null
+}
+
+echo ""
+echo "Done! Verify with:"
+echo "  ollama ps                     # Should show no loaded models (or model with 5m timeout)"
+echo "  curl localhost:11434/api/ps   # Same via API"
+echo ""
+echo "Expected behavior:"
+echo "  - First request: ~2-5s cold start (model loads into RAM)"
+echo "  - Subsequent requests within 5min: instant (model in RAM)"
+echo "  - After 5min idle: model unloads, RAM freed"
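
Usage sketch (illustrative only, not part of the diff above): how a consuming backend might wire the new
metrics collector into the onMetrics option and expose the aggregate stats on a health endpoint. Only
LlmMetricsCollector, LlmRequestMetrics, and the onMetrics callback come from this patch; the registration
site and the llmHealthSnapshot helper below are assumptions, and LlmModuleOptions is assumed to be
re-exported from the package root (the index.ts hunk above elides the existing type exports).

  import { LlmMetricsCollector } from '@manacore/shared-llm';
  import type { LlmModuleOptions } from '@manacore/shared-llm';

  // One collector per process. collect is an arrow-function property on the class,
  // so it can be passed as a callback without extra binding.
  const llmMetrics = new LlmMetricsCollector();

  // Options handed to whatever registers the shared LLM module/client in the app
  // (the exact registration call is outside the scope of this patch).
  const llmOptions: LlmModuleOptions = {
    onMetrics: llmMetrics.collect,
  };

  // Hypothetical health-endpoint payload: totals, error/fallback rates (percent),
  // average latency, and per-model request/token/error counters from getSummary().
  export function llmHealthSnapshot() {
    return llmMetrics.getSummary();
  }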