Mirror of https://github.com/Memo-2023/mana-monorepo.git (synced 2026-05-16 08:39:40 +02:00)
feat: add Ollama memory optimization, LLM metrics, and chat streaming
Three improvements to the unified LLM infrastructure:

1. Ollama memory optimization (scripts/mac-mini/configure-ollama.sh):
   - OLLAMA_KEEP_ALIVE=5m → models unload after 5 min idle (saves 3-16 GB RAM)
   - OLLAMA_NUM_PARALLEL=1 → predictable memory usage
   - OLLAMA_MAX_LOADED_MODELS=1 → max 1 model in RAM at a time

2. Request-level metrics in @manacore/shared-llm:
   - LlmRequestMetrics interface (model, latency, tokens, fallback detection)
   - LlmMetricsCollector class with summary stats (for health endpoints)
   - Optional onMetrics callback in LlmModuleOptions
   - Automatic metrics emission in chatMessages() (success + error)

3. Chat streaming (token-by-token SSE):
   - Backend: POST /chat/completions/stream SSE endpoint
   - OllamaService.createStreamingCompletion() via llm.chatStreamMessages()
   - ChatService.createStreamingCompletion() with upfront credit consumption
   - Web: chatApi.createStreamingCompletion() SSE consumer
   - Chat store: sendMessage() now streams tokens into the assistant message
   - UI updates reactively as each token arrives

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Parent: ecda4535d8
Commit: 56ffcbac39
13 changed files with 462 additions and 29 deletions
@@ -1,4 +1,5 @@
import { Body, Controller, Get, Post, UseGuards } from '@nestjs/common';
import { Body, Controller, Get, Post, Res, UseGuards } from '@nestjs/common';
import type { Response } from 'express';
import { isOk } from '@manacore/shared-errors';
import { ChatService } from './chat.service';
import { ChatCompletionDto } from './dto/chat-completion.dto';

@@ -24,9 +25,33 @@ export class ChatController {
    const result = await this.chatService.createCompletion(dto, user.userId);

    if (!isOk(result)) {
      throw result.error; // Caught by AppExceptionFilter
      throw result.error;
    }

    return result.value;
  }

  @Post('completions/stream')
  async createStreamingCompletion(
    @Body() dto: ChatCompletionDto,
    @CurrentUser() user: CurrentUserData,
    @Res() res: Response
  ): Promise<void> {
    res.setHeader('Content-Type', 'text/event-stream');
    res.setHeader('Cache-Control', 'no-cache');
    res.setHeader('Connection', 'keep-alive');
    res.setHeader('X-Accel-Buffering', 'no');

    try {
      for await (const token of this.chatService.createStreamingCompletion(dto, user.userId)) {
        res.write(`data: ${JSON.stringify({ token })}\n\n`);
      }
      res.write('data: [DONE]\n\n');
    } catch (error) {
      const message = error instanceof Error ? error.message : 'Stream failed';
      res.write(`data: ${JSON.stringify({ error: message })}\n\n`);
    } finally {
      res.end();
    }
  }
}
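For reference, a client consuming this endpoint sees a plain SSE stream built from the res.write() calls above. The token values below are illustrative, not from a real response:

data: {"token":"Hello"}
data: {"token":" there"}
data: [DONE]

If the generator throws mid-stream, the handler writes a single data: {"error":"..."} event instead of [DONE] and then closes the response.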
@@ -159,6 +159,68 @@ export class ChatService {
    return params?.model || model.provider;
  }

  /**
   * Create a streaming completion. Yields text tokens as they arrive.
   * Credits are consumed upfront (estimated cost) since we don't know final token count.
   */
  async *createStreamingCompletion(dto: ChatCompletionDto, userId?: string): AsyncIterable<string> {
    const model = await this.getModelById(dto.modelId);
    if (!model) {
      throw new Error(`Model ${dto.modelId} not found`);
    }

    // Consume credits upfront for streaming
    if (userId) {
      const creditOperation = this.getCreditOperationForModel(model);
      const creditCost = CREDIT_COSTS[creditOperation];

      const validation = await this.creditClient.validateCredits(
        userId,
        creditOperation,
        creditCost
      );
      if (!validation.hasCredits) {
        throw new Error(
          `Insufficient credits: need ${creditCost}, have ${validation.availableCredits}`
        );
      }

      await this.creditClient.consumeCredits(
        userId,
        creditOperation,
        creditCost,
        `Chat stream with ${this.getModelDisplayName(model)}`,
        { modelId: dto.modelId, provider: model.provider, streaming: true }
      );
    }

    const params = model.parameters as {
      model?: string;
      temperature?: number;
      max_tokens?: number;
    } | null;
    const modelName = params?.model || 'gemma3:4b';
    const prefixedModel =
      model.provider === 'openrouter'
        ? modelName.includes('/')
          ? `openrouter/${modelName}`
          : modelName
        : modelName;

    const temperature = dto.temperature ?? params?.temperature ?? 0.7;
    const maxTokens = dto.maxTokens ?? params?.max_tokens ?? 4096;

    yield* this.ollamaService.createStreamingCompletion(
      prefixedModel,
      dto.messages.map((msg) => ({
        role: msg.role as 'system' | 'user' | 'assistant',
        content: msg.content,
      })),
      temperature,
      maxTokens
    );
  }

  private async createOllamaCompletion(
    model: Model,
    dto: ChatCompletionDto
@@ -89,6 +89,22 @@ export class OllamaService {
    }
  }

  async *createStreamingCompletion(
    modelName: string,
    messages: ChatMessage[],
    temperature?: number,
    maxTokens?: number
  ): AsyncIterable<string> {
    const normalizedModel = modelName.includes('/') ? modelName : `ollama/${modelName}`;
    this.logger.log(`Streaming request to mana-llm model: ${normalizedModel}`);

    yield* this.llm.chatStreamMessages(messages, {
      model: normalizedModel,
      temperature,
      maxTokens,
    });
  }

  async listModels(): Promise<string[]> {
    try {
      const models = await this.llm.listModels();
@@ -595,4 +595,68 @@ export const chatApi = {
    }
    return data;
  },

  /**
   * Create a streaming completion. Returns an async generator of text tokens.
   * Uses Server-Sent Events (SSE) for real-time token delivery.
   */
  async *createStreamingCompletion(options: {
    messages: ChatMessage[];
    modelId: string;
    temperature?: number;
    maxTokens?: number;
  }): AsyncGenerator<string> {
    const authToken = await authStore.getValidToken();
    if (!authToken) throw new Error('No authentication token');

    const response = await fetch(`${API_BASE}/api/v1/chat/completions/stream`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${authToken}`,
      },
      body: JSON.stringify(options),
    });

    if (!response.ok) {
      throw new Error(`Stream error: ${response.status}`);
    }

    if (!response.body) {
      throw new Error('No response body for stream');
    }

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        buffer += decoder.decode(value, { stream: true });
        const lines = buffer.split('\n');
        buffer = lines.pop() ?? '';

        for (const line of lines) {
          const trimmed = line.trim();
          if (!trimmed || !trimmed.startsWith('data: ')) continue;

          const data = trimmed.slice(6);
          if (data === '[DONE]') return;

          try {
            const parsed = JSON.parse(data);
            if (parsed.error) throw new Error(parsed.error);
            if (parsed.token) yield parsed.token;
          } catch (e) {
            if (e instanceof Error && e.message !== data) throw e;
          }
        }
      }
    } finally {
      reader.releaseLock();
    }
  },
};
@@ -28,7 +28,7 @@ export const chatService = {
  },

  /**
   * Send chat completion request
   * Send chat completion request (non-streaming)
   */
  async createCompletion(request: ChatCompletionRequest): Promise<ChatCompletionResponse | null> {
    return chatApi.createCompletion({

@@ -38,4 +38,17 @@
      maxTokens: request.maxTokens ?? 1000,
    });
  },

  /**
   * Send streaming chat completion request.
   * Returns an async generator that yields text tokens as they arrive.
   */
  async *createStreamingCompletion(request: ChatCompletionRequest): AsyncGenerator<string> {
    yield* chatApi.createStreamingCompletion({
      messages: request.messages,
      modelId: request.modelId,
      temperature: request.temperature ?? 0.7,
      maxTokens: request.maxTokens ?? 1000,
    });
  },
};
@@ -80,36 +80,53 @@ export const chatStore = {
    };
    messages = [...messages, userMessage];

    // Add placeholder assistant message for streaming
    const assistantId = `temp-${++messageCounter}`;
    const assistantMessage: Message = {
      id: assistantId,
      conversationId: '',
      sender: 'assistant',
      messageText: '',
      createdAt: new Date().toISOString(),
    };
    messages = [...messages, assistantMessage];

    try {
      // Build chat messages for API
      const chatMessages: ChatMessage[] = messages.map((m) => ({
        role: m.sender === 'user' ? 'user' : 'assistant',
        content: m.messageText,
      }));
      const chatMessages: ChatMessage[] = messages
        .filter((m) => m.id !== assistantId)
        .map((m) => ({
          role: m.sender === 'user' ? 'user' : 'assistant',
          content: m.messageText,
        }));

      const request: ChatCompletionRequest = {
        messages: chatMessages,
        modelId: selectedModelId,
      };

      const response = await chatService.createCompletion(request);
      // Stream tokens into the assistant message
      let fullContent = '';
      for await (const token of chatService.createStreamingCompletion(request)) {
        fullContent += token;
        // Update the assistant message reactively
        messages = messages.map((m) =>
          m.id === assistantId ? { ...m, messageText: fullContent } : m
        );
      }

      if (response) {
        // Add assistant message
        const assistantMessage: Message = {
          id: `temp-${++messageCounter}`,
          conversationId: '',
          sender: 'assistant',
          messageText: response.content,
          createdAt: new Date().toISOString(),
        };
        messages = [...messages, assistantMessage];
        ChatEvents.messageSent(selectedModelId);
      } else {
      if (!fullContent) {
        error = 'Failed to get response';
        messages = messages.filter((m) => m.id !== assistantId);
      } else {
        ChatEvents.messageSent(selectedModelId);
      }
    } catch (e) {
      error = e instanceof Error ? e.message : 'Failed to send message';
      // Remove empty assistant message on error
      const msg = messages.find((m) => m.id === assistantId);
      if (msg && !msg.messageText) {
        messages = messages.filter((m) => m.id !== assistantId);
      }
    } finally {
      isSending = false;
    }
@@ -594,8 +594,13 @@ System Settings → Privacy & Security → Full Disk Access
**LaunchAgent:** `~/Library/LaunchAgents/homebrew.mxcl.ollama.plist`

Optimizations already enabled:
- `OLLAMA_KEEP_ALIVE=5m` - Unload models from RAM after 5 min of inactivity (saves 3-16 GB)
- `OLLAMA_FLASH_ATTENTION=1` - Faster attention computation
- `OLLAMA_KV_CACHE_TYPE=q8_0` - More efficient KV cache
- `OLLAMA_NUM_PARALLEL=1` - Max 1 parallel request (predictable RAM usage)
- `OLLAMA_MAX_LOADED_MODELS=1` - Max 1 model in RAM at a time

Setup script: `./scripts/mac-mini/configure-ollama.sh`

### Storage location
@@ -33,3 +33,7 @@ export type {

// Utilities
export { extractJson } from './utils';

// Metrics
export { LlmMetricsCollector } from './utils';
export type { LlmRequestMetrics, MetricsCallback } from './utils';
@@ -1,4 +1,5 @@
import type { ModuleMetadata, Type } from '@nestjs/common';
import type { MetricsCallback } from '../utils/metrics';

export interface LlmModuleOptions {
  /** mana-llm service URL (default: http://localhost:3025) */

@@ -13,6 +14,8 @@ export interface LlmModuleOptions {
  maxRetries?: number;
  /** Enable debug logging (default: false) */
  debug?: boolean;
  /** Optional callback invoked after every LLM request with metrics */
  onMetrics?: MetricsCallback;
}

export interface LlmModuleAsyncOptions extends Pick<ModuleMetadata, 'imports'> {

@@ -33,6 +36,7 @@ export interface ResolvedLlmOptions {
  timeout: number;
  maxRetries: number;
  debug: boolean;
  onMetrics?: MetricsCallback;
}

export function resolveOptions(options: LlmModuleOptions): ResolvedLlmOptions {

@@ -43,5 +47,6 @@ export function resolveOptions(options: LlmModuleOptions): ResolvedLlmOptions {
    timeout: options.timeout ?? 120_000,
    maxRetries: options.maxRetries ?? 2,
    debug: options.debug ?? false,
    onMetrics: options.onMetrics,
  };
}
@@ -22,6 +22,7 @@ import type {
  ChatCompletionResponse,
  EmbeddingResponse,
} from './types/openai-compat.types';
import type { LlmRequestMetrics } from './utils/metrics';
import { extractJson } from './utils/json-extractor';
import { retryFetch } from './utils/retry';

@@ -52,17 +53,48 @@ export class LlmClient {

  /** Chat with full message history. */
  async chatMessages(messages: ChatMessage[], opts?: ChatOptions): Promise<ChatResult> {
    const requestedModel = opts?.model ?? this.options.defaultModel;
    const body = this.buildRequest(messages, opts, false);
    const start = Date.now();
    const response = await this.fetchCompletion(body, opts?.timeout);
    const latencyMs = Date.now() - start;

    return {
      content: response.choices[0]?.message?.content ?? '',
      model: response.model,
      usage: response.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 },
      latencyMs,
    };
    try {
      const response = await this.fetchCompletion(body, opts?.timeout);
      const latencyMs = Date.now() - start;
      const usage = response.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };

      this.emitMetrics({
        model: requestedModel,
        actualModel: response.model,
        type: 'chat',
        latencyMs,
        promptTokens: usage.prompt_tokens,
        completionTokens: usage.completion_tokens,
        totalTokens: usage.total_tokens,
        wasFallback: response.model !== requestedModel && !response.model.endsWith(requestedModel),
        success: true,
      });

      return {
        content: response.choices[0]?.message?.content ?? '',
        model: response.model,
        usage,
        latencyMs,
      };
    } catch (error) {
      this.emitMetrics({
        model: requestedModel,
        actualModel: requestedModel,
        type: 'chat',
        latencyMs: Date.now() - start,
        promptTokens: 0,
        completionTokens: 0,
        totalTokens: 0,
        wasFallback: false,
        success: false,
        error: error instanceof Error ? error.message : String(error),
      });
      throw error;
    }
  }

  // ---------------------------------------------------------------------------

@@ -347,4 +379,14 @@

    return (await response.json()) as ChatCompletionResponse;
  }

  private emitMetrics(metrics: LlmRequestMetrics): void {
    if (this.options.onMetrics) {
      try {
        this.options.onMetrics(metrics);
      } catch {
        // Never let metrics callback break the request
      }
    }
  }
}
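The wasFallback check above treats a provider-prefixed id as the same model. A quick hand evaluation of that expression, with illustrative model names only:

// Illustrative only: evaluating the wasFallback expression by hand.
const requestedModel = 'gemma3:4b';
// litellm-style prefixed id: differs, but ends with the requested name, so not a fallback
'ollama/gemma3:4b' !== requestedModel && !'ollama/gemma3:4b'.endsWith(requestedModel); // false
// a genuinely different model: counted as a fallback
'llama3.1:8b' !== requestedModel && !'llama3.1:8b'.endsWith(requestedModel); // true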
@@ -1,3 +1,5 @@
export { extractJson } from './json-extractor';
export { retryFetch } from './retry';
export type { RetryOptions } from './retry';
export { LlmMetricsCollector } from './metrics';
export type { LlmRequestMetrics, MetricsCallback } from './metrics';
packages/shared-llm/src/utils/metrics.ts (new file, 88 lines)

@@ -0,0 +1,88 @@
/**
 * Request-level metrics for LLM calls.
 *
 * Provides an optional callback system that backends can hook into
 * for monitoring, logging, or forwarding to Prometheus/Grafana.
 */

export interface LlmRequestMetrics {
  /** Model requested (e.g. "ollama/gemma3:4b") */
  model: string;
  /** Model actually used (may differ if fallback occurred) */
  actualModel: string;
  /** Request type */
  type: 'chat' | 'json' | 'vision' | 'visionJson' | 'embed' | 'stream';
  /** Total request duration in ms */
  latencyMs: number;
  /** Token usage */
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
  /** Whether this request was a fallback (model differs from requested) */
  wasFallback: boolean;
  /** Whether the request succeeded */
  success: boolean;
  /** Error message if failed */
  error?: string;
}

export type MetricsCallback = (metrics: LlmRequestMetrics) => void;

/**
 * Simple in-memory metrics aggregator.
 * Useful for health endpoints and debugging.
 */
export class LlmMetricsCollector {
  private _totalRequests = 0;
  private _totalErrors = 0;
  private _totalFallbacks = 0;
  private _totalTokens = 0;
  private _totalLatencyMs = 0;
  private _byModel: Map<string, { requests: number; tokens: number; errors: number }> = new Map();

  /** Use as MetricsCallback */
  readonly collect = (metrics: LlmRequestMetrics): void => {
    this._totalRequests++;
    this._totalLatencyMs += metrics.latencyMs;
    this._totalTokens += metrics.totalTokens;

    if (!metrics.success) this._totalErrors++;
    if (metrics.wasFallback) this._totalFallbacks++;

    const modelKey = metrics.actualModel;
    const existing = this._byModel.get(modelKey) ?? { requests: 0, tokens: 0, errors: 0 };
    existing.requests++;
    existing.tokens += metrics.totalTokens;
    if (!metrics.success) existing.errors++;
    this._byModel.set(modelKey, existing);
  };

  /** Get summary stats for health endpoints / dashboards */
  getSummary() {
    return {
      totalRequests: this._totalRequests,
      totalErrors: this._totalErrors,
      totalFallbacks: this._totalFallbacks,
      totalTokens: this._totalTokens,
      averageLatencyMs:
        this._totalRequests > 0 ? Math.round(this._totalLatencyMs / this._totalRequests) : 0,
      fallbackRate:
        this._totalRequests > 0
          ? Math.round((this._totalFallbacks / this._totalRequests) * 100)
          : 0,
      errorRate:
        this._totalRequests > 0 ? Math.round((this._totalErrors / this._totalRequests) * 100) : 0,
      byModel: Object.fromEntries(this._byModel),
    };
  }

  /** Reset all counters */
  reset(): void {
    this._totalRequests = 0;
    this._totalErrors = 0;
    this._totalFallbacks = 0;
    this._totalTokens = 0;
    this._totalLatencyMs = 0;
    this._byModel.clear();
  }
}
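A minimal sketch of how a backend might wire this collector into the new onMetrics option. The import path assumes the package-root export added in this commit; the surrounding module-registration call is not part of the diff, so the shape of the options object here is an assumption based on LlmModuleOptions above:

import { LlmMetricsCollector } from '@manacore/shared-llm';

// One collector per process; `collect` is an arrow-function property,
// so it can be passed directly as the MetricsCallback.
const llmMetrics = new LlmMetricsCollector();

// Hypothetical wiring wherever the LLM module options are built:
const llmOptions = {
  // ...existing options such as baseUrl / timeout / maxRetries / debug...
  onMetrics: llmMetrics.collect,
};

// Later, e.g. inside a health endpoint handler:
const llmStats = llmMetrics.getSummary();
// -> { totalRequests, totalErrors, totalFallbacks, totalTokens,
//      averageLatencyMs, fallbackRate, errorRate, byModel }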
scripts/mac-mini/configure-ollama.sh (new executable file, 90 lines)

@@ -0,0 +1,90 @@
#!/bin/bash
# Configure Ollama for optimal memory usage on Mac Mini
#
# Sets OLLAMA_KEEP_ALIVE=5m so models unload from RAM after 5 minutes
# of inactivity. This is critical on the 16GB Mac Mini where Ollama
# models can consume 3-16 GB RAM.
#
# Run on the Mac Mini:
#   ./scripts/mac-mini/configure-ollama.sh

set -e

PLIST_DIR="$HOME/Library/LaunchAgents"
OLLAMA_PLIST="$PLIST_DIR/homebrew.mxcl.ollama.plist"

echo "=== Ollama Memory Optimization ==="
echo ""

# Check if Ollama is installed
if ! command -v ollama &>/dev/null && [ ! -f /opt/homebrew/bin/ollama ]; then
  echo "ERROR: Ollama not found. Install with: brew install ollama"
  exit 1
fi

# Create override plist that sets environment variables
# This is the recommended way to add env vars to a Homebrew service
OVERRIDE_PLIST="$PLIST_DIR/com.manacore.ollama-env.plist"

cat > "$OVERRIDE_PLIST" << 'PLIST'
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
  <key>Label</key>
  <string>com.manacore.ollama-env</string>
  <key>ProgramArguments</key>
  <array>
    <string>/bin/bash</string>
    <string>-c</string>
    <string>
      # Set Ollama environment variables system-wide via launchctl
      launchctl setenv OLLAMA_KEEP_ALIVE 5m
      launchctl setenv OLLAMA_FLASH_ATTENTION 1
      launchctl setenv OLLAMA_KV_CACHE_TYPE q8_0
      launchctl setenv OLLAMA_NUM_PARALLEL 1
      launchctl setenv OLLAMA_MAX_LOADED_MODELS 1
    </string>
  </array>
  <key>RunAtLoad</key>
  <true/>
</dict>
</plist>
PLIST

echo "Created: $OVERRIDE_PLIST"

# Apply immediately (no reboot needed)
launchctl setenv OLLAMA_KEEP_ALIVE 5m
launchctl setenv OLLAMA_FLASH_ATTENTION 1
launchctl setenv OLLAMA_KV_CACHE_TYPE q8_0
launchctl setenv OLLAMA_NUM_PARALLEL 1
launchctl setenv OLLAMA_MAX_LOADED_MODELS 1

echo ""
echo "Environment variables set:"
echo "  OLLAMA_KEEP_ALIVE=5m        (unload models after 5min idle → saves 3-16GB RAM)"
echo "  OLLAMA_FLASH_ATTENTION=1    (faster attention computation)"
echo "  OLLAMA_KV_CACHE_TYPE=q8_0   (efficient KV cache)"
echo "  OLLAMA_NUM_PARALLEL=1       (max 1 parallel request → predictable memory)"
echo "  OLLAMA_MAX_LOADED_MODELS=1  (max 1 model in RAM at a time)"
echo ""

# Restart Ollama to pick up new settings
echo "Restarting Ollama..."
/opt/homebrew/bin/brew services restart ollama 2>/dev/null || {
  echo "Homebrew restart failed, trying launchctl..."
  launchctl stop homebrew.mxcl.ollama 2>/dev/null
  sleep 2
  launchctl start homebrew.mxcl.ollama 2>/dev/null
}

echo ""
echo "Done! Verify with:"
echo "  ollama ps                     # Should show no loaded models (or model with 5m timeout)"
echo "  curl localhost:11434/api/ps   # Same via API"
echo ""
echo "Expected behavior:"
echo "  - First request: ~2-5s cold start (model loads into RAM)"
echo "  - Subsequent requests within 5min: instant (model in RAM)"
echo "  - After 5min idle: model unloads, RAM freed"