mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-23 15:26:42 +02:00
feat: add Ollama memory optimization, LLM metrics, and chat streaming
Three improvements to the unified LLM infrastructure: 1. Ollama memory optimization (scripts/mac-mini/configure-ollama.sh): - OLLAMA_KEEP_ALIVE=5m → models unload after 5min idle (saves 3-16GB RAM) - OLLAMA_NUM_PARALLEL=1 → predictable memory usage - OLLAMA_MAX_LOADED_MODELS=1 → max 1 model in RAM at a time 2. Request-level metrics in @manacore/shared-llm: - LlmRequestMetrics interface (model, latency, tokens, fallback detection) - LlmMetricsCollector class with summary stats (for health endpoints) - Optional onMetrics callback in LlmModuleOptions - Automatic metrics emission in chatMessages() (success + error) 3. Chat streaming (token-by-token SSE): - Backend: POST /chat/completions/stream SSE endpoint - OllamaService.createStreamingCompletion() via llm.chatStreamMessages() - ChatService.createStreamingCompletion() with upfront credit consumption - Web: chatApi.createStreamingCompletion() SSE consumer - Chat store: sendMessage() now streams tokens into assistant message - UI updates reactively as each token arrives Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
ecda4535d8
commit
56ffcbac39
13 changed files with 462 additions and 29 deletions
|
|
@ -1,3 +1,5 @@
|
|||
export { extractJson } from './json-extractor';
|
||||
export { retryFetch } from './retry';
|
||||
export type { RetryOptions } from './retry';
|
||||
export { LlmMetricsCollector } from './metrics';
|
||||
export type { LlmRequestMetrics, MetricsCallback } from './metrics';
|
||||
|
|
|
|||
88
packages/shared-llm/src/utils/metrics.ts
Normal file
88
packages/shared-llm/src/utils/metrics.ts
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
/**
|
||||
* Request-level metrics for LLM calls.
|
||||
*
|
||||
* Provides an optional callback system that backends can hook into
|
||||
* for monitoring, logging, or forwarding to Prometheus/Grafana.
|
||||
*/
|
||||
|
||||
/**
 * One record describing a single LLM request, passed to metrics callbacks.
 *
 * A record is produced for failed requests as well as successful ones;
 * check `success` (and `error`) to tell them apart.
 */
export interface LlmRequestMetrics {
  /** Model the caller asked for (e.g. "ollama/gemma3:4b"). */
  model: string;
  /** Model that actually served the request; may differ from `model` if a fallback occurred. */
  actualModel: string;
  /** Kind of request that was issued. */
  type: 'chat' | 'json' | 'vision' | 'visionJson' | 'embed' | 'stream';
  /** Total request duration in milliseconds. */
  latencyMs: number;
  /** Token usage: tokens in the prompt. */
  promptTokens: number;
  /** Token usage: tokens in the completion. */
  completionTokens: number;
  /** Token usage: total for the request (presumably prompt + completion; confirm against the emitter). */
  totalTokens: number;
  /** True when the request was served by a fallback, i.e. a model other than the requested one. */
  wasFallback: boolean;
  /** True when the request succeeded. */
  success: boolean;
  /** Error message; only set when the request failed. */
  error?: string;
}
|
||||
|
||||
/**
 * Signature of a metrics listener: it receives one `LlmRequestMetrics`
 * record per call and its return value is ignored.
 */
export type MetricsCallback = (metrics: LlmRequestMetrics) => void;
|
||||
|
||||
/**
 * Simple in-memory metrics aggregator.
 * Useful for health endpoints and debugging.
 *
 * Feed it records through `collect`, read aggregated numbers with
 * `getSummary()`, and clear everything with `reset()`. All state lives in
 * this instance only; nothing is persisted.
 */
export class LlmMetricsCollector {
  private _totalRequests = 0;
  private _totalErrors = 0;
  private _totalFallbacks = 0;
  private _totalTokens = 0;
  private _totalLatencyMs = 0;
  /** Per-model counters, keyed by the model that actually served the request. */
  private readonly _byModel = new Map<
    string,
    { requests: number; tokens: number; errors: number }
  >();

  /**
   * Records one request. Use as MetricsCallback.
   *
   * Declared as an arrow-function property so `this` stays bound when the
   * function is handed over as a bare callback reference.
   * Failed requests are counted too: they add to the request total, the
   * latency sum and the token sum in addition to the error counters.
   *
   * @param metrics - The record describing the finished request.
   */
  readonly collect = (metrics: LlmRequestMetrics): void => {
    this._totalRequests++;
    this._totalLatencyMs += metrics.latencyMs;
    this._totalTokens += metrics.totalTokens;

    if (!metrics.success) this._totalErrors++;
    if (metrics.wasFallback) this._totalFallbacks++;

    // Attribute usage to the model that really ran, not the one requested.
    const modelKey = metrics.actualModel;
    const stats = this._byModel.get(modelKey) ?? { requests: 0, tokens: 0, errors: 0 };
    stats.requests++;
    stats.tokens += metrics.totalTokens;
    if (!metrics.success) stats.errors++;
    this._byModel.set(modelKey, stats);
  };

  /**
   * Get summary stats for health endpoints / dashboards.
   *
   * @returns A snapshot of the counters. `averageLatencyMs` is rounded to a
   * whole millisecond; `fallbackRate` and `errorRate` are whole-number
   * percentages of all recorded requests. All three are 0 when nothing has
   * been recorded. `byModel` holds copies of the per-model counters, so
   * mutating the returned object does not affect the collector.
   */
  getSummary() {
    const total = this._totalRequests;
    // Share of all requests as a rounded percentage; 0 when there are no requests.
    const percentOfTotal = (count: number): number =>
      total > 0 ? Math.round((count / total) * 100) : 0;

    // Copy each per-model record so callers never hold a reference to internal state.
    const perModel = Array.from(
      this._byModel,
      ([model, stats]): [string, { requests: number; tokens: number; errors: number }] => [
        model,
        { ...stats },
      ],
    );

    return {
      totalRequests: total,
      totalErrors: this._totalErrors,
      totalFallbacks: this._totalFallbacks,
      totalTokens: this._totalTokens,
      averageLatencyMs: total > 0 ? Math.round(this._totalLatencyMs / total) : 0,
      fallbackRate: percentOfTotal(this._totalFallbacks),
      errorRate: percentOfTotal(this._totalErrors),
      byModel: Object.fromEntries(perModel),
    };
  }

  /** Reset all counters, including the per-model breakdown. */
  reset(): void {
    this._totalRequests = 0;
    this._totalErrors = 0;
    this._totalFallbacks = 0;
    this._totalTokens = 0;
    this._totalLatencyMs = 0;
    this._byModel.clear();
  }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue