feat(manacore/web): add benchmark, compare history, markdown & cache status to LLM test

- Benchmark tab: run same prompt N times against loaded model, show
  avg/min/max/median for latency and tok/s with per-run table and bars
- Compare history: save results to localStorage (max 20), restore or
  delete previous comparisons via History panel
- Markdown rendering: use marked for assistant responses in chat tab
  and compare outputs with proper code blocks, lists, headings
- Model cache status: check browser Cache API via WebLLM hasModelInCache,
  show cached/uncached per model with size badges
- Add marked dependency

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-02 11:50:23 +02:00
parent 348b6ff231
commit c3c02c6a22
2 changed files with 523 additions and 114 deletions

View file

@ -42,9 +42,13 @@
"vitest": "^4.0.14"
},
"dependencies": {
"@calc/shared": "workspace:*",
"@clock/shared": "workspace:*",
"@manacore/credits": "workspace:^",
"@manacore/feedback": "workspace:*",
"@manacore/help": "workspace:*",
"@manacore/local-llm": "workspace:*",
"@manacore/local-store": "workspace:*",
"@manacore/shared-links": "workspace:*",
"@manacore/qr-export": "workspace:*",
"@manacore/shared-auth": "workspace:*",
"@manacore/shared-auth-stores": "workspace:*",
@ -52,30 +56,27 @@
"@manacore/shared-branding": "workspace:*",
"@manacore/shared-config": "workspace:*",
"@manacore/shared-error-tracking": "workspace:*",
"@manacore/feedback": "workspace:*",
"@manacore/shared-i18n": "workspace:*",
"@manacore/help": "workspace:*",
"@manacore/shared-icons": "workspace:*",
"@manacore/shared-links": "workspace:*",
"@manacore/shared-profile-ui": "workspace:*",
"@manacore/shared-stores": "workspace:*",
"@manacore/shared-tags": "workspace:*",
"@manacore/subscriptions": "workspace:*",
"@manacore/shared-tailwind": "workspace:*",
"@manacore/shared-theme": "workspace:*",
"@manacore/shared-theme-ui": "workspace:*",
"@manacore/shared-types": "workspace:*",
"@manacore/shared-uload": "workspace:*",
"@manacore/shared-ui": "workspace:*",
"@manacore/shared-uload": "workspace:*",
"@manacore/shared-utils": "workspace:*",
"@manacore/spiral-db": "workspace:*",
"@manacore/subscriptions": "workspace:*",
"@manacore/wallpaper-generator": "workspace:*",
"@calc/shared": "workspace:*",
"@clock/shared": "workspace:*",
"@zitare/content": "workspace:*",
"date-fns": "^4.1.0",
"dexie": "^4.0.11",
"marked": "^17.0.5",
"svelte-dnd-action": "^0.9.68",
"leaflet": "^1.9.4",
"svelte-i18n": "^4.0.0"
},
"type": "module"

View file

@ -10,13 +10,40 @@
MODELS,
type ModelKey,
} from '@manacore/local-llm';
import { Robot, Trash, PaperPlaneRight } from '@manacore/shared-icons';
import { marked } from 'marked';
import { Robot, Trash, PaperPlaneRight, ClockCounterClockwise } from '@manacore/shared-icons';
const modelKeys = Object.keys(MODELS) as ModelKey[];
// --- Markdown rendering ---
marked.setOptions({ breaks: true, gfm: true });
// Render markdown to an HTML string (synchronous marked mode; options set above).
// NOTE(review): the result is injected via {@html} without sanitization — marked
// does NOT sanitize, so model output containing raw HTML would execute. Acceptable
// for a local test page; confirm before exposing to untrusted content.
function renderMarkdown(text: string): string {
return marked.parse(text, { async: false }) as string;
}
// --- Model cache status ---
let modelCacheStatus = $state<Record<string, boolean>>({});
// Populate modelCacheStatus by asking WebLLM whether each model's weights are
// already present in the browser Cache API. No-op outside the browser.
async function checkModelCache() {
if (typeof caches === 'undefined') return;
// The dynamic import is loop-invariant — load the module once, not per model.
let hasModelInCache: (modelId: string) => Promise<boolean>;
try {
({ hasModelInCache } = await import('@mlc-ai/web-llm'));
} catch {
// Module unavailable: mark every model as uncached (matches previous behavior).
for (const key of Object.keys(MODELS)) modelCacheStatus[key] = false;
return;
}
for (const [key, config] of Object.entries(MODELS)) {
try {
modelCacheStatus[key] = await hasModelInCache(config.modelId);
} catch {
// Treat lookup failures as "not cached" rather than surfacing an error.
modelCacheStatus[key] = false;
}
}
}
if (typeof window !== 'undefined') {
checkModelCache();
}
// --- State ---
let selectedModel: ModelKey = $state('qwen-2.5-1.5b');
let activeTab: 'chat' | 'extract' | 'classify' | 'compare' = $state('chat');
let activeTab: 'chat' | 'extract' | 'classify' | 'compare' | 'benchmark' = $state('chat');
const supported = isLocalLlmSupported();
const status = getLocalLlmStatus();
@ -56,6 +83,16 @@
error?: string;
}
interface CompareHistoryEntry {
id: string;
timestamp: number;
prompt: string;
systemPrompt: string;
temperature: number;
maxTokens: number;
results: CompareResult[];
}
let comparePrompt = $state('');
let compareSystemPrompt = $state('');
let compareTemperature = $state(0.7);
@ -64,6 +101,79 @@
let compareRunning = $state(false);
let compareCurrentModel = $state<string | null>(null);
let compareStreamingContent = $state('');
let compareHistory = $state<CompareHistoryEntry[]>([]);
let showHistory = $state(false);
function loadCompareHistory() {
try {
const stored = localStorage.getItem('llm-compare-history');
if (stored) compareHistory = JSON.parse(stored);
} catch {
/* ignore */
}
}
// Persist the in-memory compare history to localStorage. Best-effort:
// quota or availability errors are swallowed, never surfaced to the user.
function saveCompareHistory() {
try {
const payload = JSON.stringify(compareHistory);
localStorage.setItem('llm-compare-history', payload);
} catch {
/* best-effort persistence — ignore failures */
}
}
// Remove a single saved comparison by id and persist the shrunken list.
function deleteHistoryEntry(id: string) {
const remaining = compareHistory.filter((entry) => entry.id !== id);
compareHistory = remaining;
saveCompareHistory();
}
// Copy a saved comparison back into the compare form (prompt, settings and
// previous results), then close the history panel so the restored state shows.
function restoreHistoryEntry(entry: CompareHistoryEntry) {
// Restore every persisted field of the entry into the reactive form state.
comparePrompt = entry.prompt;
compareSystemPrompt = entry.systemPrompt;
compareTemperature = entry.temperature;
compareMaxTokens = entry.maxTokens;
compareResults = entry.results;
// Hide the panel — the restored prompt/results are now visible below it.
showHistory = false;
}
if (typeof window !== 'undefined') {
loadCompareHistory();
}
// Benchmark tab
interface BenchmarkRun {
iteration: number;
latencyMs: number;
tokPerSec: number;
completionTokens: number;
}
interface BenchmarkStats {
runs: BenchmarkRun[];
avgLatency: number;
minLatency: number;
maxLatency: number;
medianLatency: number;
avgTokPerSec: number;
minTokPerSec: number;
maxTokPerSec: number;
medianTokPerSec: number;
totalTokens: number;
}
let benchmarkPrompt = $state('');
let benchmarkSystemPrompt = $state('');
let benchmarkIterations = $state(5);
let benchmarkTemperature = $state(0.7);
let benchmarkMaxTokens = $state(256);
let benchmarkRunning = $state(false);
let benchmarkCurrentRun = $state(0);
let benchmarkStats = $state<BenchmarkStats | null>(null);
// Median of a numeric array. For even length, returns the ROUNDED mean of the
// two middle values (stats are displayed as integers). Empty input returns 0
// instead of undefined/NaN, so callers no longer need to guard the empty case.
function median(arr: number[]): number {
if (arr.length === 0) return 0;
const sorted = [...arr].sort((a, b) => a - b);
const mid = Math.floor(sorted.length / 2);
return sorted.length % 2 !== 0 ? sorted[mid] : Math.round((sorted[mid - 1] + sorted[mid]) / 2);
}
// --- Derived ---
let isReady = $derived(status.current.state === 'ready');
@ -95,7 +205,6 @@
let modelInfo = $derived(MODELS[selectedModel]);
// Auto-scroll chat to bottom on new messages/streaming
$effect(() => {
messages.length;
streamingContent;
@ -107,6 +216,7 @@
// --- Actions ---
async function handleLoad() {
await loadLocalLlm(selectedModel);
checkModelCache();
}
async function handleUnload() {
@ -119,7 +229,6 @@
async function handleSend() {
if (!userInput.trim() || isGenerating) return;
const userMsg = userInput.trim();
messages = [...messages, { role: 'user', content: userMsg }];
userInput = '';
@ -128,12 +237,8 @@
try {
const msgs: { role: 'system' | 'user' | 'assistant'; content: string }[] = [];
if (systemPrompt.trim()) {
msgs.push({ role: 'system', content: systemPrompt.trim() });
}
for (const m of messages) {
msgs.push({ role: m.role, content: m.content });
}
if (systemPrompt.trim()) msgs.push({ role: 'system', content: systemPrompt.trim() });
for (const m of messages) msgs.push({ role: m.role, content: m.content });
const result = await generate({
messages: msgs,
@ -203,18 +308,15 @@
compareStreamingContent = '';
const msgs: { role: 'system' | 'user'; content: string }[] = [];
if (compareSystemPrompt.trim()) {
if (compareSystemPrompt.trim())
msgs.push({ role: 'system', content: compareSystemPrompt.trim() });
}
msgs.push({ role: 'user', content: comparePrompt.trim() });
for (const modelKey of modelKeys) {
compareCurrentModel = MODELS[modelKey].displayName;
compareStreamingContent = '';
try {
await loadLocalLlm(modelKey);
const result = await generate({
messages: msgs,
temperature: compareTemperature,
@ -223,12 +325,10 @@
compareStreamingContent += token;
},
});
const tokPerSec =
result.latencyMs > 0
? Math.round((result.usage.completion_tokens / result.latencyMs) * 1000)
: 0;
compareResults = [
...compareResults,
{
@ -256,13 +356,81 @@
},
];
}
await unloadLocalLlm();
}
const entry: CompareHistoryEntry = {
id: crypto.randomUUID(),
timestamp: Date.now(),
prompt: comparePrompt,
systemPrompt: compareSystemPrompt,
temperature: compareTemperature,
maxTokens: compareMaxTokens,
results: compareResults,
};
compareHistory = [entry, ...compareHistory].slice(0, 20);
saveCompareHistory();
compareCurrentModel = null;
compareStreamingContent = '';
compareRunning = false;
checkModelCache();
}
async function handleBenchmark() {
if (!benchmarkPrompt.trim() || benchmarkRunning || !isReady) return;
benchmarkRunning = true;
benchmarkStats = null;
benchmarkCurrentRun = 0;
const msgs: { role: 'system' | 'user'; content: string }[] = [];
if (benchmarkSystemPrompt.trim())
msgs.push({ role: 'system', content: benchmarkSystemPrompt.trim() });
msgs.push({ role: 'user', content: benchmarkPrompt.trim() });
const runs: BenchmarkRun[] = [];
for (let i = 0; i < benchmarkIterations; i++) {
benchmarkCurrentRun = i + 1;
try {
const result = await generate({
messages: msgs,
temperature: benchmarkTemperature,
maxTokens: benchmarkMaxTokens,
});
const tokPerSec =
result.latencyMs > 0
? Math.round((result.usage.completion_tokens / result.latencyMs) * 1000)
: 0;
runs.push({
iteration: i + 1,
latencyMs: result.latencyMs,
tokPerSec,
completionTokens: result.usage.completion_tokens,
});
} catch {
runs.push({ iteration: i + 1, latencyMs: 0, tokPerSec: 0, completionTokens: 0 });
}
}
const latencies = runs.map((r) => r.latencyMs).filter((l) => l > 0);
const speeds = runs.map((r) => r.tokPerSec).filter((s) => s > 0);
benchmarkStats = {
runs,
avgLatency: latencies.length
? Math.round(latencies.reduce((a, b) => a + b, 0) / latencies.length)
: 0,
minLatency: latencies.length ? Math.min(...latencies) : 0,
maxLatency: latencies.length ? Math.max(...latencies) : 0,
medianLatency: latencies.length ? median(latencies) : 0,
avgTokPerSec: speeds.length
? Math.round(speeds.reduce((a, b) => a + b, 0) / speeds.length)
: 0,
minTokPerSec: speeds.length ? Math.min(...speeds) : 0,
maxTokPerSec: speeds.length ? Math.max(...speeds) : 0,
medianTokPerSec: speeds.length ? median(speeds) : 0,
totalTokens: runs.reduce((a, r) => a + r.completionTokens, 0),
};
benchmarkRunning = false;
}
function handleClear() {
@ -278,10 +446,29 @@
handleSend();
}
}
// Format an epoch-ms timestamp as "dd.MM., HH:mm" (German locale) for the
// compare-history list. Uses the browser's local timezone.
function formatTime(ts: number): string {
const opts: Intl.DateTimeFormatOptions = {
day: '2-digit',
month: '2-digit',
hour: '2-digit',
minute: '2-digit',
};
const date = new Date(ts);
return date.toLocaleString('de-DE', opts);
}
</script>
<svelte:head>
<title>Local LLM Test - ManaCore</title>
{@html `<style>
.llm-prose { line-height: 1.6; }
.llm-prose p { margin: 0.4em 0; }
.llm-prose pre { background: var(--color-muted, #1e1e2e); border-radius: 0.5rem; padding: 0.75rem; overflow-x: auto; margin: 0.5em 0; }
.llm-prose code { font-size: 0.85em; background: var(--color-muted, #1e1e2e); padding: 0.15em 0.3em; border-radius: 0.25rem; }
.llm-prose pre code { background: none; padding: 0; }
.llm-prose ul, .llm-prose ol { padding-left: 1.5em; margin: 0.4em 0; }
.llm-prose h1, .llm-prose h2, .llm-prose h3 { margin: 0.6em 0 0.3em; font-weight: 600; }
.llm-prose blockquote { border-left: 3px solid var(--color-border, #444); padding-left: 0.75em; margin: 0.4em 0; opacity: 0.8; }
</style>`}
</svelte:head>
<div class="mx-auto max-w-4xl">
@ -296,12 +483,11 @@
<div class="rounded-xl border border-red-500/30 bg-red-500/10 p-6 text-center">
<p class="text-lg font-semibold text-red-400">WebGPU nicht verfügbar</p>
<p class="mt-2 text-sm text-muted-foreground">
Dieses Feature benötigt einen Browser mit WebGPU-Support (Chrome 113+, Edge 113+). Safari
und Firefox haben experimentelle Unterstützung.
Dieses Feature benötigt einen Browser mit WebGPU-Support (Chrome 113+, Edge 113+).
</p>
</div>
{:else}
<!-- Model Controls (hidden on Compare tab — it manages models itself) -->
<!-- Model Controls (hidden on Compare tab) -->
{#if activeTab !== 'compare'}
<div class="mb-6 rounded-xl border border-border bg-card p-4">
<div class="flex flex-wrap items-center gap-4">
@ -316,35 +502,39 @@
class="rounded-lg border border-border bg-background px-3 py-1.5 text-sm text-foreground disabled:opacity-50"
>
{#each Object.entries(MODELS) as [key, model]}
<option value={key}>{model.displayName}</option>
<option value={key}
>{model.displayName}{modelCacheStatus[key] ? ' (cached)' : ''}</option
>
{/each}
</select>
</div>
<div class="flex flex-col gap-0.5 text-xs text-muted-foreground">
<span>Download: ~{modelInfo.downloadSizeMb} MB</span>
<span>RAM: ~{modelInfo.ramUsageMb} MB</span>
{#if modelCacheStatus[selectedModel] !== undefined}
<span
class={modelCacheStatus[selectedModel] ? 'text-green-500' : 'text-muted-foreground'}
>
{modelCacheStatus[selectedModel] ? 'Im Cache' : 'Nicht im Cache'}
</span>
{/if}
</div>
<div class="flex items-center gap-2">
{#if isReady}
<button
onclick={handleUnload}
class="rounded-lg border border-border px-4 py-1.5 text-sm text-muted-foreground transition-colors hover:bg-muted"
>Entladen</button
>
Entladen
</button>
{:else}
<button
onclick={handleLoad}
disabled={isLoading}
class="rounded-lg bg-primary px-4 py-1.5 text-sm font-medium text-primary-foreground transition-opacity disabled:opacity-50"
>{isLoading ? 'Lädt...' : 'Modell laden'}</button
>
{isLoading ? 'Lädt...' : 'Modell laden'}
</button>
{/if}
</div>
<div class="ml-auto flex items-center gap-2">
<div
class="h-2.5 w-2.5 rounded-full {isReady
@ -358,7 +548,6 @@
<span class="text-xs text-muted-foreground">{statusText}</span>
</div>
</div>
{#if progress !== null}
<div class="mt-3 h-2 overflow-hidden rounded-full bg-muted">
<div
@ -367,21 +556,39 @@
></div>
</div>
{/if}
<!-- Cache overview -->
{#if Object.keys(modelCacheStatus).length > 0}
<div class="mt-3 flex flex-wrap gap-2 border-t border-border pt-3">
{#each modelKeys as key}
<div
class="flex items-center gap-1.5 rounded-md border border-border px-2.5 py-1 text-xs"
>
<div
class="h-1.5 w-1.5 rounded-full {modelCacheStatus[key]
? 'bg-green-500'
: 'bg-muted-foreground/30'}"
></div>
<span class="text-muted-foreground">{MODELS[key].displayName}</span>
{#if modelCacheStatus[key]}
<span class="text-green-500">~{MODELS[key].downloadSizeMb} MB</span>
{/if}
</div>
{/each}
</div>
{/if}
</div>
{/if}
<!-- Tabs -->
<div class="mb-4 flex gap-1 rounded-lg border border-border bg-card p-1">
{#each [{ id: 'chat', label: 'Chat' }, { id: 'extract', label: 'JSON Extract' }, { id: 'classify', label: 'Classify' }, { id: 'compare', label: 'Compare' }] as tab}
{#each [{ id: 'chat', label: 'Chat' }, { id: 'extract', label: 'JSON Extract' }, { id: 'classify', label: 'Classify' }, { id: 'compare', label: 'Compare' }, { id: 'benchmark', label: 'Benchmark' }] as tab}
<button
onclick={() => (activeTab = tab.id as typeof activeTab)}
class="flex-1 rounded-md px-3 py-1.5 text-sm font-medium transition-colors {activeTab ===
tab.id
? 'bg-primary text-primary-foreground'
: 'text-muted-foreground hover:text-foreground'}"
: 'text-muted-foreground hover:text-foreground'}">{tab.label}</button
>
{tab.label}
</button>
{/each}
</div>
@ -394,7 +601,6 @@
placeholder="System Prompt (optional)..."
class="rounded-xl border border-border bg-card px-4 py-2.5 text-sm text-foreground placeholder:text-muted-foreground focus:border-primary focus:outline-none"
/>
<div
bind:this={chatContainer}
class="max-h-[60vh] min-h-[300px] space-y-3 overflow-y-auto rounded-xl border border-border bg-background/50 p-4"
@ -420,38 +626,41 @@
<div class="mb-1 text-xs font-medium text-muted-foreground">
{msg.role === 'user' ? 'Du' : modelInfo.displayName}
</div>
<div class="whitespace-pre-wrap text-sm text-foreground">{msg.content}</div>
{#if msg.role === 'assistant'}
<div class="llm-prose text-sm text-foreground">
{@html renderMarkdown(msg.content)}
</div>
{:else}
<div class="whitespace-pre-wrap text-sm text-foreground">{msg.content}</div>
{/if}
</div>
{/each}
{#if streamingContent}
<div class="mr-8 rounded-lg border border-border bg-card p-3">
<div class="mb-1 text-xs font-medium text-muted-foreground">
{modelInfo.displayName}
</div>
<div class="whitespace-pre-wrap text-sm text-foreground">
{streamingContent}<span class="animate-pulse">|</span>
<div class="llm-prose text-sm text-foreground">
{@html renderMarkdown(streamingContent)}<span class="animate-pulse">|</span>
</div>
</div>
{/if}
{/if}
</div>
{#if lastLatency !== null}
<div class="flex gap-4 text-xs text-muted-foreground">
<span>Latenz: {lastLatency}ms</span>
{#if lastTokens}
<span>Prompt: {lastTokens.prompt} tokens</span>
<span>Completion: {lastTokens.completion} tokens</span>
<span>
Speed: {lastLatency > 0
<span
>Speed: {lastLatency > 0
? Math.round((lastTokens.completion / lastLatency) * 1000)
: 0} tok/s
</span>
: 0} tok/s</span
>
{/if}
</div>
{/if}
<div class="flex gap-3">
<textarea
bind:value={userInput}
@ -466,15 +675,13 @@
onclick={handleSend}
disabled={!isReady || !userInput.trim() || isGenerating}
class="rounded-xl bg-primary px-4 py-3 text-sm font-medium text-primary-foreground transition-opacity disabled:opacity-50"
><PaperPlaneRight size={18} /></button
>
<PaperPlaneRight size={18} />
</button>
<button
onclick={handleClear}
class="rounded-xl border border-border px-4 py-3 text-sm text-muted-foreground transition-colors hover:bg-muted"
><Trash size={18} /></button
>
<Trash size={18} />
</button>
</div>
</div>
</div>
@ -485,8 +692,7 @@
<div class="flex flex-col gap-4">
<div class="rounded-xl border border-border bg-card p-4">
<p class="mb-3 text-sm text-muted-foreground">
Extrahiere strukturiertes JSON aus beliebigem Text. Das LLM analysiert den Text und gibt
ein JSON-Objekt zurück.
Extrahiere strukturiertes JSON aus beliebigem Text.
</p>
<input
type="text"
@ -504,11 +710,9 @@
onclick={handleExtract}
disabled={!isReady || !extractText.trim() || extractLoading}
class="mt-3 rounded-lg bg-primary px-4 py-2 text-sm font-medium text-primary-foreground disabled:opacity-50"
>{extractLoading ? 'Extrahiere...' : 'JSON extrahieren'}</button
>
{extractLoading ? 'Extrahiere...' : 'JSON extrahieren'}
</button>
</div>
{#if extractResult}
<div class="rounded-xl border border-border bg-card p-4">
<div class="mb-2 text-xs font-medium text-muted-foreground">Ergebnis</div>
@ -524,8 +728,7 @@
<div class="flex flex-col gap-4">
<div class="rounded-xl border border-border bg-card p-4">
<p class="mb-3 text-sm text-muted-foreground">
Klassifiziere Text in eine von mehreren Kategorien. Das LLM wählt die passendste
Kategorie.
Klassifiziere Text in eine von mehreren Kategorien.
</p>
<input
type="text"
@ -543,11 +746,9 @@
onclick={handleClassify}
disabled={!isReady || !classifyText.trim() || classifyLoading}
class="mt-3 rounded-lg bg-primary px-4 py-2 text-sm font-medium text-primary-foreground disabled:opacity-50"
>{classifyLoading ? 'Klassifiziere...' : 'Klassifizieren'}</button
>
{classifyLoading ? 'Klassifiziere...' : 'Klassifizieren'}
</button>
</div>
{#if classifyResult}
<div class="rounded-xl border border-border bg-card p-4">
<div class="mb-2 text-xs font-medium text-muted-foreground">Ergebnis</div>
@ -563,10 +764,53 @@
{#if activeTab === 'compare'}
<div class="flex flex-col gap-4">
<div class="rounded-xl border border-border bg-card p-4">
<p class="mb-3 text-sm text-muted-foreground">
Denselben Prompt sequentiell gegen alle {modelKeys.length} Modelle testen. Jedes Modell wird
geladen, inferiert und wieder entladen — die Ergebnisse erscheinen nebeneinander.
</p>
<div class="mb-3 flex items-center justify-between">
<p class="text-sm text-muted-foreground">
Denselben Prompt gegen alle {modelKeys.length} Modelle testen.
</p>
{#if compareHistory.length > 0}
<button
onclick={() => (showHistory = !showHistory)}
class="flex items-center gap-1.5 rounded-lg border border-border px-3 py-1 text-xs text-muted-foreground transition-colors hover:bg-muted"
>
<ClockCounterClockwise size={14} />
History ({compareHistory.length})
</button>
{/if}
</div>
{#if showHistory}
<div
class="mb-4 max-h-64 overflow-y-auto rounded-lg border border-border bg-background p-3"
>
{#each compareHistory as entry}
<div
class="flex items-center justify-between border-b border-border py-2 last:border-0"
>
<div class="min-w-0 flex-1">
<div class="truncate text-sm text-foreground">{entry.prompt}</div>
<div class="flex gap-3 text-xs text-muted-foreground">
<span>{formatTime(entry.timestamp)}</span>
<span>{entry.results.length} Modelle</span>
<span>T={entry.temperature}</span>
</div>
</div>
<div class="flex gap-1.5">
<button
onclick={() => restoreHistoryEntry(entry)}
class="rounded px-2.5 py-1 text-xs text-primary hover:bg-primary/10"
>Laden</button
>
<button
onclick={() => deleteHistoryEntry(entry.id)}
class="rounded px-2.5 py-1 text-xs text-red-400 hover:bg-red-500/10"
>&times;</button
>
</div>
</div>
{/each}
</div>
{/if}
<input
type="text"
@ -574,15 +818,13 @@
placeholder="System Prompt (optional)..."
class="mb-3 w-full rounded-lg border border-border bg-background px-3 py-2 text-sm text-foreground placeholder:text-muted-foreground focus:border-primary focus:outline-none"
/>
<textarea
bind:value={comparePrompt}
placeholder="Prompt eingeben, der gegen alle Modelle getestet wird...&#10;&#10;z.B.: Erkläre Quantencomputing in 3 Sätzen."
placeholder="Prompt eingeben, der gegen alle Modelle getestet wird..."
rows={4}
disabled={compareRunning}
class="w-full resize-none rounded-lg border border-border bg-background px-3 py-2 text-sm text-foreground placeholder:text-muted-foreground focus:border-primary focus:outline-none disabled:opacity-50"
></textarea>
<div class="mt-3 flex flex-wrap items-end gap-4">
<div class="flex flex-col gap-1">
<label for="compare-temp" class="text-xs text-muted-foreground">Temperature</label>
@ -614,21 +856,21 @@
onclick={handleCompare}
disabled={!comparePrompt.trim() || compareRunning}
class="rounded-lg bg-primary px-5 py-2 text-sm font-medium text-primary-foreground disabled:opacity-50"
>{compareRunning
? 'Läuft...'
: `Alle ${modelKeys.length} Modelle vergleichen`}</button
>
{compareRunning ? 'Läuft...' : `Alle ${modelKeys.length} Modelle vergleichen`}
</button>
</div>
</div>
<!-- Running indicator -->
{#if compareRunning && compareCurrentModel}
<div class="rounded-xl border border-yellow-500/30 bg-yellow-500/5 p-4">
<div class="mb-2 flex items-center gap-2">
<div class="h-2.5 w-2.5 animate-pulse rounded-full bg-yellow-500"></div>
<span class="text-sm font-medium text-foreground">{compareCurrentModel}</span>
<span class="text-xs text-muted-foreground">
({compareResults.length + 1}/{modelKeys.length})
</span>
<span class="text-xs text-muted-foreground"
>({compareResults.length + 1}/{modelKeys.length})</span
>
</div>
{#if compareStreamingContent}
<div
@ -642,9 +884,7 @@
</div>
{/if}
<!-- Results -->
{#if compareResults.length > 0}
<!-- Stats comparison table -->
<div class="overflow-x-auto rounded-xl border border-border">
<table class="w-full text-sm">
<thead>
@ -652,51 +892,42 @@
<th class="px-4 py-2.5 text-left text-xs font-medium text-muted-foreground"
>Metrik</th
>
{#each compareResults as r}
<th class="px-4 py-2.5 text-left text-xs font-medium text-muted-foreground">
{r.displayName}
</th>
{/each}
{#each compareResults as r}<th
class="px-4 py-2.5 text-left text-xs font-medium text-muted-foreground"
>{r.displayName}</th
>{/each}
</tr>
</thead>
<tbody>
<tr class="border-b border-border">
<td class="px-4 py-2 text-muted-foreground">Latenz</td>
{#each compareResults as r}
<td class="px-4 py-2 font-mono {r.error ? 'text-red-400' : 'text-foreground'}">
{r.error ? 'Fehler' : `${(r.latencyMs / 1000).toFixed(1)}s`}
</td>
{/each}
{#each compareResults as r}<td
class="px-4 py-2 font-mono {r.error ? 'text-red-400' : 'text-foreground'}"
>{r.error ? 'Fehler' : `${(r.latencyMs / 1000).toFixed(1)}s`}</td
>{/each}
</tr>
<tr class="border-b border-border">
<td class="px-4 py-2 text-muted-foreground">Speed</td>
{#each compareResults as r}
<td class="px-4 py-2 font-mono text-foreground">
{r.error ? '—' : `${r.tokPerSec} tok/s`}
</td>
{/each}
{#each compareResults as r}<td class="px-4 py-2 font-mono text-foreground"
>{r.error ? '—' : `${r.tokPerSec} tok/s`}</td
>{/each}
</tr>
<tr class="border-b border-border">
<td class="px-4 py-2 text-muted-foreground">Prompt Tokens</td>
{#each compareResults as r}
<td class="px-4 py-2 font-mono text-foreground">
{r.error ? '—' : r.promptTokens}
</td>
{/each}
{#each compareResults as r}<td class="px-4 py-2 font-mono text-foreground"
>{r.error ? '—' : r.promptTokens}</td
>{/each}
</tr>
<tr>
<td class="px-4 py-2 text-muted-foreground">Completion Tokens</td>
{#each compareResults as r}
<td class="px-4 py-2 font-mono text-foreground">
{r.error ? '—' : r.completionTokens}
</td>
{/each}
{#each compareResults as r}<td class="px-4 py-2 font-mono text-foreground"
>{r.error ? '—' : r.completionTokens}</td
>{/each}
</tr>
</tbody>
</table>
</div>
<!-- Outputs side by side -->
<div
class="grid gap-4"
style="grid-template-columns: repeat({compareResults.length}, minmax(0, 1fr));"
@ -710,14 +941,10 @@
>
</div>
{#if r.error}
<div class="rounded-lg bg-red-500/10 p-3 text-sm text-red-400">
{r.error}
</div>
<div class="rounded-lg bg-red-500/10 p-3 text-sm text-red-400">{r.error}</div>
{:else}
<div
class="max-h-[50vh] overflow-y-auto whitespace-pre-wrap text-sm text-foreground"
>
{r.content}
<div class="llm-prose max-h-[50vh] overflow-y-auto text-sm text-foreground">
{@html renderMarkdown(r.content)}
</div>
{/if}
</div>
@ -726,5 +953,186 @@
{/if}
</div>
{/if}
<!-- Benchmark Tab -->
{#if activeTab === 'benchmark'}
<div class="flex flex-col gap-4">
<div class="rounded-xl border border-border bg-card p-4">
<p class="mb-3 text-sm text-muted-foreground">
Denselben Prompt N-mal gegen das geladene Modell laufen lassen, um Varianz zu messen.
</p>
{#if !isReady}
<div class="rounded-lg bg-yellow-500/10 p-3 text-sm text-yellow-400">
Lade zuerst ein Modell im Model-Controls-Bereich oben.
</div>
{:else}
<input
type="text"
bind:value={benchmarkSystemPrompt}
placeholder="System Prompt (optional)..."
class="mb-3 w-full rounded-lg border border-border bg-background px-3 py-2 text-sm text-foreground placeholder:text-muted-foreground focus:border-primary focus:outline-none"
/>
<textarea
bind:value={benchmarkPrompt}
placeholder="Prompt für den Benchmark...&#10;&#10;z.B.: Zähle von 1 bis 20."
rows={3}
disabled={benchmarkRunning}
class="w-full resize-none rounded-lg border border-border bg-background px-3 py-2 text-sm text-foreground placeholder:text-muted-foreground focus:border-primary focus:outline-none disabled:opacity-50"
></textarea>
<div class="mt-3 flex flex-wrap items-end gap-4">
<div class="flex flex-col gap-1">
<label for="bench-iters" class="text-xs text-muted-foreground">Iterationen</label>
<input
id="bench-iters"
type="number"
min="1"
max="50"
bind:value={benchmarkIterations}
disabled={benchmarkRunning}
class="w-24 rounded-lg border border-border bg-background px-3 py-1.5 text-sm text-foreground disabled:opacity-50"
/>
</div>
<div class="flex flex-col gap-1">
<label for="bench-temp" class="text-xs text-muted-foreground">Temperature</label>
<input
id="bench-temp"
type="number"
min="0"
max="2"
step="0.1"
bind:value={benchmarkTemperature}
disabled={benchmarkRunning}
class="w-24 rounded-lg border border-border bg-background px-3 py-1.5 text-sm text-foreground disabled:opacity-50"
/>
</div>
<div class="flex flex-col gap-1">
<label for="bench-tokens" class="text-xs text-muted-foreground">Max Tokens</label>
<input
id="bench-tokens"
type="number"
min="16"
max="2048"
step="16"
bind:value={benchmarkMaxTokens}
disabled={benchmarkRunning}
class="w-24 rounded-lg border border-border bg-background px-3 py-1.5 text-sm text-foreground disabled:opacity-50"
/>
</div>
<button
onclick={handleBenchmark}
disabled={!benchmarkPrompt.trim() || benchmarkRunning}
class="rounded-lg bg-primary px-5 py-2 text-sm font-medium text-primary-foreground disabled:opacity-50"
>
{#if benchmarkRunning}Run {benchmarkCurrentRun}/{benchmarkIterations}...{:else}Benchmark
starten{/if}
</button>
</div>
{/if}
</div>
{#if benchmarkRunning}
<div class="rounded-xl border border-yellow-500/30 bg-yellow-500/5 p-4">
<div class="mb-2 flex items-center gap-2">
<div class="h-2.5 w-2.5 animate-pulse rounded-full bg-yellow-500"></div>
<span class="text-sm font-medium text-foreground"
>{modelInfo.displayName} — Run {benchmarkCurrentRun}/{benchmarkIterations}</span
>
</div>
<div class="h-2 overflow-hidden rounded-full bg-muted">
<div
class="h-full rounded-full bg-primary transition-all duration-300"
style="width: {Math.round((benchmarkCurrentRun / benchmarkIterations) * 100)}%"
></div>
</div>
</div>
{/if}
{#if benchmarkStats}
<div class="grid grid-cols-2 gap-4">
<div class="rounded-xl border border-border bg-card p-4">
<div class="mb-2 text-xs font-medium text-muted-foreground">Latenz (ms)</div>
<div class="grid grid-cols-2 gap-y-2 text-sm">
<span class="text-muted-foreground">Durchschnitt</span><span
class="font-mono text-foreground">{benchmarkStats.avgLatency}</span
>
<span class="text-muted-foreground">Median</span><span
class="font-mono text-foreground">{benchmarkStats.medianLatency}</span
>
<span class="text-muted-foreground">Min</span><span class="font-mono text-green-500"
>{benchmarkStats.minLatency}</span
>
<span class="text-muted-foreground">Max</span><span class="font-mono text-red-400"
>{benchmarkStats.maxLatency}</span
>
</div>
</div>
<div class="rounded-xl border border-border bg-card p-4">
<div class="mb-2 text-xs font-medium text-muted-foreground">Speed (tok/s)</div>
<div class="grid grid-cols-2 gap-y-2 text-sm">
<span class="text-muted-foreground">Durchschnitt</span><span
class="font-mono text-foreground">{benchmarkStats.avgTokPerSec}</span
>
<span class="text-muted-foreground">Median</span><span
class="font-mono text-foreground">{benchmarkStats.medianTokPerSec}</span
>
<span class="text-muted-foreground">Max</span><span class="font-mono text-green-500"
>{benchmarkStats.maxTokPerSec}</span
>
<span class="text-muted-foreground">Min</span><span class="font-mono text-red-400"
>{benchmarkStats.minTokPerSec}</span
>
</div>
</div>
</div>
<div class="overflow-x-auto rounded-xl border border-border">
<table class="w-full text-sm">
<thead>
<tr class="border-b border-border bg-card">
<th class="px-4 py-2 text-left text-xs font-medium text-muted-foreground">Run</th>
<th class="px-4 py-2 text-left text-xs font-medium text-muted-foreground"
>Latenz</th
>
<th class="px-4 py-2 text-left text-xs font-medium text-muted-foreground"
>tok/s</th
>
<th class="px-4 py-2 text-left text-xs font-medium text-muted-foreground"
>Tokens</th
>
<th class="px-4 py-2 text-left text-xs font-medium text-muted-foreground"
>Verteilung</th
>
</tr>
</thead>
<tbody>
{#each benchmarkStats.runs as run}
{@const maxLat = benchmarkStats.maxLatency || 1}
<tr class="border-b border-border last:border-0">
<td class="px-4 py-2 font-mono text-muted-foreground">#{run.iteration}</td>
<td class="px-4 py-2 font-mono text-foreground"
>{(run.latencyMs / 1000).toFixed(2)}s</td
>
<td class="px-4 py-2 font-mono text-foreground">{run.tokPerSec}</td>
<td class="px-4 py-2 font-mono text-foreground">{run.completionTokens}</td>
<td class="px-4 py-2"
><div class="h-3 w-full rounded-full bg-muted">
<div
class="h-full rounded-full bg-primary/60"
style="width: {Math.round((run.latencyMs / maxLat) * 100)}%"
></div>
</div></td
>
</tr>
{/each}
</tbody>
</table>
</div>
<div class="text-xs text-muted-foreground">
Total: {benchmarkStats.totalTokens} tokens in {benchmarkStats.runs.length} runs ({modelInfo.displayName})
</div>
{/if}
</div>
{/if}
{/if}
</div>