Mirror of https://github.com/Memo-2023/mana-monorepo.git — synced 2026-05-14 22:41:09 +02:00
feat(local-llm): add client-side LLM inference package with WebLLM
New shared package for browser-based LLM inference using Qwen 2.5 1.5B via WebLLM. Includes Svelte 5 reactive stores, engine management, and type definitions for local AI features without server round trips.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit ef538245d1 (parent 4116715db0)
7 changed files with 501 additions and 0 deletions
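For orientation, a minimal consumer-side sketch of the API these files add. The import path and function names come from the diff below; the bootstrap wrapper itself is hypothetical app code, not part of the commit.

import { LocalLLMEngine, localLLM } from '@manacore/local-llm';

// Hypothetical app bootstrap; everything outside the import is illustrative.
async function bootstrapLocalAI(): Promise<void> {
  if (!LocalLLMEngine.isSupported()) return; // no WebGPU: skip local inference
  await localLLM.load(); // first call downloads the weights, then served from cache
  const reply = await localLLM.prompt('Say hello in one short sentence.');
  console.log(reply);
}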
26  packages/local-llm/package.json  Normal file
@@ -0,0 +1,26 @@
{
  "name": "@manacore/local-llm",
  "version": "0.1.0",
  "private": true,
  "description": "Client-side LLM inference via WebLLM (Qwen 2.5 1.5B) with Svelte 5 reactive stores",
  "main": "./src/index.ts",
  "types": "./src/index.ts",
  "exports": {
    ".": "./src/index.ts"
  },
  "scripts": {
    "type-check": "tsc --noEmit",
    "clean": "rm -rf dist"
  },
  "dependencies": {
    "@mlc-ai/web-llm": "^0.2.78"
  },
  "devDependencies": {
    "@types/node": "^24.10.1",
    "svelte": "^5.0.0",
    "typescript": "^5.9.3"
  },
  "peerDependencies": {
    "svelte": "^5.0.0"
  }
}
266  packages/local-llm/src/engine.ts  Normal file
@@ -0,0 +1,266 @@
/**
 * LocalLLMEngine — WebLLM wrapper for client-side inference.
 *
 * Lazy-loads the model on first use, caches weights in browser Cache API.
 * Provides both one-shot and streaming generation.
 */

import type { MLCEngine } from '@mlc-ai/web-llm';
import type {
  ChatMessage,
  GenerateOptions,
  GenerateResult,
  LoadingStatus,
  ModelConfig,
} from './types';
import { MODELS, DEFAULT_MODEL, type ModelKey } from './models';

export class LocalLLMEngine {
  private engine: MLCEngine | null = null;
  private loadPromise: Promise<void> | null = null;
  private currentModel: ModelKey | null = null;
  private _status: LoadingStatus = { state: 'idle' };
  private statusListeners: Set<(status: LoadingStatus) => void> = new Set();

  get status(): LoadingStatus {
    return this._status;
  }

  get isReady(): boolean {
    return this._status.state === 'ready';
  }

  get modelConfig(): ModelConfig | null {
    return this.currentModel ? MODELS[this.currentModel] : null;
  }

  /**
   * Subscribe to status changes (for non-Svelte usage).
   */
  onStatusChange(listener: (status: LoadingStatus) => void): () => void {
    this.statusListeners.add(listener);
    return () => this.statusListeners.delete(listener);
  }

  private setStatus(status: LoadingStatus) {
    this._status = status;
    for (const listener of this.statusListeners) {
      listener(status);
    }
  }

  /**
   * Check if WebGPU is available in this browser.
   */
  static isSupported(): boolean {
    return typeof navigator !== 'undefined' && 'gpu' in navigator;
  }

  /**
   * Load a model. Idempotent — returns immediately if already loaded.
   * Model weights are cached in browser Cache API for instant reload.
   */
  async load(model: ModelKey = DEFAULT_MODEL): Promise<void> {
    // Already loaded with this model
    if (this.engine && this.currentModel === model) return;

    // Already loading
    if (this.loadPromise && this.currentModel === model) return this.loadPromise;

    // Unload previous model if switching
    if (this.engine && this.currentModel !== model) {
      await this.unload();
    }

    this.currentModel = model;
    this.loadPromise = this._load(model);
    return this.loadPromise;
  }

  private async _load(model: ModelKey): Promise<void> {
    if (!LocalLLMEngine.isSupported()) {
      this.setStatus({ state: 'error', error: 'WebGPU not supported in this browser' });
      throw new Error('WebGPU not supported');
    }

    this.setStatus({ state: 'checking' });

    try {
      const { CreateMLCEngine } = await import('@mlc-ai/web-llm');
      const config = MODELS[model];

      this.engine = await CreateMLCEngine(config.modelId, {
        initProgressCallback: (report) => {
          if (report.progress < 1) {
            this.setStatus({
              state: 'downloading',
              progress: report.progress,
              text: report.text,
            });
          } else {
            this.setStatus({ state: 'loading', text: 'Initializing model...' });
          }
        },
      });

      this.setStatus({ state: 'ready' });
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      this.setStatus({ state: 'error', error: message });
      this.loadPromise = null;
      throw err;
    }
  }

  /**
   * Unload the model and free memory.
   */
  async unload(): Promise<void> {
    if (this.engine) {
      await this.engine.unload();
      this.engine = null;
    }
    this.currentModel = null;
    this.loadPromise = null;
    this.setStatus({ state: 'idle' });
  }

  /**
   * Generate a response. Auto-loads the model if not yet loaded.
   */
  async generate(options: GenerateOptions): Promise<GenerateResult> {
    if (!this.engine) {
      await this.load();
    }

    const start = performance.now();

    if (options.onToken) {
      return this._generateStreaming(options, start);
    }

    const response = await this.engine!.chat.completions.create({
      messages: options.messages,
      temperature: options.temperature ?? 0.7,
      max_tokens: options.maxTokens ?? 1024,
      stream: false,
    });

    const choice = response.choices[0];
    return {
      content: choice.message.content ?? '',
      usage: {
        prompt_tokens: response.usage?.prompt_tokens ?? 0,
        completion_tokens: response.usage?.completion_tokens ?? 0,
        total_tokens: response.usage?.total_tokens ?? 0,
      },
      latencyMs: Math.round(performance.now() - start),
    };
  }

  private async _generateStreaming(
    options: GenerateOptions,
    start: number
  ): Promise<GenerateResult> {
    const chunks: string[] = [];
    let usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };

    const stream = await this.engine!.chat.completions.create({
      messages: options.messages,
      temperature: options.temperature ?? 0.7,
      max_tokens: options.maxTokens ?? 1024,
      stream: true,
      stream_options: { include_usage: true },
    });

    for await (const chunk of stream) {
      const delta = chunk.choices[0]?.delta?.content;
      if (delta) {
        chunks.push(delta);
        options.onToken!(delta);
      }
      if (chunk.usage) {
        usage = {
          prompt_tokens: chunk.usage.prompt_tokens,
          completion_tokens: chunk.usage.completion_tokens,
          total_tokens: chunk.usage.total_tokens,
        };
      }
    }

    return {
      content: chunks.join(''),
      usage,
      latencyMs: Math.round(performance.now() - start),
    };
  }

  /**
   * Convenience: single prompt → response.
   */
  async prompt(
    text: string,
    opts?: { systemPrompt?: string; temperature?: number; maxTokens?: number }
  ): Promise<string> {
    const messages: ChatMessage[] = [];
    if (opts?.systemPrompt) {
      messages.push({ role: 'system', content: opts.systemPrompt });
    }
    messages.push({ role: 'user', content: text });

    const result = await this.generate({
      messages,
      temperature: opts?.temperature,
      maxTokens: opts?.maxTokens,
    });
    return result.content;
  }

  /**
   * Convenience: extract structured JSON from text.
   */
  async extractJson<T = unknown>(
    text: string,
    instruction: string,
    opts?: { temperature?: number }
  ): Promise<T> {
    const result = await this.generate({
      messages: [
        {
          role: 'system',
          content:
            'You are a JSON extraction assistant. Always respond with valid JSON only, no markdown, no explanation.',
        },
        {
          role: 'user',
          content: `${instruction}\n\nText:\n${text}`,
        },
      ],
      temperature: opts?.temperature ?? 0.1,
      maxTokens: 2048,
    });

    return JSON.parse(result.content) as T;
  }

  /**
   * Convenience: classify text into categories.
   */
  async classify(text: string, categories: string[], opts?: { context?: string }): Promise<string> {
    const categoryList = categories.map((c) => `"${c}"`).join(', ');
    const result = await this.generate({
      messages: [
        {
          role: 'system',
          content: `Classify the text into exactly one of these categories: ${categoryList}. Respond with only the category name, nothing else.${opts?.context ? ` Context: ${opts.context}` : ''}`,
        },
        { role: 'user', content: text },
      ],
      temperature: 0,
      maxTokens: 50,
    });

    const normalized = result.content.trim().replace(/^["']|["']$/g, '');
    // Return the closest matching category
    const match = categories.find((c) => c.toLowerCase() === normalized.toLowerCase());
    return match ?? normalized;
  }
}

/** Singleton instance for app-wide use */
export const localLLM = new LocalLLMEngine();
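A short sketch of the streaming and structured-output paths of the engine above. The prompt strings and the `Ticket` shape are illustrative, not part of the commit; only the imported names come from the diff.

import { localLLM } from '@manacore/local-llm';

// Streaming: supplying onToken routes generate() through _generateStreaming.
let streamed = '';
const result = await localLLM.generate({
  messages: [{ role: 'user', content: 'List three uses of WebGPU.' }],
  maxTokens: 256,
  onToken: (t) => { streamed += t; }, // e.g. mirror into UI state as tokens arrive
});
console.log(`${result.usage.total_tokens} tokens in ${result.latencyMs} ms`);

// Structured output: extractJson parses the model's reply with JSON.parse,
// which throws on malformed output — callers may want a try/catch.
// The Ticket interface is hypothetical.
interface Ticket { title: string; priority: 'low' | 'high' }
const ticket = await localLLM.extractJson<Ticket>(
  'Printer on floor 3 is jammed again and blocking the whole team.',
  'Extract a ticket with fields "title" and "priority" ("low" or "high").'
);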
27  packages/local-llm/src/index.ts  Normal file
@@ -0,0 +1,27 @@
// Engine
export { LocalLLMEngine, localLLM } from './engine';

// Models
export { MODELS, DEFAULT_MODEL } from './models';
export type { ModelKey } from './models';

// Types
export type {
  ChatMessage,
  GenerateOptions,
  GenerateResult,
  ModelConfig,
  LoadingStatus,
} from './types';

// Svelte 5 reactive helpers
export {
  getLocalLlmStatus,
  loadLocalLlm,
  unloadLocalLlm,
  isLocalLlmSupported,
  generate,
  generateText,
  extractJson,
  classify,
} from './svelte.svelte';
27  packages/local-llm/src/models.ts  Normal file
@@ -0,0 +1,27 @@
import type { ModelConfig } from './types';

/**
 * Pre-configured models for client-side inference.
 * All models are quantized for browser use via WebLLM/MLC.
 */

export const MODELS = {
  /** Default model — fast, good at structured output, multilingual */
  'qwen-2.5-1.5b': {
    modelId: 'Qwen2.5-1.5B-Instruct-q4f16_1-MLC',
    displayName: 'Qwen 2.5 1.5B',
    downloadSizeMb: 1000,
    ramUsageMb: 1800,
  },
  /** Smaller variant for low-end devices */
  'qwen-2.5-0.5b': {
    modelId: 'Qwen2.5-0.5B-Instruct-q4f16_1-MLC',
    displayName: 'Qwen 2.5 0.5B',
    downloadSizeMb: 400,
    ramUsageMb: 800,
  },
} as const satisfies Record<string, ModelConfig>;

export type ModelKey = keyof typeof MODELS;

export const DEFAULT_MODEL: ModelKey = 'qwen-2.5-1.5b';
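One way the two entries might drive runtime model selection — a sketch under stated assumptions: the `navigator.deviceMemory` heuristic and the 4 GiB threshold are my own, not anything this commit prescribes, and that API is unavailable in some browsers.

import { MODELS, type ModelKey } from '@manacore/local-llm';

// Hypothetical heuristic: fall back to the 0.5B variant on low-memory devices.
// navigator.deviceMemory reports approximate GiB; undefined where unsupported.
function pickModel(): ModelKey {
  const gib = (navigator as { deviceMemory?: number }).deviceMemory ?? 8;
  return gib >= 4 ? 'qwen-2.5-1.5b' : 'qwen-2.5-0.5b';
}

const choice = pickModel();
console.log(`${MODELS[choice].displayName}: ~${MODELS[choice].downloadSizeMb} MB download`);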
87  packages/local-llm/src/svelte.svelte.ts  Normal file
@@ -0,0 +1,87 @@
/**
 * Svelte 5 reactive integration for LocalLLMEngine.
 *
 * Usage in a Svelte component:
 *   import { getLocalLlmStatus, loadLocalLlm, generateText } from '@manacore/local-llm';
 *
 *   const status = getLocalLlmStatus();
 *   loadLocalLlm();
 *   // use status.current reactively
 */

import { LocalLLMEngine, localLLM } from './engine';
import type { LoadingStatus, GenerateOptions, GenerateResult } from './types';
import type { ModelKey } from './models';

/**
 * Reactive status using Svelte 5 $state rune.
 */
let _status = $state<LoadingStatus>({ state: 'idle' });

localLLM.onStatusChange((s) => {
  _status = s;
});

export function getLocalLlmStatus(): { readonly current: LoadingStatus } {
  return {
    get current() {
      return _status;
    },
  };
}

/**
 * Load the model. Safe to call multiple times.
 */
export async function loadLocalLlm(model?: ModelKey): Promise<void> {
  return localLLM.load(model);
}

/**
 * Unload the model and free memory.
 */
export async function unloadLocalLlm(): Promise<void> {
  return localLLM.unload();
}

/**
 * Check if WebGPU is available.
 */
export function isLocalLlmSupported(): boolean {
  return LocalLLMEngine.isSupported();
}

/**
 * Generate with full options (messages, streaming, etc.)
 */
export async function generate(options: GenerateOptions): Promise<GenerateResult> {
  return localLLM.generate(options);
}

/**
 * Quick text generation from a single prompt.
 */
export async function generateText(
  prompt: string,
  opts?: { systemPrompt?: string; temperature?: number; maxTokens?: number }
): Promise<string> {
  return localLLM.prompt(prompt, opts);
}

/**
 * Extract structured JSON from text.
 */
export async function extractJson<T = unknown>(text: string, instruction: string): Promise<T> {
  return localLLM.extractJson<T>(text, instruction);
}

/**
 * Classify text into one of the given categories.
 */
export async function classify(
  text: string,
  categories: string[],
  opts?: { context?: string }
): Promise<string> {
  return localLLM.classify(text, categories, opts);
}
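Putting the helpers together in component-side code — a minimal sketch, assuming it lives in a `.svelte.ts` module compiled by Svelte 5 so reads of `llmStatus.current` stay reactive; the `askQuestion` wrapper and its prompts are hypothetical.

import { getLocalLlmStatus, loadLocalLlm, generateText } from '@manacore/local-llm';

const llmStatus = getLocalLlmStatus();

// Kick off loading eagerly; failures also surface through llmStatus.current.
loadLocalLlm().catch(() => { /* reflected in status as { state: 'error' } */ });

export async function askQuestion(q: string): Promise<string> {
  if (llmStatus.current.state === 'error') {
    throw new Error(llmStatus.current.error);
  }
  // generate() auto-loads if the model is not ready yet.
  return generateText(q, { systemPrompt: 'Answer briefly.', maxTokens: 512 });
}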
54  packages/local-llm/src/types.ts  Normal file
@@ -0,0 +1,54 @@
/**
 * Types for client-side LLM inference.
 * Aligned with @manacore/shared-llm ChatMessage/ChatResult where possible.
 */

export interface ChatMessage {
  role: 'system' | 'user' | 'assistant';
  content: string;
}

export interface GenerateOptions {
  /** Messages to send */
  messages: ChatMessage[];
  /** Sampling temperature 0.0-2.0 (default: 0.7) */
  temperature?: number;
  /** Max tokens to generate (default: 1024) */
  maxTokens?: number;
  /** Callback for each generated token (streaming) */
  onToken?: (token: string) => void;
}

export interface GenerateResult {
  /** Generated text */
  content: string;
  /** Token usage */
  usage: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
  };
  /** Generation time in ms */
  latencyMs: number;
}

export interface ModelConfig {
  /** WebLLM model identifier */
  modelId: string;
  /** Human-readable name */
  displayName: string;
  /** Approximate download size in MB */
  downloadSizeMb: number;
  /** Approximate VRAM/RAM usage in MB */
  ramUsageMb: number;
  /** Default system prompt */
  defaultSystemPrompt?: string;
}

export type LoadingStatus =
  | { state: 'idle' }
  | { state: 'checking' }
  | { state: 'downloading'; progress: number; text: string }
  | { state: 'loading'; text: string }
  | { state: 'ready' }
  | { state: 'error'; error: string };
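The `LoadingStatus` discriminated union lets UI code switch exhaustively on `state`, with each branch narrowing to that variant's extra fields. A small sketch; the label strings are illustrative.

import type { LoadingStatus } from '@manacore/local-llm';

// Each case narrows s, so s.progress, s.text, and s.error type-check
// only in the branches where they exist.
function statusLabel(s: LoadingStatus): string {
  switch (s.state) {
    case 'idle':        return 'Not loaded';
    case 'checking':    return 'Checking WebGPU support…';
    case 'downloading': return `Downloading ${Math.round(s.progress * 100)}%`;
    case 'loading':     return s.text;
    case 'ready':       return 'Ready';
    case 'error':       return `Error: ${s.error}`;
  }
}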
14  packages/local-llm/tsconfig.json  Normal file
@@ -0,0 +1,14 @@
{
  "compilerOptions": {
    "target": "ES2022",
    "module": "ESNext",
    "moduleResolution": "bundler",
    "lib": ["ES2022", "DOM"],
    "strict": true,
    "noEmit": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules"]
}