From ef538245d11c2d50dd6d1f75484cc8c78f2c6f19 Mon Sep 17 00:00:00 2001
From: Till JS
Date: Thu, 2 Apr 2026 01:53:54 +0200
Subject: [PATCH] feat(local-llm): add client-side LLM inference package with WebLLM

New shared package for browser-based LLM inference using Qwen 2.5 1.5B
via WebLLM. Includes Svelte 5 reactive stores, engine management, and
type definitions for local AI features without server roundtrips.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 packages/local-llm/package.json         |  26 +++
 packages/local-llm/src/engine.ts        | 266 ++++++++++++++++++++++++
 packages/local-llm/src/index.ts         |  27 +++
 packages/local-llm/src/models.ts        |  27 +++
 packages/local-llm/src/svelte.svelte.ts |  87 ++++++++
 packages/local-llm/src/types.ts         |  54 +++++
 packages/local-llm/tsconfig.json        |  14 ++
 7 files changed, 501 insertions(+)
 create mode 100644 packages/local-llm/package.json
 create mode 100644 packages/local-llm/src/engine.ts
 create mode 100644 packages/local-llm/src/index.ts
 create mode 100644 packages/local-llm/src/models.ts
 create mode 100644 packages/local-llm/src/svelte.svelte.ts
 create mode 100644 packages/local-llm/src/types.ts
 create mode 100644 packages/local-llm/tsconfig.json

diff --git a/packages/local-llm/package.json b/packages/local-llm/package.json
new file mode 100644
index 000000000..9a54f1d59
--- /dev/null
+++ b/packages/local-llm/package.json
@@ -0,0 +1,26 @@
+{
+  "name": "@manacore/local-llm",
+  "version": "0.1.0",
+  "private": true,
+  "description": "Client-side LLM inference via WebLLM (Qwen 2.5 1.5B) with Svelte 5 reactive stores",
+  "main": "./src/index.ts",
+  "types": "./src/index.ts",
+  "exports": {
+    ".": "./src/index.ts"
+  },
+  "scripts": {
+    "type-check": "tsc --noEmit",
+    "clean": "rm -rf dist"
+  },
+  "dependencies": {
+    "@mlc-ai/web-llm": "^0.2.78"
+  },
+  "devDependencies": {
+    "@types/node": "^24.10.1",
+    "svelte": "^5.0.0",
+    "typescript": "^5.9.3"
+  },
+  "peerDependencies": {
+    "svelte": "^5.0.0"
+  }
+}
diff --git a/packages/local-llm/src/engine.ts b/packages/local-llm/src/engine.ts
new file mode 100644
index 000000000..ce7c7b66a
--- /dev/null
+++ b/packages/local-llm/src/engine.ts
@@ -0,0 +1,266 @@
+/**
+ * LocalLLMEngine — WebLLM wrapper for client-side inference.
+ *
+ * Lazy-loads the model on first use, caches weights in the browser Cache API.
+ * Provides both one-shot and streaming generation.
+ */
+
+import type { MLCEngine } from '@mlc-ai/web-llm';
+import type { ChatMessage, GenerateOptions, GenerateResult, LoadingStatus } from './types';
+import type { ModelConfig } from './types';
+import { MODELS, DEFAULT_MODEL, type ModelKey } from './models';
+
+export class LocalLLMEngine {
+  private engine: MLCEngine | null = null;
+  private loadPromise: Promise<void> | null = null;
+  private currentModel: ModelKey | null = null;
+  private _status: LoadingStatus = { state: 'idle' };
+  private statusListeners: Set<(status: LoadingStatus) => void> = new Set();
+
+  get status(): LoadingStatus {
+    return this._status;
+  }
+
+  get isReady(): boolean {
+    return this._status.state === 'ready';
+  }
+
+  get modelConfig(): ModelConfig | null {
+    return this.currentModel ? MODELS[this.currentModel] : null;
+  }
+
+  /**
+   * Subscribe to status changes (for non-Svelte usage).
+   */
+  onStatusChange(listener: (status: LoadingStatus) => void): () => void {
+    this.statusListeners.add(listener);
+    return () => this.statusListeners.delete(listener);
+  }
+
+  private setStatus(status: LoadingStatus) {
+    this._status = status;
+    for (const listener of this.statusListeners) {
+      listener(status);
+    }
+  }
+
+  /**
+   * Check if WebGPU is available in this browser.
+   */
+  static isSupported(): boolean {
+    return typeof navigator !== 'undefined' && 'gpu' in navigator;
+  }
+
+  /**
+   * Load a model. Idempotent — returns immediately if already loaded.
+   * Model weights are cached in the browser Cache API for instant reload.
+   */
+  async load(model: ModelKey = DEFAULT_MODEL): Promise<void> {
+    // Already loaded with this model
+    if (this.engine && this.currentModel === model) return;
+
+    // Already loading
+    if (this.loadPromise && this.currentModel === model) return this.loadPromise;
+
+    // Unload previous model if switching
+    if (this.engine && this.currentModel !== model) {
+      await this.unload();
+    }
+
+    this.currentModel = model;
+    this.loadPromise = this._load(model);
+    return this.loadPromise;
+  }
+
+  private async _load(model: ModelKey): Promise<void> {
+    if (!LocalLLMEngine.isSupported()) {
+      this.setStatus({ state: 'error', error: 'WebGPU not supported in this browser' });
+      throw new Error('WebGPU not supported');
+    }
+
+    this.setStatus({ state: 'checking' });
+
+    try {
+      const { CreateMLCEngine } = await import('@mlc-ai/web-llm');
+      const config = MODELS[model];
+
+      this.engine = await CreateMLCEngine(config.modelId, {
+        initProgressCallback: (report) => {
+          if (report.progress < 1) {
+            this.setStatus({
+              state: 'downloading',
+              progress: report.progress,
+              text: report.text,
+            });
+          } else {
+            this.setStatus({ state: 'loading', text: 'Initializing model...' });
+          }
+        },
+      });
+
+      this.setStatus({ state: 'ready' });
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      this.setStatus({ state: 'error', error: message });
+      this.loadPromise = null;
+      throw err;
+    }
+  }
+
+  /**
+   * Unload the model and free memory.
+   */
+  async unload(): Promise<void> {
+    if (this.engine) {
+      await this.engine.unload();
+      this.engine = null;
+    }
+    this.currentModel = null;
+    this.loadPromise = null;
+    this.setStatus({ state: 'idle' });
+  }
+
+  /**
+   * Generate a response. Auto-loads the model if not yet loaded.
+   */
+  async generate(options: GenerateOptions): Promise<GenerateResult> {
+    if (!this.engine) {
+      await this.load();
+    }
+
+    const start = performance.now();
+
+    if (options.onToken) {
+      return this._generateStreaming(options, start);
+    }
+
+    const response = await this.engine!.chat.completions.create({
+      messages: options.messages,
+      temperature: options.temperature ?? 0.7,
+      max_tokens: options.maxTokens ?? 1024,
+      stream: false,
+    });
+
+    const choice = response.choices[0];
+    return {
+      content: choice.message.content ?? '',
+      usage: {
+        prompt_tokens: response.usage?.prompt_tokens ?? 0,
+        completion_tokens: response.usage?.completion_tokens ?? 0,
+        total_tokens: response.usage?.total_tokens ?? 0,
+      },
+      latencyMs: Math.round(performance.now() - start),
+    };
+  }
+
+  private async _generateStreaming(
+    options: GenerateOptions,
+    start: number
+  ): Promise<GenerateResult> {
+    const chunks: string[] = [];
+    let usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
+
+    const stream = await this.engine!.chat.completions.create({
+      messages: options.messages,
+      temperature: options.temperature ?? 0.7,
+      max_tokens: options.maxTokens ?? 1024,
+      stream: true,
+      stream_options: { include_usage: true },
+    });
+
+    for await (const chunk of stream) {
+      const delta = chunk.choices[0]?.delta?.content;
+      if (delta) {
+        chunks.push(delta);
+        options.onToken!(delta);
+      }
+      if (chunk.usage) {
+        usage = {
+          prompt_tokens: chunk.usage.prompt_tokens,
+          completion_tokens: chunk.usage.completion_tokens,
+          total_tokens: chunk.usage.total_tokens,
+        };
+      }
+    }
+
+    return {
+      content: chunks.join(''),
+      usage,
+      latencyMs: Math.round(performance.now() - start),
+    };
+  }
+
+  /**
+   * Convenience: single prompt → response.
+   */
+  async prompt(
+    text: string,
+    opts?: { systemPrompt?: string; temperature?: number; maxTokens?: number }
+  ): Promise<string> {
+    const messages: ChatMessage[] = [];
+    if (opts?.systemPrompt) {
+      messages.push({ role: 'system', content: opts.systemPrompt });
+    }
+    messages.push({ role: 'user', content: text });
+
+    const result = await this.generate({
+      messages,
+      temperature: opts?.temperature,
+      maxTokens: opts?.maxTokens,
+    });
+    return result.content;
+  }
+
+  /**
+   * Convenience: extract structured JSON from text.
+   */
+  async extractJson<T = unknown>(
+    text: string,
+    instruction: string,
+    opts?: { temperature?: number }
+  ): Promise<T> {
+    const result = await this.generate({
+      messages: [
+        {
+          role: 'system',
+          content:
+            'You are a JSON extraction assistant. Always respond with valid JSON only, no markdown, no explanation.',
+        },
+        {
+          role: 'user',
+          content: `${instruction}\n\nText:\n${text}`,
+        },
+      ],
+      temperature: opts?.temperature ?? 0.1,
+      maxTokens: 2048,
+    });
+
+    return JSON.parse(result.content) as T;
+  }
+
+  /**
+   * Convenience: classify text into categories.
+   */
+  async classify(text: string, categories: string[], opts?: { context?: string }): Promise<string> {
+    const categoryList = categories.map((c) => `"${c}"`).join(', ');
+    const result = await this.generate({
+      messages: [
+        {
+          role: 'system',
+          content: `Classify the text into exactly one of these categories: ${categoryList}. Respond with only the category name, nothing else.${opts?.context ? ` Context: ${opts.context}` : ''}`,
+        },
+        { role: 'user', content: text },
+      ],
+      temperature: 0,
+      maxTokens: 50,
+    });
+
+    const normalized = result.content.trim().replace(/^["']|["']$/g, '');
+    // Return the closest matching category
+    const match = categories.find((c) => c.toLowerCase() === normalized.toLowerCase());
+    return match ?? normalized;
+  }
+}
+
+/** Singleton instance for app-wide use */
+export const localLLM = new LocalLLMEngine();
diff --git a/packages/local-llm/src/index.ts b/packages/local-llm/src/index.ts
new file mode 100644
index 000000000..93092818f
--- /dev/null
+++ b/packages/local-llm/src/index.ts
@@ -0,0 +1,27 @@
+// Engine
+export { LocalLLMEngine, localLLM } from './engine';
+
+// Models
+export { MODELS, DEFAULT_MODEL } from './models';
+export type { ModelKey } from './models';
+
+// Types
+export type {
+  ChatMessage,
+  GenerateOptions,
+  GenerateResult,
+  ModelConfig,
+  LoadingStatus,
+} from './types';
+
+// Svelte 5 reactive helpers
+export {
+  getLocalLlmStatus,
+  loadLocalLlm,
+  unloadLocalLlm,
+  isLocalLlmSupported,
+  generate,
+  generateText,
+  extractJson,
+  classify,
+} from './svelte.svelte';
diff --git a/packages/local-llm/src/models.ts b/packages/local-llm/src/models.ts
new file mode 100644
index 000000000..812a1d8ec
--- /dev/null
+++ b/packages/local-llm/src/models.ts
@@ -0,0 +1,27 @@
+import type { ModelConfig } from './types';
+
+/**
+ * Pre-configured models for client-side inference.
+ * All models are quantized for browser use via WebLLM/MLC.
+ */
+
+export const MODELS = {
+  /** Default model — fast, good at structured output, multilingual */
+  'qwen-2.5-1.5b': {
+    modelId: 'Qwen2.5-1.5B-Instruct-q4f16_1-MLC',
+    displayName: 'Qwen 2.5 1.5B',
+    downloadSizeMb: 1000,
+    ramUsageMb: 1800,
+  },
+  /** Smaller variant for low-end devices */
+  'qwen-2.5-0.5b': {
+    modelId: 'Qwen2.5-0.5B-Instruct-q4f16_1-MLC',
+    displayName: 'Qwen 2.5 0.5B',
+    downloadSizeMb: 400,
+    ramUsageMb: 800,
+  },
+} as const satisfies Record<string, ModelConfig>;
+
+export type ModelKey = keyof typeof MODELS;
+
+export const DEFAULT_MODEL: ModelKey = 'qwen-2.5-1.5b';
diff --git a/packages/local-llm/src/svelte.svelte.ts b/packages/local-llm/src/svelte.svelte.ts
new file mode 100644
index 000000000..5cd4abe17
--- /dev/null
+++ b/packages/local-llm/src/svelte.svelte.ts
@@ -0,0 +1,87 @@
+/**
+ * Svelte 5 reactive integration for LocalLLMEngine.
+ *
+ * Usage in a Svelte component:
+ *   import { getLocalLlmStatus, loadLocalLlm, generateText } from '@manacore/local-llm';
+ *
+ *   const status = getLocalLlmStatus();
+ *   loadLocalLlm();
+ *   // use status.current reactively
+ */
+
+import { LocalLLMEngine, localLLM } from './engine';
+import type { LoadingStatus, GenerateOptions, GenerateResult } from './types';
+import type { ModelKey } from './models';
+
+/**
+ * Reactive status using the Svelte 5 $state rune.
+ */
+let _status = $state<LoadingStatus>({ state: 'idle' });
+
+localLLM.onStatusChange((s) => {
+  _status = s;
+});
+
+export function getLocalLlmStatus(): { readonly current: LoadingStatus } {
+  return {
+    get current() {
+      return _status;
+    },
+  };
+}
+
+/**
+ * Load the model. Safe to call multiple times.
+ */
+export async function loadLocalLlm(model?: ModelKey): Promise<void> {
+  return localLLM.load(model);
+}
+
+/**
+ * Unload the model and free memory.
+ */
+export async function unloadLocalLlm(): Promise<void> {
+  return localLLM.unload();
+}
+
+/**
+ * Check if WebGPU is available.
+ */
+export function isLocalLlmSupported(): boolean {
+  return LocalLLMEngine.isSupported();
+}
+
+/**
+ * Generate with full options (messages, streaming, etc.).
+ */
+export async function generate(options: GenerateOptions): Promise<GenerateResult> {
+  return localLLM.generate(options);
+}
+
+/**
+ * Quick text generation from a single prompt.
+ */
+export async function generateText(
+  prompt: string,
+  opts?: { systemPrompt?: string; temperature?: number; maxTokens?: number }
+): Promise<string> {
+  return localLLM.prompt(prompt, opts);
+}
+
+/**
+ * Extract structured JSON from text.
+ */
+export async function extractJson<T = unknown>(text: string, instruction: string): Promise<T> {
+  return localLLM.extractJson<T>(text, instruction);
+}
+
+/**
+ * Classify text into one of the given categories.
+ */
+export async function classify(
+  text: string,
+  categories: string[],
+  opts?: { context?: string }
+): Promise<string> {
+  return localLLM.classify(text, categories, opts);
+}
diff --git a/packages/local-llm/src/types.ts b/packages/local-llm/src/types.ts
new file mode 100644
index 000000000..f6edb1302
--- /dev/null
+++ b/packages/local-llm/src/types.ts
@@ -0,0 +1,54 @@
+/**
+ * Types for client-side LLM inference.
+ * Aligned with @manacore/shared-llm ChatMessage/ChatResult where possible.
+ */ + +export interface ChatMessage { + role: 'system' | 'user' | 'assistant'; + content: string; +} + +export interface GenerateOptions { + /** Messages to send */ + messages: ChatMessage[]; + /** Sampling temperature 0.0-2.0 (default: 0.7) */ + temperature?: number; + /** Max tokens to generate (default: 1024) */ + maxTokens?: number; + /** Callback for each generated token (streaming) */ + onToken?: (token: string) => void; +} + +export interface GenerateResult { + /** Generated text */ + content: string; + /** Token usage */ + usage: { + prompt_tokens: number; + completion_tokens: number; + total_tokens: number; + }; + /** Generation time in ms */ + latencyMs: number; +} + +export interface ModelConfig { + /** WebLLM model identifier */ + modelId: string; + /** Human-readable name */ + displayName: string; + /** Approximate download size in MB */ + downloadSizeMb: number; + /** Approximate VRAM/RAM usage in MB */ + ramUsageMb: number; + /** Default system prompt */ + defaultSystemPrompt?: string; +} + +export type LoadingStatus = + | { state: 'idle' } + | { state: 'checking' } + | { state: 'downloading'; progress: number; text: string } + | { state: 'loading'; text: string } + | { state: 'ready' } + | { state: 'error'; error: string }; diff --git a/packages/local-llm/tsconfig.json b/packages/local-llm/tsconfig.json new file mode 100644 index 000000000..897ca8cba --- /dev/null +++ b/packages/local-llm/tsconfig.json @@ -0,0 +1,14 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "bundler", + "lib": ["ES2022", "DOM"], + "strict": true, + "noEmit": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules"] +}
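
Usage sketch (illustrative, not part of the patch): the snippet below shows how app code might consume the exports added in src/index.ts. The summarizeNote and parseContact helpers, their prompt strings, and the Contact shape are hypothetical; the imports, option names, and behavior follow the code above.

import {
  isLocalLlmSupported,
  getLocalLlmStatus,
  loadLocalLlm,
  generateText,
  extractJson,
} from '@manacore/local-llm';

// Reactive status handle; in a Svelte 5 component, status.current updates
// as the engine moves through downloading -> loading -> ready.
const status = getLocalLlmStatus();

// Hypothetical helper: summarize a note fully client-side.
async function summarizeNote(note: string): Promise<string> {
  if (!isLocalLlmSupported()) {
    throw new Error('WebGPU unavailable; fall back to server-side inference');
  }
  await loadLocalLlm(); // idempotent; first call downloads and caches weights
  return generateText(`Summarize in one sentence:\n${note}`, {
    temperature: 0.3,
    maxTokens: 128,
  });
}

// Hypothetical helper: pull structured data out of free text.
interface Contact {
  name: string;
  email: string | null;
}

async function parseContact(text: string): Promise<Contact> {
  return extractJson<Contact>(text, 'Extract {"name": string, "email": string|null}.');
}

Because generate() auto-loads the model, the explicit loadLocalLlm() call is optional, but triggering it early lets the UI render download progress from status.current before the first request.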