feat(local-llm): add client-side LLM inference package with WebLLM

New shared package for browser-based LLM inference using Qwen 2.5 1.5B
via WebLLM. Includes Svelte 5 reactive stores, engine management, and
type definitions for local AI features without server roundtrips.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Till JS 2026-04-02 01:53:54 +02:00
parent 4116715db0
commit ef538245d1
7 changed files with 501 additions and 0 deletions

package.json
@@ -0,0 +1,26 @@
{
"name": "@manacore/local-llm",
"version": "0.1.0",
"private": true,
"description": "Client-side LLM inference via WebLLM (Qwen 2.5 1.5B) with Svelte 5 reactive stores",
"main": "./src/index.ts",
"types": "./src/index.ts",
"exports": {
".": "./src/index.ts"
},
"scripts": {
"type-check": "tsc --noEmit",
"clean": "rm -rf dist"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.78"
},
"devDependencies": {
"@types/node": "^24.10.1",
"svelte": "^5.0.0",
"typescript": "^5.9.3"
},
"peerDependencies": {
"svelte": "^5.0.0"
}
}

src/engine.ts
@@ -0,0 +1,266 @@
/**
 * LocalLLMEngine: WebLLM wrapper for client-side inference.
 *
 * Lazy-loads the model on first use and caches weights in the browser Cache API.
* Provides both one-shot and streaming generation.
*/
import type { MLCEngine } from '@mlc-ai/web-llm';
import type { ChatMessage, GenerateOptions, GenerateResult, LoadingStatus } from './types';
import type { ModelConfig } from './types';
import { MODELS, DEFAULT_MODEL, type ModelKey } from './models';
export class LocalLLMEngine {
private engine: MLCEngine | null = null;
private loadPromise: Promise<void> | null = null;
private currentModel: ModelKey | null = null;
private _status: LoadingStatus = { state: 'idle' };
private statusListeners: Set<(status: LoadingStatus) => void> = new Set();
get status(): LoadingStatus {
return this._status;
}
get isReady(): boolean {
return this._status.state === 'ready';
}
get modelConfig(): ModelConfig | null {
return this.currentModel ? MODELS[this.currentModel] : null;
}
/**
* Subscribe to status changes (for non-Svelte usage).
*/
onStatusChange(listener: (status: LoadingStatus) => void): () => void {
this.statusListeners.add(listener);
return () => this.statusListeners.delete(listener);
}
private setStatus(status: LoadingStatus) {
this._status = status;
for (const listener of this.statusListeners) {
listener(status);
}
}
/**
* Check if WebGPU is available in this browser.
*/
static isSupported(): boolean {
return typeof navigator !== 'undefined' && 'gpu' in navigator;
}
/**
 * Load a model. Idempotent: returns immediately if already loaded.
 * Model weights are cached in the browser Cache API for instant reload.
*/
async load(model: ModelKey = DEFAULT_MODEL): Promise<void> {
// Already loaded with this model
if (this.engine && this.currentModel === model) return;
// Already loading
if (this.loadPromise && this.currentModel === model) return this.loadPromise;
// Unload previous model if switching
if (this.engine && this.currentModel !== model) {
await this.unload();
}
this.currentModel = model;
this.loadPromise = this._load(model);
return this.loadPromise;
}
private async _load(model: ModelKey): Promise<void> {
if (!LocalLLMEngine.isSupported()) {
this.setStatus({ state: 'error', error: 'WebGPU not supported in this browser' });
throw new Error('WebGPU not supported');
}
this.setStatus({ state: 'checking' });
try {
const { CreateMLCEngine } = await import('@mlc-ai/web-llm');
const config = MODELS[model];
this.engine = await CreateMLCEngine(config.modelId, {
initProgressCallback: (report) => {
if (report.progress < 1) {
this.setStatus({
state: 'downloading',
progress: report.progress,
text: report.text,
});
} else {
this.setStatus({ state: 'loading', text: 'Initializing model...' });
}
},
});
this.setStatus({ state: 'ready' });
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
this.setStatus({ state: 'error', error: message });
this.loadPromise = null;
throw err;
}
}
/**
* Unload the model and free memory.
*/
async unload(): Promise<void> {
if (this.engine) {
await this.engine.unload();
this.engine = null;
}
this.currentModel = null;
this.loadPromise = null;
this.setStatus({ state: 'idle' });
}
/**
* Generate a response. Auto-loads the model if not yet loaded.
*/
async generate(options: GenerateOptions): Promise<GenerateResult> {
if (!this.engine) {
await this.load();
}
const start = performance.now();
if (options.onToken) {
return this._generateStreaming(options, start);
}
const response = await this.engine!.chat.completions.create({
messages: options.messages,
temperature: options.temperature ?? 0.7,
max_tokens: options.maxTokens ?? 1024,
stream: false,
});
const choice = response.choices[0];
return {
content: choice.message.content ?? '',
usage: {
prompt_tokens: response.usage?.prompt_tokens ?? 0,
completion_tokens: response.usage?.completion_tokens ?? 0,
total_tokens: response.usage?.total_tokens ?? 0,
},
latencyMs: Math.round(performance.now() - start),
};
}
private async _generateStreaming(
options: GenerateOptions,
start: number
): Promise<GenerateResult> {
const chunks: string[] = [];
let usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
const stream = await this.engine!.chat.completions.create({
messages: options.messages,
temperature: options.temperature ?? 0.7,
max_tokens: options.maxTokens ?? 1024,
stream: true,
stream_options: { include_usage: true },
});
for await (const chunk of stream) {
const delta = chunk.choices[0]?.delta?.content;
if (delta) {
chunks.push(delta);
options.onToken!(delta);
}
if (chunk.usage) {
usage = {
prompt_tokens: chunk.usage.prompt_tokens,
completion_tokens: chunk.usage.completion_tokens,
total_tokens: chunk.usage.total_tokens,
};
}
}
return {
content: chunks.join(''),
usage,
latencyMs: Math.round(performance.now() - start),
};
}
/**
 * Convenience: generate a text response to a single prompt.
*/
async prompt(
text: string,
opts?: { systemPrompt?: string; temperature?: number; maxTokens?: number }
): Promise<string> {
const messages: ChatMessage[] = [];
if (opts?.systemPrompt) {
messages.push({ role: 'system', content: opts.systemPrompt });
}
messages.push({ role: 'user', content: text });
const result = await this.generate({
messages,
temperature: opts?.temperature,
maxTokens: opts?.maxTokens,
});
return result.content;
}
/**
* Convenience: extract structured JSON from text.
*/
async extractJson<T = unknown>(
text: string,
instruction: string,
opts?: { temperature?: number }
): Promise<T> {
const result = await this.generate({
messages: [
{
role: 'system',
content:
'You are a JSON extraction assistant. Always respond with valid JSON only, no markdown, no explanation.',
},
{
role: 'user',
content: `${instruction}\n\nText:\n${text}`,
},
],
temperature: opts?.temperature ?? 0.1,
maxTokens: 2048,
});
return JSON.parse(result.content) as T;
}
/**
* Convenience: classify text into categories.
*/
async classify(text: string, categories: string[], opts?: { context?: string }): Promise<string> {
const categoryList = categories.map((c) => `"${c}"`).join(', ');
const result = await this.generate({
messages: [
{
role: 'system',
content: `Classify the text into exactly one of these categories: ${categoryList}. Respond with only the category name, nothing else.${opts?.context ? ` Context: ${opts.context}` : ''}`,
},
{ role: 'user', content: text },
],
temperature: 0,
maxTokens: 50,
});
const normalized = result.content.trim().replace(/^["']|["']$/g, '');
// Return the exact (case-insensitive) category match, or the raw model output if none matches
const match = categories.find((c) => c.toLowerCase() === normalized.toLowerCase());
return match ?? normalized;
}
}
/** Singleton instance for app-wide use */
export const localLLM = new LocalLLMEngine();
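
The engine can also be used outside Svelte. A minimal sketch of direct usage, assuming a hypothetical `summarize` helper in app code (the function name, prompt text, and parameter values are illustrative, not part of this package):

import { localLLM, LocalLLMEngine } from '@manacore/local-llm';

// Hypothetical helper: summarize arbitrary text with the local model.
export async function summarize(text: string): Promise<string | null> {
  // Degrade gracefully on browsers without WebGPU instead of throwing.
  if (!LocalLLMEngine.isSupported()) return null;

  // Observe download/initialization progress, e.g. to drive a progress bar.
  const unsubscribe = localLLM.onStatusChange((status) => {
    if (status.state === 'downloading') {
      console.debug(`model download: ${Math.round(status.progress * 100)}%`);
    }
  });

  try {
    // prompt() auto-loads the default model on first call; weights are served
    // from the Cache API on subsequent page loads.
    return await localLLM.prompt(`Summarize in two sentences:\n\n${text}`, {
      temperature: 0.3,
      maxTokens: 256,
    });
  } finally {
    unsubscribe();
  }
}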

src/index.ts
@@ -0,0 +1,27 @@
// Engine
export { LocalLLMEngine, localLLM } from './engine';
// Models
export { MODELS, DEFAULT_MODEL } from './models';
export type { ModelKey } from './models';
// Types
export type {
ChatMessage,
GenerateOptions,
GenerateResult,
ModelConfig,
LoadingStatus,
} from './types';
// Svelte 5 reactive helpers
export {
getLocalLlmStatus,
loadLocalLlm,
unloadLocalLlm,
isLocalLlmSupported,
generate,
generateText,
extractJson,
classify,
} from './svelte.svelte';

src/models.ts
@@ -0,0 +1,27 @@
import type { ModelConfig } from './types';
/**
* Pre-configured models for client-side inference.
* All models are quantized for browser use via WebLLM/MLC.
*/
export const MODELS = {
/** Default model — fast, good at structured output, multilingual */
'qwen-2.5-1.5b': {
modelId: 'Qwen2.5-1.5B-Instruct-q4f16_1-MLC',
displayName: 'Qwen 2.5 1.5B',
downloadSizeMb: 1000,
ramUsageMb: 1800,
},
/** Smaller variant for low-end devices */
'qwen-2.5-0.5b': {
modelId: 'Qwen2.5-0.5B-Instruct-q4f16_1-MLC',
displayName: 'Qwen 2.5 0.5B',
downloadSizeMb: 400,
ramUsageMb: 800,
},
} as const satisfies Record<string, ModelConfig>;
export type ModelKey = keyof typeof MODELS;
export const DEFAULT_MODEL: ModelKey = 'qwen-2.5-1.5b';
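
Callers on memory-constrained devices can opt into the 0.5B variant. A minimal sketch, assuming a heuristic based on the non-standard `navigator.deviceMemory` hint (Chromium-only; the 4 GB threshold is an illustrative choice, not part of this package):

import { loadLocalLlm, type ModelKey } from '@manacore/local-llm';

// navigator.deviceMemory is a non-standard hint, so read it defensively and
// fall back to the default 1.5B model when it is unavailable.
const deviceMemoryGb = (navigator as { deviceMemory?: number }).deviceMemory ?? 8;
const model: ModelKey = deviceMemoryGb < 4 ? 'qwen-2.5-0.5b' : 'qwen-2.5-1.5b';

await loadLocalLlm(model);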

src/svelte.svelte.ts
@@ -0,0 +1,87 @@
/**
* Svelte 5 reactive integration for LocalLLMEngine.
*
* Usage in a Svelte component:
* import { getLocalLlmStatus, loadLocalLlm, generateText } from '@manacore/local-llm';
*
* const status = getLocalLlmStatus();
* loadLocalLlm();
* // use status.current reactively
*/
import { LocalLLMEngine, localLLM } from './engine';
import type { LoadingStatus, GenerateOptions, GenerateResult } from './types';
import type { ModelKey } from './models';
/**
* Reactive status using Svelte 5 $state rune.
*/
let _status = $state<LoadingStatus>({ state: 'idle' });
localLLM.onStatusChange((s) => {
_status = s;
});
export function getLocalLlmStatus(): { readonly current: LoadingStatus } {
return {
get current() {
return _status;
},
};
}
/**
* Load the model. Safe to call multiple times.
*/
export async function loadLocalLlm(model?: ModelKey): Promise<void> {
return localLLM.load(model);
}
/**
* Unload the model and free memory.
*/
export async function unloadLocalLlm(): Promise<void> {
return localLLM.unload();
}
/**
* Check if WebGPU is available.
*/
export function isLocalLlmSupported(): boolean {
return LocalLLMEngine.isSupported();
}
/**
 * Generate with full options (messages, streaming, etc.).
*/
export async function generate(options: GenerateOptions): Promise<GenerateResult> {
return localLLM.generate(options);
}
/**
* Quick text generation from a single prompt.
*/
export async function generateText(
prompt: string,
opts?: { systemPrompt?: string; temperature?: number; maxTokens?: number }
): Promise<string> {
return localLLM.prompt(prompt, opts);
}
/**
* Extract structured JSON from text.
*/
export async function extractJson<T = unknown>(text: string, instruction: string): Promise<T> {
return localLLM.extractJson<T>(text, instruction);
}
/**
* Classify text into one of the given categories.
*/
export async function classify(
text: string,
categories: string[],
opts?: { context?: string }
): Promise<string> {
return localLLM.classify(text, categories, opts);
}
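
A minimal Svelte 5 component sketch using these helpers (hypothetical component; the prompt text is illustrative, and runes mode is assumed):

<script lang="ts">
  import {
    getLocalLlmStatus,
    isLocalLlmSupported,
    loadLocalLlm,
    generateText,
  } from '@manacore/local-llm';

  const status = getLocalLlmStatus();
  let answer = $state('');

  // Download progress as a percentage, or null when not downloading.
  const progressPct = $derived(
    status.current.state === 'downloading'
      ? Math.round(status.current.progress * 100)
      : null
  );

  async function ask() {
    await loadLocalLlm(); // idempotent; resolves immediately once loaded
    answer = await generateText('Explain WebGPU in one sentence.', { maxTokens: 120 });
  }
</script>

{#if !isLocalLlmSupported()}
  <p>This browser does not support WebGPU.</p>
{:else}
  <button onclick={ask}>Ask the local model</button>
  {#if progressPct !== null}
    <p>Downloading model: {progressPct}%</p>
  {/if}
  <p>{answer}</p>
{/if}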

src/types.ts
@@ -0,0 +1,54 @@
/**
* Types for client-side LLM inference.
* Aligned with @manacore/shared-llm ChatMessage/ChatResult where possible.
*/
export interface ChatMessage {
role: 'system' | 'user' | 'assistant';
content: string;
}
export interface GenerateOptions {
/** Messages to send */
messages: ChatMessage[];
/** Sampling temperature 0.0-2.0 (default: 0.7) */
temperature?: number;
/** Max tokens to generate (default: 1024) */
maxTokens?: number;
/** Callback for each generated token (streaming) */
onToken?: (token: string) => void;
}
export interface GenerateResult {
/** Generated text */
content: string;
/** Token usage */
usage: {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
};
/** Generation time in ms */
latencyMs: number;
}
export interface ModelConfig {
/** WebLLM model identifier */
modelId: string;
/** Human-readable name */
displayName: string;
/** Approximate download size in MB */
downloadSizeMb: number;
/** Approximate VRAM/RAM usage in MB */
ramUsageMb: number;
/** Default system prompt */
defaultSystemPrompt?: string;
}
export type LoadingStatus =
| { state: 'idle' }
| { state: 'checking' }
| { state: 'downloading'; progress: number; text: string }
| { state: 'loading'; text: string }
| { state: 'ready' }
| { state: 'error'; error: string };
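
UI code can switch exhaustively on the `LoadingStatus` union; a sketch of a hypothetical label helper (not part of this package):

import type { LoadingStatus } from '@manacore/local-llm';

// Map each LoadingStatus variant to a short label for display.
export function statusLabel(status: LoadingStatus): string {
  switch (status.state) {
    case 'idle':
      return 'Not loaded';
    case 'checking':
      return 'Checking WebGPU support';
    case 'downloading':
      return `Downloading model (${Math.round(status.progress * 100)}%)`;
    case 'loading':
      return status.text;
    case 'ready':
      return 'Ready';
    case 'error':
      return `Error: ${status.error}`;
  }
}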

tsconfig.json
@@ -0,0 +1,14 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ESNext",
"moduleResolution": "bundler",
"lib": ["ES2022", "DOM"],
"strict": true,
"noEmit": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true
},
"include": ["src/**/*"],
"exclude": ["node_modules"]
}