Mirror of https://github.com/Memo-2023/mana-monorepo.git — synced 2026-05-14 22:41:09 +02:00
feat(local-llm): add client-side LLM inference package with WebLLM
New shared package for browser-based LLM inference using Qwen 2.5 1.5B via WebLLM. Includes Svelte 5 reactive stores, engine management, and type definitions for local AI features without server round trips.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit ef538245d1 (parent 4116715db0)
7 changed files with 501 additions and 0 deletions
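For orientation, a minimal consumer-side sketch of the API these files add. The import path and function names come from the diff below; the bootstrap wrapper itself is hypothetical app code, not part of the commit.

import { LocalLLMEngine, localLLM } from '@manacore/local-llm';

// Hypothetical app bootstrap; everything outside the import is illustrative.
async function bootstrapLocalAI(): Promise<void> {
  if (!LocalLLMEngine.isSupported()) return; // no WebGPU: skip local inference
  await localLLM.load(); // first call downloads the weights, then served from cache
  const reply = await localLLM.prompt('Say hello in one short sentence.');
  console.log(reply);
}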
26  packages/local-llm/package.json  Normal file
@@ -0,0 +1,26 @@
{
  "name": "@manacore/local-llm",
  "version": "0.1.0",
  "private": true,
  "description": "Client-side LLM inference via WebLLM (Qwen 2.5 1.5B) with Svelte 5 reactive stores",
  "main": "./src/index.ts",
  "types": "./src/index.ts",
  "exports": {
    ".": "./src/index.ts"
  },
  "scripts": {
    "type-check": "tsc --noEmit",
    "clean": "rm -rf dist"
  },
  "dependencies": {
    "@mlc-ai/web-llm": "^0.2.78"
  },
  "devDependencies": {
    "@types/node": "^24.10.1",
    "svelte": "^5.0.0",
    "typescript": "^5.9.3"
  },
  "peerDependencies": {
    "svelte": "^5.0.0"
  }
}
266  packages/local-llm/src/engine.ts  Normal file
@@ -0,0 +1,266 @@
/**
 * LocalLLMEngine — WebLLM wrapper for client-side inference.
 *
 * Lazy-loads the model on first use, caches weights in browser Cache API.
 * Provides both one-shot and streaming generation.
 */

import type { MLCEngine } from '@mlc-ai/web-llm';
import type {
  ChatMessage,
  GenerateOptions,
  GenerateResult,
  LoadingStatus,
  ModelConfig,
} from './types';
import { MODELS, DEFAULT_MODEL, type ModelKey } from './models';

export class LocalLLMEngine {
  private engine: MLCEngine | null = null;
  private loadPromise: Promise<void> | null = null;
  private currentModel: ModelKey | null = null;
  private _status: LoadingStatus = { state: 'idle' };
  private statusListeners: Set<(status: LoadingStatus) => void> = new Set();

  get status(): LoadingStatus {
    return this._status;
  }

  get isReady(): boolean {
    return this._status.state === 'ready';
  }

  get modelConfig(): ModelConfig | null {
    return this.currentModel ? MODELS[this.currentModel] : null;
  }

  /**
   * Subscribe to status changes (for non-Svelte usage).
   */
  onStatusChange(listener: (status: LoadingStatus) => void): () => void {
    this.statusListeners.add(listener);
    return () => this.statusListeners.delete(listener);
  }

  private setStatus(status: LoadingStatus) {
    this._status = status;
    for (const listener of this.statusListeners) {
      listener(status);
    }
  }

  /**
   * Check if WebGPU is available in this browser.
   */
  static isSupported(): boolean {
    return typeof navigator !== 'undefined' && 'gpu' in navigator;
  }

  /**
   * Load a model. Idempotent — returns immediately if already loaded.
   * Model weights are cached in browser Cache API for instant reload.
   */
  async load(model: ModelKey = DEFAULT_MODEL): Promise<void> {
    // Already loaded with this model
    if (this.engine && this.currentModel === model) return;

    // Already loading
    if (this.loadPromise && this.currentModel === model) return this.loadPromise;

    // Unload previous model if switching
    if (this.engine && this.currentModel !== model) {
      await this.unload();
    }

    this.currentModel = model;
    this.loadPromise = this._load(model);
    return this.loadPromise;
  }

  private async _load(model: ModelKey): Promise<void> {
    if (!LocalLLMEngine.isSupported()) {
      this.setStatus({ state: 'error', error: 'WebGPU not supported in this browser' });
      throw new Error('WebGPU not supported');
    }

    this.setStatus({ state: 'checking' });

    try {
      const { CreateMLCEngine } = await import('@mlc-ai/web-llm');
      const config = MODELS[model];

      this.engine = await CreateMLCEngine(config.modelId, {
        initProgressCallback: (report) => {
          if (report.progress < 1) {
            this.setStatus({
              state: 'downloading',
              progress: report.progress,
              text: report.text,
            });
          } else {
            this.setStatus({ state: 'loading', text: 'Initializing model...' });
          }
        },
      });

      this.setStatus({ state: 'ready' });
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      this.setStatus({ state: 'error', error: message });
      this.loadPromise = null;
      throw err;
    }
  }

  /**
   * Unload the model and free memory.
   */
  async unload(): Promise<void> {
    if (this.engine) {
      await this.engine.unload();
      this.engine = null;
    }
    this.currentModel = null;
    this.loadPromise = null;
    this.setStatus({ state: 'idle' });
  }

  /**
   * Generate a response. Auto-loads the model if not yet loaded.
   */
  async generate(options: GenerateOptions): Promise<GenerateResult> {
    if (!this.engine) {
      await this.load();
    }

    const start = performance.now();

    if (options.onToken) {
      return this._generateStreaming(options, start);
    }

    const response = await this.engine!.chat.completions.create({
      messages: options.messages,
      temperature: options.temperature ?? 0.7,
      max_tokens: options.maxTokens ?? 1024,
      stream: false,
    });

    const choice = response.choices[0];
    return {
      content: choice.message.content ?? '',
      usage: {
        prompt_tokens: response.usage?.prompt_tokens ?? 0,
        completion_tokens: response.usage?.completion_tokens ?? 0,
        total_tokens: response.usage?.total_tokens ?? 0,
      },
      latencyMs: Math.round(performance.now() - start),
    };
  }

  private async _generateStreaming(
    options: GenerateOptions,
    start: number
  ): Promise<GenerateResult> {
    const chunks: string[] = [];
    let usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };

    const stream = await this.engine!.chat.completions.create({
      messages: options.messages,
      temperature: options.temperature ?? 0.7,
      max_tokens: options.maxTokens ?? 1024,
      stream: true,
      stream_options: { include_usage: true },
    });

    for await (const chunk of stream) {
      const delta = chunk.choices[0]?.delta?.content;
      if (delta) {
        chunks.push(delta);
        options.onToken!(delta);
      }
      if (chunk.usage) {
        usage = {
          prompt_tokens: chunk.usage.prompt_tokens,
          completion_tokens: chunk.usage.completion_tokens,
          total_tokens: chunk.usage.total_tokens,
        };
      }
    }

    return {
      content: chunks.join(''),
      usage,
      latencyMs: Math.round(performance.now() - start),
    };
  }

  /**
   * Convenience: single prompt → response.
   */
  async prompt(
    text: string,
    opts?: { systemPrompt?: string; temperature?: number; maxTokens?: number }
  ): Promise<string> {
    const messages: ChatMessage[] = [];
    if (opts?.systemPrompt) {
      messages.push({ role: 'system', content: opts.systemPrompt });
    }
    messages.push({ role: 'user', content: text });

    const result = await this.generate({
      messages,
      temperature: opts?.temperature,
      maxTokens: opts?.maxTokens,
    });
    return result.content;
  }

  /**
   * Convenience: extract structured JSON from text.
   */
  async extractJson<T = unknown>(
    text: string,
    instruction: string,
    opts?: { temperature?: number }
  ): Promise<T> {
    const result = await this.generate({
      messages: [
        {
          role: 'system',
          content:
            'You are a JSON extraction assistant. Always respond with valid JSON only, no markdown, no explanation.',
        },
        {
          role: 'user',
          content: `${instruction}\n\nText:\n${text}`,
        },
      ],
      temperature: opts?.temperature ?? 0.1,
      maxTokens: 2048,
    });

    return JSON.parse(result.content) as T;
  }

  /**
   * Convenience: classify text into categories.
   */
  async classify(text: string, categories: string[], opts?: { context?: string }): Promise<string> {
    const categoryList = categories.map((c) => `"${c}"`).join(', ');
    const result = await this.generate({
      messages: [
        {
          role: 'system',
          content: `Classify the text into exactly one of these categories: ${categoryList}. Respond with only the category name, nothing else.${opts?.context ? ` Context: ${opts.context}` : ''}`,
        },
        { role: 'user', content: text },
      ],
      temperature: 0,
      maxTokens: 50,
    });

    const normalized = result.content.trim().replace(/^["']|["']$/g, '');
    // Return the closest matching category
    const match = categories.find((c) => c.toLowerCase() === normalized.toLowerCase());
    return match ?? normalized;
  }
}

/** Singleton instance for app-wide use */
export const localLLM = new LocalLLMEngine();
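A short sketch of the streaming and structured-output paths of the engine above. The prompt strings and the `Ticket` shape are illustrative, not part of the commit; only the imported names come from the diff.

import { localLLM } from '@manacore/local-llm';

// Streaming: supplying onToken routes generate() through _generateStreaming.
let streamed = '';
const result = await localLLM.generate({
  messages: [{ role: 'user', content: 'List three uses of WebGPU.' }],
  maxTokens: 256,
  onToken: (t) => { streamed += t; }, // e.g. mirror into UI state as tokens arrive
});
console.log(`${result.usage.total_tokens} tokens in ${result.latencyMs} ms`);

// Structured output: extractJson parses the model's reply with JSON.parse,
// which throws on malformed output — callers may want a try/catch.
// The Ticket interface is hypothetical.
interface Ticket { title: string; priority: 'low' | 'high' }
const ticket = await localLLM.extractJson<Ticket>(
  'Printer on floor 3 is jammed again and blocking the whole team.',
  'Extract a ticket with fields "title" and "priority" ("low" or "high").'
);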
27  packages/local-llm/src/index.ts  Normal file
@@ -0,0 +1,27 @@
// Engine
export { LocalLLMEngine, localLLM } from './engine';

// Models
export { MODELS, DEFAULT_MODEL } from './models';
export type { ModelKey } from './models';

// Types
export type {
  ChatMessage,
  GenerateOptions,
  GenerateResult,
  ModelConfig,
  LoadingStatus,
} from './types';

// Svelte 5 reactive helpers
export {
  getLocalLlmStatus,
  loadLocalLlm,
  unloadLocalLlm,
  isLocalLlmSupported,
  generate,
  generateText,
  extractJson,
  classify,
} from './svelte.svelte';
27  packages/local-llm/src/models.ts  Normal file
@@ -0,0 +1,27 @@
import type { ModelConfig } from './types';

/**
 * Pre-configured models for client-side inference.
 * All models are quantized for browser use via WebLLM/MLC.
 */

export const MODELS = {
  /** Default model — fast, good at structured output, multilingual */
  'qwen-2.5-1.5b': {
    modelId: 'Qwen2.5-1.5B-Instruct-q4f16_1-MLC',
    displayName: 'Qwen 2.5 1.5B',
    downloadSizeMb: 1000,
    ramUsageMb: 1800,
  },
  /** Smaller variant for low-end devices */
  'qwen-2.5-0.5b': {
    modelId: 'Qwen2.5-0.5B-Instruct-q4f16_1-MLC',
    displayName: 'Qwen 2.5 0.5B',
    downloadSizeMb: 400,
    ramUsageMb: 800,
  },
} as const satisfies Record<string, ModelConfig>;

export type ModelKey = keyof typeof MODELS;

export const DEFAULT_MODEL: ModelKey = 'qwen-2.5-1.5b';
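One way the two entries might drive runtime model selection — a sketch under stated assumptions: the `navigator.deviceMemory` heuristic and the 4 GiB threshold are my own, not anything this commit prescribes, and that API is unavailable in some browsers.

import { MODELS, type ModelKey } from '@manacore/local-llm';

// Hypothetical heuristic: fall back to the 0.5B variant on low-memory devices.
// navigator.deviceMemory reports approximate GiB; undefined where unsupported.
function pickModel(): ModelKey {
  const gib = (navigator as { deviceMemory?: number }).deviceMemory ?? 8;
  return gib >= 4 ? 'qwen-2.5-1.5b' : 'qwen-2.5-0.5b';
}

const choice = pickModel();
console.log(`${MODELS[choice].displayName}: ~${MODELS[choice].downloadSizeMb} MB download`);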
87  packages/local-llm/src/svelte.svelte.ts  Normal file
@@ -0,0 +1,87 @@
/**
 * Svelte 5 reactive integration for LocalLLMEngine.
 *
 * Usage in a Svelte component:
 *   import { getLocalLlmStatus, loadLocalLlm, generateText } from '@manacore/local-llm';
 *
 *   const status = getLocalLlmStatus();
 *   loadLocalLlm();
 *   // use status.current reactively
 */

import { LocalLLMEngine, localLLM } from './engine';
import type { LoadingStatus, GenerateOptions, GenerateResult } from './types';
import type { ModelKey } from './models';

/**
 * Reactive status using Svelte 5 $state rune.
 */
let _status = $state<LoadingStatus>({ state: 'idle' });

localLLM.onStatusChange((s) => {
  _status = s;
});

export function getLocalLlmStatus(): { readonly current: LoadingStatus } {
  return {
    get current() {
      return _status;
    },
  };
}

/**
 * Load the model. Safe to call multiple times.
 */
export async function loadLocalLlm(model?: ModelKey): Promise<void> {
  return localLLM.load(model);
}

/**
 * Unload the model and free memory.
 */
export async function unloadLocalLlm(): Promise<void> {
  return localLLM.unload();
}

/**
 * Check if WebGPU is available.
 */
export function isLocalLlmSupported(): boolean {
  return LocalLLMEngine.isSupported();
}

/**
 * Generate with full options (messages, streaming, etc.)
 */
export async function generate(options: GenerateOptions): Promise<GenerateResult> {
  return localLLM.generate(options);
}

/**
 * Quick text generation from a single prompt.
 */
export async function generateText(
  prompt: string,
  opts?: { systemPrompt?: string; temperature?: number; maxTokens?: number }
): Promise<string> {
  return localLLM.prompt(prompt, opts);
}

/**
 * Extract structured JSON from text.
 */
export async function extractJson<T = unknown>(text: string, instruction: string): Promise<T> {
  return localLLM.extractJson<T>(text, instruction);
}

/**
 * Classify text into one of the given categories.
 */
export async function classify(
  text: string,
  categories: string[],
  opts?: { context?: string }
): Promise<string> {
  return localLLM.classify(text, categories, opts);
}
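Putting the helpers together in component-side code — a minimal sketch, assuming it lives in a `.svelte.ts` module compiled by Svelte 5 so reads of `llmStatus.current` stay reactive; the `askQuestion` wrapper and its prompts are hypothetical.

import { getLocalLlmStatus, loadLocalLlm, generateText } from '@manacore/local-llm';

const llmStatus = getLocalLlmStatus();

// Kick off loading eagerly; failures also surface through llmStatus.current.
loadLocalLlm().catch(() => { /* reflected in status as { state: 'error' } */ });

export async function askQuestion(q: string): Promise<string> {
  if (llmStatus.current.state === 'error') {
    throw new Error(llmStatus.current.error);
  }
  // generate() auto-loads if the model is not ready yet.
  return generateText(q, { systemPrompt: 'Answer briefly.', maxTokens: 512 });
}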
54  packages/local-llm/src/types.ts  Normal file
@@ -0,0 +1,54 @@
/**
 * Types for client-side LLM inference.
 * Aligned with @manacore/shared-llm ChatMessage/ChatResult where possible.
 */

export interface ChatMessage {
  role: 'system' | 'user' | 'assistant';
  content: string;
}

export interface GenerateOptions {
  /** Messages to send */
  messages: ChatMessage[];
  /** Sampling temperature 0.0-2.0 (default: 0.7) */
  temperature?: number;
  /** Max tokens to generate (default: 1024) */
  maxTokens?: number;
  /** Callback for each generated token (streaming) */
  onToken?: (token: string) => void;
}

export interface GenerateResult {
  /** Generated text */
  content: string;
  /** Token usage */
  usage: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
  };
  /** Generation time in ms */
  latencyMs: number;
}

export interface ModelConfig {
  /** WebLLM model identifier */
  modelId: string;
  /** Human-readable name */
  displayName: string;
  /** Approximate download size in MB */
  downloadSizeMb: number;
  /** Approximate VRAM/RAM usage in MB */
  ramUsageMb: number;
  /** Default system prompt */
  defaultSystemPrompt?: string;
}

export type LoadingStatus =
  | { state: 'idle' }
  | { state: 'checking' }
  | { state: 'downloading'; progress: number; text: string }
  | { state: 'loading'; text: string }
  | { state: 'ready' }
  | { state: 'error'; error: string };
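The `LoadingStatus` discriminated union lets UI code switch exhaustively on `state`, with each branch narrowing to that variant's extra fields. A small sketch; the label strings are illustrative.

import type { LoadingStatus } from '@manacore/local-llm';

// Each case narrows s, so s.progress, s.text, and s.error type-check
// only in the branches where they exist.
function statusLabel(s: LoadingStatus): string {
  switch (s.state) {
    case 'idle':        return 'Not loaded';
    case 'checking':    return 'Checking WebGPU support…';
    case 'downloading': return `Downloading ${Math.round(s.progress * 100)}%`;
    case 'loading':     return s.text;
    case 'ready':       return 'Ready';
    case 'error':       return `Error: ${s.error}`;
  }
}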
14  packages/local-llm/tsconfig.json  Normal file
@@ -0,0 +1,14 @@
{
  "compilerOptions": {
    "target": "ES2022",
    "module": "ESNext",
    "moduleResolution": "bundler",
    "lib": ["ES2022", "DOM"],
    "strict": true,
    "noEmit": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules"]
}