mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 19:01:08 +02:00
feat(local-llm): swap WebLLM/Qwen for transformers.js + Gemma 4 E2B
Replace the entire @mana/local-llm engine with a transformers.js-based
implementation backed by Google's Gemma 4 E2B (released 2026-04-02).
The external API of LocalLLMEngine — load(), generate(), prompt(),
extractJson(), classify(), onStatusChange(), isSupported() — is
preserved 1:1, so the /llm-test page, the playground module, and the
Svelte 5 reactive bindings in svelte.svelte.ts need no changes
beyond updating the default model key.
Why the engine swap: MLC has not (and as of today still hasn't)
published Gemma 4 builds for WebLLM. The webml-community team and
HuggingFace's onnx-community already have Gemma 4 E2B running in
the browser via transformers.js + WebGPU, with a documented
Gemma4ForConditionalGeneration class shipped in @huggingface/transformers
v4.0.0. Going through the ONNX route gets us the latest Google model
six days after release instead of waiting on MLC compilation.
Trade-offs accepted (discussed before this commit):
- transformers.js is a more generic ONNX runtime, so per-token
throughput will be ~20-40% lower than WebLLM would deliver for the
same model size. For a 2B model on a modern WebGPU device that's
still well above interactive latency.
- The JS bundle gains ~2-3 MB (the ONNX runtime). Negligible compared
to the 500 MB model download.
- transformers.js v4 is brand new (released alongside Gemma 4) so the
Gemma4ForConditionalGeneration code path has very little battle
testing yet. The risk is partially offset by webml-community's
reference implementation.
What changed file by file:
- packages/local-llm/package.json: drop @mlc-ai/web-llm, add
@huggingface/transformers ^4.0.0; bump version 0.1.0 → 0.2.0; rewrite
description.
- packages/local-llm/src/types.ts: add `dtype` field to ModelConfig
('fp32' | 'fp16' | 'q8' | 'q4' | 'q4f16') so each model can request
the quantization that matches its uploaded ONNX shards.
- packages/local-llm/src/models.ts: replace the old Qwen 2.5 + Gemma 2
registry with a single `gemma-4-e2b` entry pointing at
onnx-community/gemma-4-E2B-it-ONNX with q4f16 quantization. Future
models can be added by appending entries — the /llm-test picker
reads MODELS dynamically and picks them up automatically.
- packages/local-llm/src/cache.ts: replace the WebLLM-specific
hasModelInCache helper with a generic Cache API probe that looks for
`https://huggingface.co/{model_id}/resolve/main/tokenizer.json` in
any open cache. tokenizer.json is small, downloaded first, and
always present, so its presence is a reliable proxy for "model has
been loaded before".
- packages/local-llm/src/engine.ts: full rewrite. Internally we now
hold a transformers.js model + processor pair (created via
AutoProcessor.from_pretrained + Gemma4ForConditionalGeneration.from_pretrained
with `device: 'webgpu'`), and translate our LoadingStatus union from
the library's `progress_callback` shape. generate() applies Gemma's
chat template via the processor, runs model.generate() with optional
TextStreamer for streaming, then slices the prompt tokens off the
output tensor to compute per-call usage. The convenience methods
(prompt, extractJson, classify) are unchanged because they only call
generate() under the hood.
- packages/local-llm/src/generate.ts and status.svelte.ts: deleted.
These were orphaned from a much earlier engine API (referenced
`getEngine()` / `subscribe()` / `LlmState` symbols that haven't
existed for a while) and were never re-exported from index.ts —
they only showed up because `tsc --noEmit` was crawling the src
tree. Their functionality lives in engine.ts + svelte.svelte.ts now.
- apps/mana/apps/web/package.json: swap the direct dep from
@mlc-ai/web-llm to @huggingface/transformers. This is the same
trick we used for the previous adapter-node externals warning —
having it as a direct dep makes adapter-node's Rollup pass treat
it as external automatically.
- apps/mana/apps/web/vite.config.ts: swap ssr.external entry from
@mlc-ai/web-llm to @huggingface/transformers. Add a comment
explaining the why so the next person doesn't wonder.
- apps/mana/apps/web/src/routes/(app)/llm-test/+page.svelte: change
the default selectedModel from 'qwen-2.5-1.5b' to 'gemma-4-e2b'.
All other model display strings come from the MODELS registry, so
this is the single hard-coded reference that needed updating.
- pnpm-lock.yaml: regenerated. Confirmed @mlc-ai/web-llm is gone (0
references) and @huggingface/transformers is in (4 references).
CSP: no header changes needed. We already opened connect-src for
huggingface.co + cdn-lfs.huggingface.co + raw.githubusercontent.com
when fixing the WebLLM blockers earlier today, and 'wasm-unsafe-eval'
is already in script-src — both transformers.js (ONNX runtime) and
WebLLM (MLC runtime) need that. If transformers.js spawns its
inference into a Web Worker via a blob URL we may need to add
`worker-src 'self' blob:` once we hit the first runtime test, but
the existing CSP should be enough for the synchronous path.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
83828e5a44
commit
1f26aa4f2f
11 changed files with 378 additions and 269 deletions
|
|
@@ -1,8 +1,8 @@
|
|||
{
|
||||
"name": "@mana/local-llm",
|
||||
"version": "0.1.0",
|
||||
"version": "0.2.0",
|
||||
"private": true,
|
||||
"description": "Client-side LLM inference via WebLLM (Qwen 2.5 1.5B) with Svelte 5 reactive stores",
|
||||
"description": "Client-side LLM inference via transformers.js (Gemma 4 E2B, WebGPU) with Svelte 5 reactive stores",
|
||||
"main": "./src/index.ts",
|
||||
"types": "./src/index.ts",
|
||||
"exports": {
|
||||
|
|
@@ -13,7 +13,7 @@
|
|||
"clean": "rm -rf dist"
|
||||
},
|
||||
"dependencies": {
|
||||
"@mlc-ai/web-llm": "^0.2.78"
|
||||
"@huggingface/transformers": "^4.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^24.10.1",
|
||||
|
|
|
|||
|
|
@@ -1,12 +1,23 @@
|
|||
/**
|
||||
* Check if a model is cached in the browser's Cache API.
|
||||
* Wraps @mlc-ai/web-llm's hasModelInCache with a dynamic import
|
||||
* so it doesn't break SSR/Docker builds.
|
||||
* Check if a transformers.js model is already cached in the browser.
|
||||
*
|
||||
* transformers.js stores HuggingFace shards in the standard Cache API under a
|
||||
* named cache (default "transformers-cache"). We probe for the model's
|
||||
* tokenizer.json — it's tiny (~few KB), always present, and downloaded
|
||||
* first, so its presence is a reliable proxy for "this model has been
|
||||
* loaded at least once before".
|
||||
*/
|
||||
export async function hasModelInCache(modelId: string): Promise<boolean> {
|
||||
if (typeof caches === 'undefined') return false;
|
||||
try {
|
||||
const { hasModelInCache: check } = await import('@mlc-ai/web-llm');
|
||||
return await check(modelId);
|
||||
const cacheNames = await caches.keys();
|
||||
const url = `https://huggingface.co/${modelId}/resolve/main/tokenizer.json`;
|
||||
for (const name of cacheNames) {
|
||||
const cache = await caches.open(name);
|
||||
const match = await cache.match(url);
|
||||
if (match) return true;
|
||||
}
|
||||
return false;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -1,17 +1,34 @@
|
|||
/**
|
||||
* LocalLLMEngine — WebLLM wrapper for client-side inference.
|
||||
* LocalLLMEngine — transformers.js wrapper for client-side inference.
|
||||
*
|
||||
* Lazy-loads the model on first use, caches weights in browser Cache API.
|
||||
* Provides both one-shot and streaming generation.
|
||||
* Lazy-loads a HuggingFace ONNX model on first use, caches weights in the
|
||||
* browser's Cache API, and runs inference on the WebGPU backend.
|
||||
*
|
||||
* The default model is Google's Gemma 4 E2B (`onnx-community/gemma-4-E2B-it-ONNX`,
|
||||
* q4f16). The external API of this class is intentionally identical to the
|
||||
* previous WebLLM implementation so callers (Svelte stores, /llm-test page,
|
||||
* playground module) need no changes when the underlying engine swaps.
|
||||
*/
|
||||
|
||||
import type { MLCEngine } from '@mlc-ai/web-llm';
|
||||
import type { ChatMessage, GenerateOptions, GenerateResult, LoadingStatus } from './types';
|
||||
import type { ModelConfig } from './types';
|
||||
import { MODELS, DEFAULT_MODEL, type ModelKey } from './models';
|
||||
|
||||
// transformers.js types are minimal here on purpose. The library does not
|
||||
// publish first-class TS types for every model class, and we never expose
|
||||
// these objects past this file — the public surface (LocalLLMEngine methods)
|
||||
// is fully typed via our own GenerateResult / LoadingStatus etc.
|
||||
type TransformersModule = typeof import('@huggingface/transformers');
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
type AnyModel = any;
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
type AnyProcessor = any;
|
||||
|
||||
export class LocalLLMEngine {
|
||||
private engine: MLCEngine | null = null;
|
||||
private model: AnyModel = null;
|
||||
private processor: AnyProcessor = null;
|
||||
private transformers: TransformersModule | null = null;
|
||||
private loadPromise: Promise<void> | null = null;
|
||||
private currentModel: ModelKey | null = null;
|
||||
private _status: LoadingStatus = { state: 'idle' };
|
||||
|
|
@@ -53,17 +70,17 @@ export class LocalLLMEngine {
|
|||
|
||||
/**
|
||||
* Load a model. Idempotent — returns immediately if already loaded.
|
||||
* Model weights are cached in browser Cache API for instant reload.
|
||||
* Model weights are cached in the browser Cache API for instant reload.
|
||||
*/
|
||||
async load(model: ModelKey = DEFAULT_MODEL): Promise<void> {
|
||||
// Already loaded with this model
|
||||
if (this.engine && this.currentModel === model) return;
|
||||
if (this.model && this.currentModel === model) return;
|
||||
|
||||
// Already loading
|
||||
if (this.loadPromise && this.currentModel === model) return this.loadPromise;
|
||||
|
||||
// Unload previous model if switching
|
||||
if (this.engine && this.currentModel !== model) {
|
||||
if (this.model && this.currentModel !== model) {
|
||||
await this.unload();
|
||||
}
|
||||
|
||||
|
|
@@ -81,21 +98,60 @@ export class LocalLLMEngine {
|
|||
this.setStatus({ state: 'checking' });
|
||||
|
||||
try {
|
||||
const { CreateMLCEngine } = await import('@mlc-ai/web-llm');
|
||||
if (!this.transformers) {
|
||||
this.transformers = await import('@huggingface/transformers');
|
||||
}
|
||||
const config = MODELS[model];
|
||||
|
||||
this.engine = await CreateMLCEngine(config.modelId, {
|
||||
initProgressCallback: (report) => {
|
||||
if (report.progress < 1) {
|
||||
this.setStatus({
|
||||
state: 'downloading',
|
||||
progress: report.progress,
|
||||
text: report.text,
|
||||
});
|
||||
} else {
|
||||
this.setStatus({ state: 'loading', text: 'Initializing model...' });
|
||||
}
|
||||
},
|
||||
// transformers.js progress callback shape:
|
||||
// { status: 'initiate'|'download'|'progress'|'done'|'ready',
|
||||
// name?: string, file?: string, progress?: number, loaded?: number, total?: number }
|
||||
// We collapse it into our LoadingStatus union.
|
||||
const progressCallback = (report: {
|
||||
status: string;
|
||||
file?: string;
|
||||
name?: string;
|
||||
progress?: number;
|
||||
loaded?: number;
|
||||
total?: number;
|
||||
}) => {
|
||||
const label = report.file ?? report.name ?? '';
|
||||
if (report.status === 'progress' || report.status === 'download') {
|
||||
const pct = typeof report.progress === 'number' ? report.progress : 0;
|
||||
this.setStatus({
|
||||
state: 'downloading',
|
||||
progress: pct / 100,
|
||||
text: label
|
||||
? `Downloading ${label} (${pct.toFixed(0)}%)`
|
||||
: `Downloading (${pct.toFixed(0)}%)`,
|
||||
});
|
||||
} else if (report.status === 'initiate') {
|
||||
this.setStatus({ state: 'downloading', progress: 0, text: `Starting ${label}` });
|
||||
} else if (report.status === 'done') {
|
||||
this.setStatus({ state: 'loading', text: label ? `Loaded ${label}` : 'Loaded shard' });
|
||||
}
|
||||
// 'ready' is handled below after both processor + model finish
|
||||
};
|
||||
|
||||
// AutoProcessor wraps tokenizer + image/audio preprocessors. For
|
||||
// our text-only chat path we use the wrapped tokenizer's
|
||||
// apply_chat_template, but loading the full processor is the
|
||||
// path the model card documents and avoids architecture-specific
|
||||
// special-casing.
|
||||
const { AutoProcessor, Gemma4ForConditionalGeneration } = this.transformers as unknown as {
|
||||
AutoProcessor: { from_pretrained(id: string, opts?: unknown): Promise<AnyProcessor> };
|
||||
Gemma4ForConditionalGeneration: {
|
||||
from_pretrained(id: string, opts?: unknown): Promise<AnyModel>;
|
||||
};
|
||||
};
|
||||
|
||||
this.processor = await AutoProcessor.from_pretrained(config.modelId, {
|
||||
progress_callback: progressCallback,
|
||||
});
|
||||
this.model = await Gemma4ForConditionalGeneration.from_pretrained(config.modelId, {
|
||||
dtype: config.dtype,
|
||||
device: 'webgpu',
|
||||
progress_callback: progressCallback,
|
||||
});
|
||||
|
||||
this.setStatus({ state: 'ready' });
|
||||
|
|
@@ -108,13 +164,15 @@ export class LocalLLMEngine {
|
|||
}
|
||||
|
||||
/**
|
||||
* Unload the model and free memory.
|
||||
* Unload the model and free GPU memory.
|
||||
*/
|
||||
async unload(): Promise<void> {
|
||||
if (this.engine) {
|
||||
await this.engine.unload();
|
||||
this.engine = null;
|
||||
}
|
||||
// transformers.js doesn't expose an explicit dispose() yet — dropping
|
||||
// the references and letting the runtime/GC clean up is the
|
||||
// recommended path. The WebGPU buffers are tied to the model object
|
||||
// and get released when it's no longer reachable.
|
||||
this.model = null;
|
||||
this.processor = null;
|
||||
this.currentModel = null;
|
||||
this.loadPromise = null;
|
||||
this.setStatus({ state: 'idle' });
|
||||
|
|
@@ -124,70 +182,85 @@ export class LocalLLMEngine {
|
|||
* Generate a response. Auto-loads the model if not yet loaded.
|
||||
*/
|
||||
async generate(options: GenerateOptions): Promise<GenerateResult> {
|
||||
if (!this.engine) {
|
||||
if (!this.model || !this.processor) {
|
||||
await this.load();
|
||||
}
|
||||
|
||||
const start = performance.now();
|
||||
|
||||
if (options.onToken) {
|
||||
return this._generateStreaming(options, start);
|
||||
}
|
||||
|
||||
const response = await this.engine!.chat.completions.create({
|
||||
messages: options.messages,
|
||||
temperature: options.temperature ?? 0.7,
|
||||
max_tokens: options.maxTokens ?? 1024,
|
||||
stream: false,
|
||||
// Apply Gemma's chat template via the processor's tokenizer wrapper.
|
||||
// `add_generation_prompt: true` appends the tokens that tell the model
|
||||
// "now generate an assistant turn".
|
||||
const inputs = await this.processor.apply_chat_template(options.messages, {
|
||||
add_generation_prompt: true,
|
||||
return_dict: true,
|
||||
return_tensor: 'pt',
|
||||
});
|
||||
|
||||
const promptTokenCount = this.tensorLength(inputs.input_ids);
|
||||
|
||||
// Streaming via TextStreamer if requested
|
||||
let streamer: unknown = undefined;
|
||||
if (options.onToken) {
|
||||
const transformers = this.transformers as TransformersModule;
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const TextStreamer = (transformers as any).TextStreamer;
|
||||
streamer = new TextStreamer(this.processor.tokenizer, {
|
||||
skip_prompt: true,
|
||||
skip_special_tokens: true,
|
||||
callback_function: (text: string) => {
|
||||
options.onToken!(text);
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
const generated = await this.model.generate({
|
||||
...inputs,
|
||||
max_new_tokens: options.maxTokens ?? 1024,
|
||||
temperature: options.temperature ?? 0.7,
|
||||
do_sample: (options.temperature ?? 0.7) > 0,
|
||||
streamer,
|
||||
});
|
||||
|
||||
// `generated` is a tensor with shape [batch, seq_len_with_prompt].
|
||||
// We slice off the prompt portion to get just the new tokens.
|
||||
const fullSequence = this.tensorRow(generated, 0);
|
||||
const newTokens = fullSequence.slice(promptTokenCount);
|
||||
const completionTokenCount = newTokens.length;
|
||||
|
||||
const content: string = this.processor.tokenizer.decode(newTokens, {
|
||||
skip_special_tokens: true,
|
||||
});
|
||||
|
||||
const choice = response.choices[0];
|
||||
return {
|
||||
content: choice.message.content ?? '',
|
||||
content,
|
||||
usage: {
|
||||
prompt_tokens: response.usage?.prompt_tokens ?? 0,
|
||||
completion_tokens: response.usage?.completion_tokens ?? 0,
|
||||
total_tokens: response.usage?.total_tokens ?? 0,
|
||||
prompt_tokens: promptTokenCount,
|
||||
completion_tokens: completionTokenCount,
|
||||
total_tokens: promptTokenCount + completionTokenCount,
|
||||
},
|
||||
latencyMs: Math.round(performance.now() - start),
|
||||
};
|
||||
}
|
||||
|
||||
private async _generateStreaming(
|
||||
options: GenerateOptions,
|
||||
start: number
|
||||
): Promise<GenerateResult> {
|
||||
const chunks: string[] = [];
|
||||
let usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
|
||||
/**
|
||||
* Helper: extract the seq-length of a transformers.js Tensor.
|
||||
* The tensors expose `.dims` ([batch, seq_len]) and `.data` (TypedArray).
|
||||
*/
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
private tensorLength(tensor: any): number {
|
||||
if (!tensor || !tensor.dims) return 0;
|
||||
return tensor.dims[tensor.dims.length - 1];
|
||||
}
|
||||
|
||||
const stream = await this.engine!.chat.completions.create({
|
||||
messages: options.messages,
|
||||
temperature: options.temperature ?? 0.7,
|
||||
max_tokens: options.maxTokens ?? 1024,
|
||||
stream: true,
|
||||
stream_options: { include_usage: true },
|
||||
});
|
||||
|
||||
for await (const chunk of stream) {
|
||||
const delta = chunk.choices[0]?.delta?.content;
|
||||
if (delta) {
|
||||
chunks.push(delta);
|
||||
options.onToken!(delta);
|
||||
}
|
||||
if (chunk.usage) {
|
||||
usage = {
|
||||
prompt_tokens: chunk.usage.prompt_tokens,
|
||||
completion_tokens: chunk.usage.completion_tokens,
|
||||
total_tokens: chunk.usage.total_tokens,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
content: chunks.join(''),
|
||||
usage,
|
||||
latencyMs: Math.round(performance.now() - start),
|
||||
};
|
||||
/**
|
||||
* Helper: extract row N of a 2D tensor as a number array.
|
||||
*/
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
private tensorRow(tensor: any, row: number): number[] {
|
||||
const seqLen = tensor.dims[tensor.dims.length - 1];
|
||||
const start = row * seqLen;
|
||||
return Array.from(tensor.data.slice(start, start + seqLen)) as number[];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@@ -1,112 +0,0 @@
|
|||
import { getEngine } from './engine.js';
|
||||
|
||||
export interface ChatMessage {
|
||||
role: 'system' | 'user' | 'assistant';
|
||||
content: string;
|
||||
}
|
||||
|
||||
export interface GenerateOptions {
|
||||
messages: ChatMessage[];
|
||||
temperature?: number;
|
||||
maxTokens?: number;
|
||||
onToken?: (token: string) => void;
|
||||
}
|
||||
|
||||
export interface GenerateResult {
|
||||
content: string;
|
||||
latencyMs: number;
|
||||
usage: {
|
||||
prompt_tokens: number;
|
||||
completion_tokens: number;
|
||||
};
|
||||
}
|
||||
|
||||
export async function generate(options: GenerateOptions): Promise<GenerateResult> {
|
||||
const engine = getEngine();
|
||||
if (!engine) throw new Error('No model loaded. Call loadLocalLlm() first.');
|
||||
|
||||
const { messages, temperature = 0.7, maxTokens = 1024, onToken } = options;
|
||||
const start = performance.now();
|
||||
|
||||
const reply = await engine.chat.completions.create({
|
||||
messages,
|
||||
temperature,
|
||||
max_tokens: maxTokens,
|
||||
stream: !!onToken,
|
||||
stream_options: onToken ? { include_usage: true } : undefined,
|
||||
});
|
||||
|
||||
let content = '';
|
||||
let promptTokens = 0;
|
||||
let completionTokens = 0;
|
||||
|
||||
if (Symbol.asyncIterator in Object(reply)) {
|
||||
for await (const chunk of reply as AsyncIterable<any>) {
|
||||
const delta = chunk.choices?.[0]?.delta?.content;
|
||||
if (delta) {
|
||||
content += delta;
|
||||
onToken?.(delta);
|
||||
}
|
||||
if (chunk.usage) {
|
||||
promptTokens = chunk.usage.prompt_tokens ?? 0;
|
||||
completionTokens = chunk.usage.completion_tokens ?? 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const completion = reply as any;
|
||||
content = completion.choices?.[0]?.message?.content ?? '';
|
||||
promptTokens = completion.usage?.prompt_tokens ?? 0;
|
||||
completionTokens = completion.usage?.completion_tokens ?? 0;
|
||||
}
|
||||
|
||||
const latencyMs = Math.round(performance.now() - start);
|
||||
|
||||
return {
|
||||
content,
|
||||
latencyMs,
|
||||
usage: { prompt_tokens: promptTokens, completion_tokens: completionTokens },
|
||||
};
|
||||
}
|
||||
|
||||
export async function extractJson(text: string, instruction: string): Promise<unknown> {
|
||||
const result = await generate({
|
||||
messages: [
|
||||
{
|
||||
role: 'system',
|
||||
content:
|
||||
'You are a JSON extraction assistant. Respond ONLY with valid JSON, no explanation or markdown.',
|
||||
},
|
||||
{
|
||||
role: 'user',
|
||||
content: `${instruction}\n\nText:\n${text}`,
|
||||
},
|
||||
],
|
||||
temperature: 0.1,
|
||||
maxTokens: 2048,
|
||||
});
|
||||
|
||||
const jsonMatch = result.content.match(/[[{][\s\S]*[}\]]/);
|
||||
if (!jsonMatch) throw new Error('No JSON found in response');
|
||||
return JSON.parse(jsonMatch[0]);
|
||||
}
|
||||
|
||||
export async function classify(text: string, categories: string[]): Promise<string> {
|
||||
const result = await generate({
|
||||
messages: [
|
||||
{
|
||||
role: 'system',
|
||||
content: `You are a text classifier. Classify the text into exactly one of these categories: ${categories.join(', ')}. Respond with ONLY the category name, nothing else.`,
|
||||
},
|
||||
{
|
||||
role: 'user',
|
||||
content: text,
|
||||
},
|
||||
],
|
||||
temperature: 0.1,
|
||||
maxTokens: 50,
|
||||
});
|
||||
|
||||
const response = result.content.trim().toLowerCase();
|
||||
const match = categories.find((c) => response.includes(c.toLowerCase()));
|
||||
return match ?? result.content.trim();
|
||||
}
|
||||
|
|
@@ -2,40 +2,29 @@ import type { ModelConfig } from './types';
|
|||
|
||||
/**
|
||||
* Pre-configured models for client-side inference.
|
||||
* All models are quantized for browser use via WebLLM/MLC.
|
||||
*
|
||||
* All models are ONNX builds loaded via @huggingface/transformers (transformers.js)
|
||||
* with the WebGPU backend. The default is Google's Gemma 4 E2B — the smallest
|
||||
* member of the Gemma 4 family released 2026-04-02. E2B stands for "Effective 2B"
|
||||
* and is multimodal (text + image + audio) at the model level, but our chat-only
|
||||
* code path only ever passes text.
|
||||
*
|
||||
* Adding a new model: pick a HuggingFace ONNX repo (look on huggingface.co/onnx-community
|
||||
* for community-converted models, or huggingface.co/{org}/{repo}-ONNX for first-party
|
||||
* builds), confirm it has a `q4f16` quantization in its `onnx/` directory, and add an
|
||||
* entry below. The /llm-test page picks up new entries automatically.
|
||||
*/
|
||||
|
||||
export const MODELS = {
|
||||
/** Default model — fast, good at structured output, multilingual */
|
||||
'qwen-2.5-1.5b': {
|
||||
modelId: 'Qwen2.5-1.5B-Instruct-q4f16_1-MLC',
|
||||
displayName: 'Qwen 2.5 1.5B',
|
||||
downloadSizeMb: 1000,
|
||||
ramUsageMb: 1800,
|
||||
},
|
||||
/** Smaller variant for low-end devices */
|
||||
'qwen-2.5-0.5b': {
|
||||
modelId: 'Qwen2.5-0.5B-Instruct-q4f16_1-MLC',
|
||||
displayName: 'Qwen 2.5 0.5B',
|
||||
downloadSizeMb: 400,
|
||||
ramUsageMb: 800,
|
||||
},
|
||||
/** Google Gemma 2 — strong general-purpose model, similar size class to Qwen 1.5B */
|
||||
'gemma-2-2b': {
|
||||
modelId: 'gemma-2-2b-it-q4f16_1-MLC',
|
||||
displayName: 'Gemma 2 2B',
|
||||
downloadSizeMb: 1400,
|
||||
ramUsageMb: 2200,
|
||||
},
|
||||
/** Google Gemma 2 9B — much higher quality, needs a beefy GPU (~6GB VRAM) */
|
||||
'gemma-2-9b': {
|
||||
modelId: 'gemma-2-9b-it-q4f16_1-MLC',
|
||||
displayName: 'Gemma 2 9B',
|
||||
downloadSizeMb: 5300,
|
||||
ramUsageMb: 6500,
|
||||
'gemma-4-e2b': {
|
||||
modelId: 'onnx-community/gemma-4-E2B-it-ONNX',
|
||||
displayName: 'Gemma 4 E2B',
|
||||
dtype: 'q4f16',
|
||||
downloadSizeMb: 500,
|
||||
ramUsageMb: 1500,
|
||||
},
|
||||
} as const satisfies Record<string, ModelConfig>;
|
||||
|
||||
export type ModelKey = keyof typeof MODELS;
|
||||
|
||||
export const DEFAULT_MODEL: ModelKey = 'qwen-2.5-1.5b';
|
||||
export const DEFAULT_MODEL: ModelKey = 'gemma-4-e2b';
|
||||
|
|
|
|||
|
|
@@ -1,22 +0,0 @@
|
|||
import { subscribe, type LlmState } from './engine.js';
|
||||
|
||||
/**
|
||||
* Reactive status wrapper for use in Svelte 5 components.
|
||||
* Returns an object with a `current` property that updates reactively.
|
||||
*/
|
||||
export function getLocalLlmStatus(): { current: LlmState } {
|
||||
let state = $state<LlmState>({ state: 'idle' });
|
||||
|
||||
$effect(() => {
|
||||
const unsub = subscribe((s) => {
|
||||
state = s;
|
||||
});
|
||||
return unsub;
|
||||
});
|
||||
|
||||
return {
|
||||
get current() {
|
||||
return state;
|
||||
},
|
||||
};
|
||||
}
|
||||
|
|
@@ -33,10 +33,19 @@ export interface GenerateResult {
|
|||
}
|
||||
|
||||
export interface ModelConfig {
|
||||
/** WebLLM model identifier */
|
||||
/** HuggingFace ONNX repo id, e.g. "onnx-community/gemma-4-E2B-it-ONNX" */
|
||||
modelId: string;
|
||||
/** Human-readable name */
|
||||
displayName: string;
|
||||
/**
|
||||
* Quantization the transformers.js loader should request. Common values:
|
||||
* - "fp32" — full precision, biggest, only for tiny models
|
||||
* - "fp16" — half precision, ~50% smaller than fp32
|
||||
* - "q8" — 8-bit weights, fp32 activations
|
||||
* - "q4" — 4-bit weights, fp32 activations
|
||||
* - "q4f16" — 4-bit weights, fp16 activations (recommended for WebGPU)
|
||||
*/
|
||||
dtype: 'fp32' | 'fp16' | 'q8' | 'q4' | 'q4f16';
|
||||
/** Approximate download size in MB */
|
||||
downloadSizeMb: number;
|
||||
/** Approximate VRAM/RAM usage in MB */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue