mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-16 00:39:39 +02:00
PillNav overhaul: - Dropdown-as-bar: theme/AI/sync/user menus render as horizontal bars in the bottom stack (PillDropdownBar) instead of floating popovers. New onOpenBar/activeBarId props on PillNavigation. - iconOnly pills: tags/search/workbench-tabs pills show only icons. Home pill removed. New iconOnly flag on PillNavItem. - Segmented toggle groups: items sharing a `group` id render as a single segmented pill (e.g. Light/Dark/System triple). - Fullscreen mode: press "f" to hide all bottom chrome, Esc to exit. - QuickInputBar + bottom bar visibility toggles via new pills. - Progress ring on AI trigger pill during model download (conic-gradient ::after, follows pill border-radius). @mana/local-stt — new package for browser-local speech-to-text: - Whisper models via transformers.js v4 (WebGPU + WASM fallback) - Same Web Worker architecture as @mana/local-llm - Two models: Whisper Tiny (150 MB) and Whisper Small (950 MB) - Reactive Svelte 5 bindings (getLocalSttStatus, loadLocalStt, transcribe) Voice-to-text integration: - useLocalStt() composable: mic capture via AudioContext + ScriptProcessor, resample to 16kHz mono, feed into Whisper worker - Mic button in QuickInputBar (leftAction slot) with recording/loading/transcribing states + pulse animation - Transcribed text injected into InputBar via new injectedText prop - STT model selector in AI bar alongside LLM tier controls Also: vite.config.ts server.fs.allow expanded to monorepo root so workspace package workers resolve in dev. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
231 lines
7 KiB
TypeScript
231 lines
7 KiB
TypeScript
/**
|
|
* LocalSttEngineImpl — the actual transformers.js Whisper engine.
|
|
*
|
|
* Runs inside a Web Worker (worker.ts). The main thread never
|
|
* instantiates this directly — it talks to a thin proxy in engine.ts
|
|
* that postMessages over to the worker.
|
|
*
|
|
* Whisper processes audio in 30-second chunks. For longer recordings
|
|
* the pipeline handles chunking internally via `chunk_length_s`.
|
|
* We expose pseudo-streaming by forwarding each chunk's text via
|
|
* the onChunk callback as it completes.
|
|
*/
|
|
|
|
import type {
|
|
TranscribeOptions,
|
|
TranscribeResult,
|
|
TranscribeSegment,
|
|
LoadingStatus,
|
|
SttModelConfig,
|
|
} from './types';
|
|
import { MODELS, DEFAULT_MODEL, type ModelKey } from './models';
|
|
|
|
type TransformersModule = typeof import('@huggingface/transformers');
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
type AnyPipeline = any;
|
|
|
|
export class LocalSttEngineImpl {
|
|
private pipeline: AnyPipeline = null;
|
|
private transformers: TransformersModule | null = null;
|
|
private loadPromise: Promise<void> | null = null;
|
|
private currentModel: ModelKey | null = null;
|
|
private _status: LoadingStatus = { state: 'idle' };
|
|
private statusListeners: Set<(status: LoadingStatus) => void> = new Set();
|
|
|
|
get status(): LoadingStatus {
|
|
return this._status;
|
|
}
|
|
|
|
get isReady(): boolean {
|
|
return this._status.state === 'ready';
|
|
}
|
|
|
|
get modelConfig(): SttModelConfig | null {
|
|
return this.currentModel ? MODELS[this.currentModel] : null;
|
|
}
|
|
|
|
onStatusChange(listener: (status: LoadingStatus) => void): () => void {
|
|
this.statusListeners.add(listener);
|
|
return () => this.statusListeners.delete(listener);
|
|
}
|
|
|
|
private setStatus(status: LoadingStatus) {
|
|
this._status = status;
|
|
for (const listener of this.statusListeners) {
|
|
listener(status);
|
|
}
|
|
}
|
|
|
|
static isSupported(): boolean {
|
|
return typeof navigator !== 'undefined' && 'gpu' in navigator;
|
|
}
|
|
|
|
async load(model: ModelKey = DEFAULT_MODEL): Promise<void> {
|
|
if (this.pipeline && this.currentModel === model) return;
|
|
if (this.loadPromise && this.currentModel === model) return this.loadPromise;
|
|
if (this.pipeline && this.currentModel !== model) {
|
|
await this.unload();
|
|
}
|
|
this.currentModel = model;
|
|
this.loadPromise = this._load(model);
|
|
return this.loadPromise;
|
|
}
|
|
|
|
private async _load(model: ModelKey): Promise<void> {
|
|
this.setStatus({ state: 'checking' });
|
|
|
|
try {
|
|
if (!this.transformers) {
|
|
this.transformers = await import('@huggingface/transformers');
|
|
}
|
|
|
|
const config = MODELS[model];
|
|
|
|
// Aggregated download progress tracking (same pattern as local-llm).
|
|
const fileProgress = new Map<string, { loaded: number; total: number }>();
|
|
|
|
const formatBytes = (bytes: number): string => {
|
|
if (bytes < 1024) return `${bytes} B`;
|
|
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
|
|
if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(0)} MB`;
|
|
return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`;
|
|
};
|
|
|
|
const emitAggregate = () => {
|
|
let totalLoaded = 0;
|
|
let totalSize = 0;
|
|
for (const { loaded, total } of fileProgress.values()) {
|
|
totalLoaded += loaded;
|
|
totalSize += total;
|
|
}
|
|
const pct = totalSize > 0 ? totalLoaded / totalSize : 0;
|
|
this.setStatus({
|
|
state: 'downloading',
|
|
progress: pct,
|
|
text:
|
|
totalSize > 0
|
|
? `Downloading model (${(pct * 100).toFixed(0)}%, ${formatBytes(totalLoaded)} / ${formatBytes(totalSize)})`
|
|
: `Downloading model (${fileProgress.size} files queued)`,
|
|
});
|
|
};
|
|
|
|
const progressCallback = (report: {
|
|
status: string;
|
|
file?: string;
|
|
name?: string;
|
|
progress?: number;
|
|
loaded?: number;
|
|
total?: number;
|
|
}) => {
|
|
const file = report.file ?? report.name ?? '_unknown';
|
|
if (report.status === 'initiate') {
|
|
if (!fileProgress.has(file)) fileProgress.set(file, { loaded: 0, total: 0 });
|
|
emitAggregate();
|
|
} else if (report.status === 'download' || report.status === 'progress') {
|
|
fileProgress.set(file, {
|
|
loaded: report.loaded ?? 0,
|
|
total: report.total ?? fileProgress.get(file)?.total ?? 0,
|
|
});
|
|
emitAggregate();
|
|
} else if (report.status === 'done') {
|
|
const existing = fileProgress.get(file);
|
|
if (existing && existing.total > 0) {
|
|
fileProgress.set(file, { loaded: existing.total, total: existing.total });
|
|
}
|
|
emitAggregate();
|
|
}
|
|
};
|
|
|
|
this.setStatus({ state: 'loading', text: 'Loading Whisper pipeline…' });
|
|
|
|
// Use transformers.js pipeline() API for automatic-speech-recognition.
|
|
// This handles model + processor + tokenizer loading in one call.
|
|
// Device selection: try WebGPU first, fall back to WASM.
|
|
const device = LocalSttEngineImpl.isSupported() ? 'webgpu' : 'wasm';
|
|
|
|
this.pipeline = await this.transformers.pipeline(
|
|
'automatic-speech-recognition',
|
|
config.modelId,
|
|
{
|
|
dtype: config.dtype,
|
|
device,
|
|
progress_callback: progressCallback,
|
|
}
|
|
);
|
|
|
|
this.setStatus({ state: 'ready' });
|
|
} catch (err) {
|
|
const message = err instanceof Error ? err.message : String(err);
|
|
this.setStatus({ state: 'error', error: message });
|
|
this.loadPromise = null;
|
|
throw err;
|
|
}
|
|
}
|
|
|
|
async unload(): Promise<void> {
|
|
this.pipeline = null;
|
|
this.currentModel = null;
|
|
this.loadPromise = null;
|
|
this.setStatus({ state: 'idle' });
|
|
}
|
|
|
|
async transcribe(options: TranscribeOptions): Promise<TranscribeResult> {
|
|
if (!this.pipeline) {
|
|
await this.load();
|
|
}
|
|
|
|
const start = performance.now();
|
|
|
|
// Build pipeline options.
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
const pipelineOpts: Record<string, any> = {
|
|
// Chunk long audio into 30s segments with 5s stride overlap.
|
|
chunk_length_s: 30,
|
|
stride_length_s: 5,
|
|
// Return timestamps if requested.
|
|
return_timestamps: options.timestamps ? true : false,
|
|
};
|
|
|
|
if (options.language) {
|
|
pipelineOpts.language = options.language;
|
|
}
|
|
|
|
// Callback for pseudo-streaming: transformers.js emits partial
|
|
// results per chunk via the `chunk_callback` option.
|
|
if (options.onChunk) {
|
|
const onChunk = options.onChunk;
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
pipelineOpts.chunk_callback = (chunk: any) => {
|
|
if (chunk?.text) {
|
|
onChunk(chunk.text);
|
|
}
|
|
};
|
|
}
|
|
|
|
// Run the pipeline. Input is Float32Array of 16kHz mono PCM.
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
const output: any = await this.pipeline(options.audio, pipelineOpts);
|
|
|
|
const latencyMs = Math.round(performance.now() - start);
|
|
|
|
// Parse output — the pipeline returns { text, chunks? } for
|
|
// automatic-speech-recognition with return_timestamps.
|
|
const text: string = output.text ?? '';
|
|
const language: string = options.language ?? 'auto';
|
|
|
|
let segments: TranscribeSegment[] | undefined;
|
|
if (options.timestamps && output.chunks) {
|
|
segments = output.chunks.map(
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
(c: any) => ({
|
|
start: c.timestamp?.[0] ?? 0,
|
|
end: c.timestamp?.[1] ?? 0,
|
|
text: c.text ?? '',
|
|
})
|
|
);
|
|
}
|
|
|
|
return { text, language, segments, latencyMs };
|
|
}
|
|
}
|