feat(web): PillNav bar mode, fullscreen, local STT + mic button

PillNav overhaul: - Dropdown-as-bar: theme/AI/sync/user menus render as horizontal bars in the bottom stack (PillDropdownBar) instead of floating popovers. New onOpenBar/activeBarId props on PillNavigation. - iconOnly pills: tags/search/workbench-tabs pills show only icons. Home pill removed. New iconOnly flag on PillNavItem. - Segmented toggle groups: items sharing a `group` id render as a single segmented pill (e.g. Light/Dark/System triple). - Fullscreen mode: press "f" to hide all bottom chrome, Esc to exit. - QuickInputBar + bottom bar visibility toggles via new pills. - Progress ring on AI trigger pill during model download (conic-gradient ::after, follows pill border-radius). @mana/local-stt — new package for browser-local speech-to-text: - Whisper models via transformers.js v4 (WebGPU + WASM fallback) - Same Web Worker architecture as @mana/local-llm - Two models: Whisper Tiny (150 MB) and Whisper Small (950 MB) - Reactive Svelte 5 bindings (getLocalSttStatus, loadLocalStt, transcribe) Voice-to-text integration: - useLocalStt() composable: mic capture via AudioContext + ScriptProcessor, resample to 16kHz mono, feed into Whisper worker - Mic button in QuickInputBar (leftAction slot) with recording/loading/transcribing states + pulse animation - Transcribed text injected into InputBar via new injectedText prop - STT model selector in AI bar alongside LLM tier controls Also: vite.config.ts server.fs.allow expanded to monorepo root so workspace package workers resolve in dev. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-27 06:37:43 +02:00 · 2026-04-12 16:05:43 +02:00 · 2026-04-12 16:05:43 +02:00 · 3deee755b3
commit 3deee755b3
parent 8c2f9306e9
24 changed files with 2145 additions and 28 deletions
--- a/packages/local-stt/src/engine-impl.ts
+++ b/packages/local-stt/src/engine-impl.ts
@ -0,0 +1,231 @@
+/**
+ * LocalSttEngineImpl — the actual transformers.js Whisper engine.
+ *
+ * Runs inside a Web Worker (worker.ts). The main thread never
+ * instantiates this directly — it talks to a thin proxy in engine.ts
+ * that postMessages over to the worker.
+ *
+ * Whisper processes audio in 30-second chunks. For longer recordings
+ * the pipeline handles chunking internally via `chunk_length_s`.
+ * We expose pseudo-streaming by forwarding each chunk's text via
+ * the onChunk callback as it completes.
+ */
+
+import type {
+	TranscribeOptions,
+	TranscribeResult,
+	TranscribeSegment,
+	LoadingStatus,
+	SttModelConfig,
+} from './types';
+import { MODELS, DEFAULT_MODEL, type ModelKey } from './models';
+
+type TransformersModule = typeof import('@huggingface/transformers');
+
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+type AnyPipeline = any;
+
+export class LocalSttEngineImpl {
+	private pipeline: AnyPipeline = null;
+	private transformers: TransformersModule | null = null;
+	private loadPromise: Promise<void> | null = null;
+	private currentModel: ModelKey | null = null;
+	private _status: LoadingStatus = { state: 'idle' };
+	private statusListeners: Set<(status: LoadingStatus) => void> = new Set();
+
+	get status(): LoadingStatus {
+		return this._status;
+	}
+
+	get isReady(): boolean {
+		return this._status.state === 'ready';
+	}
+
+	get modelConfig(): SttModelConfig | null {
+		return this.currentModel ? MODELS[this.currentModel] : null;
+	}
+
+	onStatusChange(listener: (status: LoadingStatus) => void): () => void {
+		this.statusListeners.add(listener);
+		return () => this.statusListeners.delete(listener);
+	}
+
+	private setStatus(status: LoadingStatus) {
+		this._status = status;
+		for (const listener of this.statusListeners) {
+			listener(status);
+		}
+	}
+
+	static isSupported(): boolean {
+		return typeof navigator !== 'undefined' && 'gpu' in navigator;
+	}
+
+	async load(model: ModelKey = DEFAULT_MODEL): Promise<void> {
+		if (this.pipeline && this.currentModel === model) return;
+		if (this.loadPromise && this.currentModel === model) return this.loadPromise;
+		if (this.pipeline && this.currentModel !== model) {
+			await this.unload();
+		}
+		this.currentModel = model;
+		this.loadPromise = this._load(model);
+		return this.loadPromise;
+	}
+
+	private async _load(model: ModelKey): Promise<void> {
+		this.setStatus({ state: 'checking' });
+
+		try {
+			if (!this.transformers) {
+				this.transformers = await import('@huggingface/transformers');
+			}
+
+			const config = MODELS[model];
+
+			// Aggregated download progress tracking (same pattern as local-llm).
+			const fileProgress = new Map<string, { loaded: number; total: number }>();
+
+			const formatBytes = (bytes: number): string => {
+				if (bytes < 1024) return `${bytes} B`;
+				if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
+				if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(0)} MB`;
+				return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`;
+			};
+
+			const emitAggregate = () => {
+				let totalLoaded = 0;
+				let totalSize = 0;
+				for (const { loaded, total } of fileProgress.values()) {
+					totalLoaded += loaded;
+					totalSize += total;
+				}
+				const pct = totalSize > 0 ? totalLoaded / totalSize : 0;
+				this.setStatus({
+					state: 'downloading',
+					progress: pct,
+					text:
+						totalSize > 0
+							? `Downloading model (${(pct * 100).toFixed(0)}%, ${formatBytes(totalLoaded)} / ${formatBytes(totalSize)})`
+							: `Downloading model (${fileProgress.size} files queued)`,
+				});
+			};
+
+			const progressCallback = (report: {
+				status: string;
+				file?: string;
+				name?: string;
+				progress?: number;
+				loaded?: number;
+				total?: number;
+			}) => {
+				const file = report.file ?? report.name ?? '_unknown';
+				if (report.status === 'initiate') {
+					if (!fileProgress.has(file)) fileProgress.set(file, { loaded: 0, total: 0 });
+					emitAggregate();
+				} else if (report.status === 'download' || report.status === 'progress') {
+					fileProgress.set(file, {
+						loaded: report.loaded ?? 0,
+						total: report.total ?? fileProgress.get(file)?.total ?? 0,
+					});
+					emitAggregate();
+				} else if (report.status === 'done') {
+					const existing = fileProgress.get(file);
+					if (existing && existing.total > 0) {
+						fileProgress.set(file, { loaded: existing.total, total: existing.total });
+					}
+					emitAggregate();
+				}
+			};
+
+			this.setStatus({ state: 'loading', text: 'Loading Whisper pipeline…' });
+
+			// Use transformers.js pipeline() API for automatic-speech-recognition.
+			// This handles model + processor + tokenizer loading in one call.
+			// Device selection: try WebGPU first, fall back to WASM.
+			const device = LocalSttEngineImpl.isSupported() ? 'webgpu' : 'wasm';
+
+			this.pipeline = await this.transformers.pipeline(
+				'automatic-speech-recognition',
+				config.modelId,
+				{
+					dtype: config.dtype,
+					device,
+					progress_callback: progressCallback,
+				}
+			);
+
+			this.setStatus({ state: 'ready' });
+		} catch (err) {
+			const message = err instanceof Error ? err.message : String(err);
+			this.setStatus({ state: 'error', error: message });
+			this.loadPromise = null;
+			throw err;
+		}
+	}
+
+	async unload(): Promise<void> {
+		this.pipeline = null;
+		this.currentModel = null;
+		this.loadPromise = null;
+		this.setStatus({ state: 'idle' });
+	}
+
+	async transcribe(options: TranscribeOptions): Promise<TranscribeResult> {
+		if (!this.pipeline) {
+			await this.load();
+		}
+
+		const start = performance.now();
+
+		// Build pipeline options.
+		// eslint-disable-next-line @typescript-eslint/no-explicit-any
+		const pipelineOpts: Record<string, any> = {
+			// Chunk long audio into 30s segments with 5s stride overlap.
+			chunk_length_s: 30,
+			stride_length_s: 5,
+			// Return timestamps if requested.
+			return_timestamps: options.timestamps ? true : false,
+		};
+
+		if (options.language) {
+			pipelineOpts.language = options.language;
+		}
+
+		// Callback for pseudo-streaming: transformers.js emits partial
+		// results per chunk via the `chunk_callback` option.
+		if (options.onChunk) {
+			const onChunk = options.onChunk;
+			// eslint-disable-next-line @typescript-eslint/no-explicit-any
+			pipelineOpts.chunk_callback = (chunk: any) => {
+				if (chunk?.text) {
+					onChunk(chunk.text);
+				}
+			};
+		}
+
+		// Run the pipeline. Input is Float32Array of 16kHz mono PCM.
+		// eslint-disable-next-line @typescript-eslint/no-explicit-any
+		const output: any = await this.pipeline(options.audio, pipelineOpts);
+
+		const latencyMs = Math.round(performance.now() - start);
+
+		// Parse output — the pipeline returns { text, chunks? } for
+		// automatic-speech-recognition with return_timestamps.
+		const text: string = output.text ?? '';
+		const language: string = options.language ?? 'auto';
+
+		let segments: TranscribeSegment[] | undefined;
+		if (options.timestamps && output.chunks) {
+			segments = output.chunks.map(
+				// eslint-disable-next-line @typescript-eslint/no-explicit-any
+				(c: any) => ({
+					start: c.timestamp?.[0] ?? 0,
+					end: c.timestamp?.[1] ?? 0,
+					text: c.text ?? '',
+				})
+			);
+		}
+
+		return { text, language, segments, latencyMs };
+	}
+}