mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-27 06:37:43 +02:00
feat(web): PillNav bar mode, fullscreen, local STT + mic button
PillNav overhaul: - Dropdown-as-bar: theme/AI/sync/user menus render as horizontal bars in the bottom stack (PillDropdownBar) instead of floating popovers. New onOpenBar/activeBarId props on PillNavigation. - iconOnly pills: tags/search/workbench-tabs pills show only icons. Home pill removed. New iconOnly flag on PillNavItem. - Segmented toggle groups: items sharing a `group` id render as a single segmented pill (e.g. Light/Dark/System triple). - Fullscreen mode: press "f" to hide all bottom chrome, Esc to exit. - QuickInputBar + bottom bar visibility toggles via new pills. - Progress ring on AI trigger pill during model download (conic-gradient ::after, follows pill border-radius). @mana/local-stt — new package for browser-local speech-to-text: - Whisper models via transformers.js v4 (WebGPU + WASM fallback) - Same Web Worker architecture as @mana/local-llm - Two models: Whisper Tiny (150 MB) and Whisper Small (950 MB) - Reactive Svelte 5 bindings (getLocalSttStatus, loadLocalStt, transcribe) Voice-to-text integration: - useLocalStt() composable: mic capture via AudioContext + ScriptProcessor, resample to 16kHz mono, feed into Whisper worker - Mic button in QuickInputBar (leftAction slot) with recording/loading/transcribing states + pulse animation - Transcribed text injected into InputBar via new injectedText prop - STT model selector in AI bar alongside LLM tier controls Also: vite.config.ts server.fs.allow expanded to monorepo root so workspace package workers resolve in dev. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8c2f9306e9
commit
3deee755b3
24 changed files with 2145 additions and 28 deletions
231
packages/local-stt/src/engine-impl.ts
Normal file
231
packages/local-stt/src/engine-impl.ts
Normal file
|
|
@ -0,0 +1,231 @@
|
|||
/**
|
||||
* LocalSttEngineImpl — the actual transformers.js Whisper engine.
|
||||
*
|
||||
* Runs inside a Web Worker (worker.ts). The main thread never
|
||||
* instantiates this directly — it talks to a thin proxy in engine.ts
|
||||
* that postMessages over to the worker.
|
||||
*
|
||||
* Whisper processes audio in 30-second chunks. For longer recordings
|
||||
* the pipeline handles chunking internally via `chunk_length_s`.
|
||||
* We expose pseudo-streaming by forwarding each chunk's text via
|
||||
* the onChunk callback as it completes.
|
||||
*/
|
||||
|
||||
import type {
|
||||
TranscribeOptions,
|
||||
TranscribeResult,
|
||||
TranscribeSegment,
|
||||
LoadingStatus,
|
||||
SttModelConfig,
|
||||
} from './types';
|
||||
import { MODELS, DEFAULT_MODEL, type ModelKey } from './models';
|
||||
|
||||
type TransformersModule = typeof import('@huggingface/transformers');
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
type AnyPipeline = any;
|
||||
|
||||
export class LocalSttEngineImpl {
|
||||
private pipeline: AnyPipeline = null;
|
||||
private transformers: TransformersModule | null = null;
|
||||
private loadPromise: Promise<void> | null = null;
|
||||
private currentModel: ModelKey | null = null;
|
||||
private _status: LoadingStatus = { state: 'idle' };
|
||||
private statusListeners: Set<(status: LoadingStatus) => void> = new Set();
|
||||
|
||||
get status(): LoadingStatus {
|
||||
return this._status;
|
||||
}
|
||||
|
||||
get isReady(): boolean {
|
||||
return this._status.state === 'ready';
|
||||
}
|
||||
|
||||
get modelConfig(): SttModelConfig | null {
|
||||
return this.currentModel ? MODELS[this.currentModel] : null;
|
||||
}
|
||||
|
||||
onStatusChange(listener: (status: LoadingStatus) => void): () => void {
|
||||
this.statusListeners.add(listener);
|
||||
return () => this.statusListeners.delete(listener);
|
||||
}
|
||||
|
||||
private setStatus(status: LoadingStatus) {
|
||||
this._status = status;
|
||||
for (const listener of this.statusListeners) {
|
||||
listener(status);
|
||||
}
|
||||
}
|
||||
|
||||
static isSupported(): boolean {
|
||||
return typeof navigator !== 'undefined' && 'gpu' in navigator;
|
||||
}
|
||||
|
||||
async load(model: ModelKey = DEFAULT_MODEL): Promise<void> {
|
||||
if (this.pipeline && this.currentModel === model) return;
|
||||
if (this.loadPromise && this.currentModel === model) return this.loadPromise;
|
||||
if (this.pipeline && this.currentModel !== model) {
|
||||
await this.unload();
|
||||
}
|
||||
this.currentModel = model;
|
||||
this.loadPromise = this._load(model);
|
||||
return this.loadPromise;
|
||||
}
|
||||
|
||||
private async _load(model: ModelKey): Promise<void> {
|
||||
this.setStatus({ state: 'checking' });
|
||||
|
||||
try {
|
||||
if (!this.transformers) {
|
||||
this.transformers = await import('@huggingface/transformers');
|
||||
}
|
||||
|
||||
const config = MODELS[model];
|
||||
|
||||
// Aggregated download progress tracking (same pattern as local-llm).
|
||||
const fileProgress = new Map<string, { loaded: number; total: number }>();
|
||||
|
||||
const formatBytes = (bytes: number): string => {
|
||||
if (bytes < 1024) return `${bytes} B`;
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
|
||||
if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(0)} MB`;
|
||||
return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`;
|
||||
};
|
||||
|
||||
const emitAggregate = () => {
|
||||
let totalLoaded = 0;
|
||||
let totalSize = 0;
|
||||
for (const { loaded, total } of fileProgress.values()) {
|
||||
totalLoaded += loaded;
|
||||
totalSize += total;
|
||||
}
|
||||
const pct = totalSize > 0 ? totalLoaded / totalSize : 0;
|
||||
this.setStatus({
|
||||
state: 'downloading',
|
||||
progress: pct,
|
||||
text:
|
||||
totalSize > 0
|
||||
? `Downloading model (${(pct * 100).toFixed(0)}%, ${formatBytes(totalLoaded)} / ${formatBytes(totalSize)})`
|
||||
: `Downloading model (${fileProgress.size} files queued)`,
|
||||
});
|
||||
};
|
||||
|
||||
const progressCallback = (report: {
|
||||
status: string;
|
||||
file?: string;
|
||||
name?: string;
|
||||
progress?: number;
|
||||
loaded?: number;
|
||||
total?: number;
|
||||
}) => {
|
||||
const file = report.file ?? report.name ?? '_unknown';
|
||||
if (report.status === 'initiate') {
|
||||
if (!fileProgress.has(file)) fileProgress.set(file, { loaded: 0, total: 0 });
|
||||
emitAggregate();
|
||||
} else if (report.status === 'download' || report.status === 'progress') {
|
||||
fileProgress.set(file, {
|
||||
loaded: report.loaded ?? 0,
|
||||
total: report.total ?? fileProgress.get(file)?.total ?? 0,
|
||||
});
|
||||
emitAggregate();
|
||||
} else if (report.status === 'done') {
|
||||
const existing = fileProgress.get(file);
|
||||
if (existing && existing.total > 0) {
|
||||
fileProgress.set(file, { loaded: existing.total, total: existing.total });
|
||||
}
|
||||
emitAggregate();
|
||||
}
|
||||
};
|
||||
|
||||
this.setStatus({ state: 'loading', text: 'Loading Whisper pipeline…' });
|
||||
|
||||
// Use transformers.js pipeline() API for automatic-speech-recognition.
|
||||
// This handles model + processor + tokenizer loading in one call.
|
||||
// Device selection: try WebGPU first, fall back to WASM.
|
||||
const device = LocalSttEngineImpl.isSupported() ? 'webgpu' : 'wasm';
|
||||
|
||||
this.pipeline = await this.transformers.pipeline(
|
||||
'automatic-speech-recognition',
|
||||
config.modelId,
|
||||
{
|
||||
dtype: config.dtype,
|
||||
device,
|
||||
progress_callback: progressCallback,
|
||||
}
|
||||
);
|
||||
|
||||
this.setStatus({ state: 'ready' });
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : String(err);
|
||||
this.setStatus({ state: 'error', error: message });
|
||||
this.loadPromise = null;
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
async unload(): Promise<void> {
|
||||
this.pipeline = null;
|
||||
this.currentModel = null;
|
||||
this.loadPromise = null;
|
||||
this.setStatus({ state: 'idle' });
|
||||
}
|
||||
|
||||
async transcribe(options: TranscribeOptions): Promise<TranscribeResult> {
|
||||
if (!this.pipeline) {
|
||||
await this.load();
|
||||
}
|
||||
|
||||
const start = performance.now();
|
||||
|
||||
// Build pipeline options.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const pipelineOpts: Record<string, any> = {
|
||||
// Chunk long audio into 30s segments with 5s stride overlap.
|
||||
chunk_length_s: 30,
|
||||
stride_length_s: 5,
|
||||
// Return timestamps if requested.
|
||||
return_timestamps: options.timestamps ? true : false,
|
||||
};
|
||||
|
||||
if (options.language) {
|
||||
pipelineOpts.language = options.language;
|
||||
}
|
||||
|
||||
// Callback for pseudo-streaming: transformers.js emits partial
|
||||
// results per chunk via the `chunk_callback` option.
|
||||
if (options.onChunk) {
|
||||
const onChunk = options.onChunk;
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
pipelineOpts.chunk_callback = (chunk: any) => {
|
||||
if (chunk?.text) {
|
||||
onChunk(chunk.text);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Run the pipeline. Input is Float32Array of 16kHz mono PCM.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const output: any = await this.pipeline(options.audio, pipelineOpts);
|
||||
|
||||
const latencyMs = Math.round(performance.now() - start);
|
||||
|
||||
// Parse output — the pipeline returns { text, chunks? } for
|
||||
// automatic-speech-recognition with return_timestamps.
|
||||
const text: string = output.text ?? '';
|
||||
const language: string = options.language ?? 'auto';
|
||||
|
||||
let segments: TranscribeSegment[] | undefined;
|
||||
if (options.timestamps && output.chunks) {
|
||||
segments = output.chunks.map(
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(c: any) => ({
|
||||
start: c.timestamp?.[0] ?? 0,
|
||||
end: c.timestamp?.[1] ?? 0,
|
||||
text: c.text ?? '',
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
return { text, language, segments, latencyMs };
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue