mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-20 06:23:40 +02:00
feat(web): PillNav bar mode, fullscreen, local STT + mic button
PillNav overhaul: - Dropdown-as-bar: theme/AI/sync/user menus render as horizontal bars in the bottom stack (PillDropdownBar) instead of floating popovers. New onOpenBar/activeBarId props on PillNavigation. - iconOnly pills: tags/search/workbench-tabs pills show only icons. Home pill removed. New iconOnly flag on PillNavItem. - Segmented toggle groups: items sharing a `group` id render as a single segmented pill (e.g. Light/Dark/System triple). - Fullscreen mode: press "f" to hide all bottom chrome, Esc to exit. - QuickInputBar + bottom bar visibility toggles via new pills. - Progress ring on AI trigger pill during model download (conic-gradient ::after, follows pill border-radius). @mana/local-stt — new package for browser-local speech-to-text: - Whisper models via transformers.js v4 (WebGPU + WASM fallback) - Same Web Worker architecture as @mana/local-llm - Two models: Whisper Tiny (150 MB) and Whisper Small (950 MB) - Reactive Svelte 5 bindings (getLocalSttStatus, loadLocalStt, transcribe) Voice-to-text integration: - useLocalStt() composable: mic capture via AudioContext + ScriptProcessor, resample to 16kHz mono, feed into Whisper worker - Mic button in QuickInputBar (leftAction slot) with recording/loading/transcribing states + pulse animation - Transcribed text injected into InputBar via new injectedText prop - STT model selector in AI bar alongside LLM tier controls Also: vite.config.ts server.fs.allow expanded to monorepo root so workspace package workers resolve in dev. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8c2f9306e9
commit
3deee755b3
24 changed files with 2145 additions and 28 deletions
|
|
@ -160,6 +160,7 @@ pnpm test:e2e # Playwright
|
|||
- **Auth**: Mana Auth (Better Auth + EdDSA JWT) via `@mana/shared-auth`
|
||||
- **Data**: Dexie.js (local-first) + mana-sync (Go) backend
|
||||
- **Encryption**: AES-GCM-256 via Web Crypto, server-wrapped MK with optional zero-knowledge
|
||||
- **Local AI**: `@mana/local-llm` (Gemma 4 E2B, WebGPU) + `@mana/local-stt` (Whisper, WebGPU) — both run entirely in-browser via transformers.js
|
||||
- **Testing**: Vitest, Playwright
|
||||
- **Mobile**: Expo, Expo Router, NativeWind, EAS Build
|
||||
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@
|
|||
"@mana/help": "workspace:*",
|
||||
"@mana/local-llm": "workspace:*",
|
||||
"@mana/local-store": "workspace:*",
|
||||
"@mana/local-stt": "workspace:^",
|
||||
"@mana/qr-export": "workspace:*",
|
||||
"@mana/shared-auth": "workspace:*",
|
||||
"@mana/shared-auth-ui": "workspace:*",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,288 @@
|
|||
/**
|
||||
* useLocalStt() — Svelte 5 composable that wires microphone capture
|
||||
* directly into @mana/local-stt for fully on-device speech-to-text.
|
||||
*
|
||||
* Usage:
|
||||
* const stt = useLocalStt();
|
||||
* // stt.state — 'idle' | 'loading' | 'recording' | 'transcribing'
|
||||
 * // stt.text — final transcript (set once when transcription completes; stt.partial accumulates streamed chunks)
|
||||
* // stt.error — error message or null
|
||||
* // stt.modelStatus — LoadingStatus from local-stt
|
||||
* // stt.toggle() — start recording or stop + transcribe
|
||||
* // stt.cancel() — abort recording without transcribing
|
||||
*
|
||||
* Audio pipeline:
|
||||
* getUserMedia (native sample rate)
|
||||
* → AudioContext + ScriptProcessor → collect Float32 chunks
|
||||
* → on stop: merge + resample to 16 kHz mono
|
||||
* → feed into local-stt transcribe()
|
||||
*
|
||||
* The model is loaded lazily on first toggle(). Subsequent calls skip
|
||||
* the download. The model stays loaded for the session (same as local-llm).
|
||||
*/
|
||||
|
||||
import { getLocalSttStatus, loadLocalStt, transcribe, isLocalSttSupported } from '@mana/local-stt';
|
||||
import type { LoadingStatus } from '@mana/local-stt';
|
||||
|
||||
export type SttState = 'idle' | 'loading' | 'recording' | 'transcribing';
|
||||
|
||||
export interface LocalSttHandle {
	/** Pipeline phase: `'idle'`, `'loading'`, `'recording'` or `'transcribing'`. */
	readonly state: SttState;

	/**
	 * Final transcript of the most recent recording. Cleared when a new
	 * recording starts and assigned once transcription has finished.
	 */
	readonly text: string;

	/**
	 * Streaming transcript: every chunk reported during transcription is
	 * appended here. Cleared when a new recording starts.
	 */
	readonly partial: string;

	/** Message of the last failure, or `null` when there is none. */
	readonly error: string | null;

	/** Current model loading status as reported by `@mana/local-stt`. */
	readonly modelStatus: LoadingStatus;

	/** Milliseconds elapsed since capture began; reset to 0 when capture is torn down. */
	readonly elapsedMs: number;

	/** Result of `isLocalSttSupported()`, evaluated once when the handle is created. */
	readonly isSupported: boolean;

	/**
	 * From `'idle'`: start a recording (loading the model first if it is not
	 * ready). From `'recording'`: stop and transcribe. Ignored while
	 * `'loading'` or `'transcribing'`.
	 */
	toggle: () => void;

	/** Tear down audio capture and return to `'idle'` without transcribing. */
	cancel: () => void;
}
|
||||
|
||||
/**
 * Creates a controller that records from the microphone and transcribes
 * the recording on-device via `@mana/local-stt`.
 *
 * Every start/stop/cancel bumps an internal session counter. Async
 * continuations (model load, permission prompt, transcription) compare
 * their captured session against the current one and bail out when they
 * have been superseded, so a late-resolving promise can never open the
 * microphone or overwrite state after the user has cancelled or moved on.
 *
 * @param options.language Optional language hint passed through to `transcribe()`.
 * @returns Reactive handle exposing state getters plus `toggle()` / `cancel()`.
 */
export function useLocalStt(options?: { language?: string }): LocalSttHandle {
	let state = $state<SttState>('idle');
	let text = $state('');
	let partial = $state('');
	let error = $state<string | null>(null);
	let elapsedMs = $state(0);

	const modelStatus = getLocalSttStatus();
	const supported = isLocalSttSupported();

	// Audio capture state (not reactive — internal only)
	let stream: MediaStream | null = null;
	let audioContext: AudioContext | null = null;
	let sourceNode: MediaStreamAudioSourceNode | null = null;
	// ScriptProcessorNode is deprecated but universally supported and
	// simpler than AudioWorklet for our use case (we just collect raw
	// samples, no real-time processing). AudioWorklet requires a
	// separate module URL which complicates bundling.
	let scriptNode: ScriptProcessorNode | null = null;
	let chunks: Float32Array[] = [];
	let sampleRate = 0;
	let tickHandle: ReturnType<typeof setInterval> | null = null;
	let startedAt = 0;

	// Incremented by every start, stop and cancel. An async continuation
	// whose captured value no longer matches has been superseded.
	let session = 0;

	/** Releases timer, audio graph and microphone, and resets capture buffers. */
	function cleanup() {
		if (tickHandle !== null) {
			clearInterval(tickHandle);
			tickHandle = null;
		}
		if (scriptNode) {
			// Detach the handler so no further chunks are collected.
			scriptNode.onaudioprocess = null;
			scriptNode.disconnect();
			scriptNode = null;
		}
		sourceNode?.disconnect();
		sourceNode = null;
		stream?.getTracks().forEach((t) => t.stop());
		stream = null;
		if (audioContext && audioContext.state !== 'closed') {
			audioContext.close().catch(() => {});
		}
		audioContext = null;
		chunks = [];
		sampleRate = 0;
		elapsedMs = 0;
	}

	/** Loads the model if needed, acquires the microphone and starts collecting PCM. */
	async function startRecording(): Promise<void> {
		const mySession = ++session;
		const isStale = () => mySession !== session;

		error = null;
		text = '';
		partial = '';

		// Ensure model is loaded first
		if (modelStatus.current.state !== 'ready') {
			state = 'loading';
			try {
				await loadLocalStt();
			} catch (e) {
				if (isStale()) return;
				error = e instanceof Error ? e.message : String(e);
				state = 'idle';
				return;
			}
			// cancel() was called while the model was loading — do not open the mic.
			if (isStale()) return;
		}

		// Get microphone access
		state = 'recording';
		let micStream: MediaStream;
		try {
			micStream = await navigator.mediaDevices.getUserMedia({
				audio: {
					echoCancellation: true,
					noiseSuppression: true,
					autoGainControl: true,
				},
			});
		} catch (e) {
			if (isStale()) return;
			error = explainMicError(e);
			state = 'idle';
			return;
		}

		// The user stopped or cancelled while the permission prompt was open:
		// release the freshly granted stream instead of recording invisibly.
		if (isStale()) {
			micStream.getTracks().forEach((t) => t.stop());
			return;
		}
		stream = micStream;

		// Set up AudioContext to capture raw PCM
		try {
			audioContext = new AudioContext();
			sampleRate = audioContext.sampleRate;
			sourceNode = audioContext.createMediaStreamSource(micStream);

			// Buffer size 4096 is a good balance between latency and overhead
			scriptNode = audioContext.createScriptProcessor(4096, 1, 1);
			scriptNode.onaudioprocess = (e) => {
				const input = e.inputBuffer.getChannelData(0);
				// Copy — the buffer is reused by the browser
				chunks.push(new Float32Array(input));
			};
			sourceNode.connect(scriptNode);
			scriptNode.connect(audioContext.destination);
		} catch (e) {
			// Audio graph setup failed: release the mic and surface the error
			// instead of staying stuck in 'recording'.
			cleanup();
			error = explainMicError(e);
			state = 'idle';
			return;
		}

		startedAt = Date.now();
		tickHandle = setInterval(() => {
			elapsedMs = Date.now() - startedAt;
		}, 100);
	}

	/** Stops capture, merges and resamples the audio, and runs transcription. */
	async function stopAndTranscribe(): Promise<void> {
		if (state !== 'recording') return;

		// Supersede any startRecording() still waiting on the permission prompt.
		const mySession = ++session;
		const isStale = () => mySession !== session;

		// Stop recording
		const capturedChunks = [...chunks];
		const capturedRate = sampleRate;
		cleanup();

		console.log(
			'[local-stt] Captured',
			capturedChunks.length,
			'chunks, sample rate:',
			capturedRate
		);

		if (capturedChunks.length === 0) {
			error = 'Keine Audiodaten aufgenommen.';
			console.warn('[local-stt] No audio chunks captured');
			state = 'idle';
			return;
		}

		state = 'transcribing';

		try {
			// Merge chunks into one Float32Array
			const totalLength = capturedChunks.reduce((sum, c) => sum + c.length, 0);
			const merged = new Float32Array(totalLength);
			let offset = 0;
			for (const chunk of capturedChunks) {
				merged.set(chunk, offset);
				offset += chunk.length;
			}

			// Resample to 16 kHz if needed
			const audio = capturedRate === 16000 ? merged : resample(merged, capturedRate, 16000);

			const durationSec = audio.length / 16000;
			// Peak of the first second only; folded in a loop rather than
			// spreading thousands of arguments into Math.max.
			let maxAmplitude = -Infinity;
			for (const sample of audio.subarray(0, 16000)) {
				maxAmplitude = Math.max(maxAmplitude, Math.abs(sample));
			}
			console.log('[local-stt] Audio ready:', {
				originalSamples: merged.length,
				resampledSamples: audio.length,
				durationSec: durationSec.toFixed(1),
				sampleRate: capturedRate,
				maxAmplitude,
			});

			const result = await transcribe({
				audio,
				language: options?.language,
				onChunk: (t: string) => {
					if (isStale()) return;
					partial += t;
					console.log('[local-stt] Chunk:', t);
				},
			});

			// Cancelled or superseded while transcribing — drop the result.
			if (isStale()) return;

			console.log('[local-stt] Result:', result);
			text = result.text.trim();
		} catch (e) {
			console.error('[local-stt] Transcription error:', e);
			if (isStale()) return;
			error = e instanceof Error ? e.message : String(e);
		}

		state = 'idle';
	}

	function toggle() {
		if (state === 'idle') {
			void startRecording();
		} else if (state === 'recording') {
			void stopAndTranscribe();
		}
		// If loading or transcribing, ignore
	}

	function cancel() {
		// Invalidate any pending load / permission prompt / transcription.
		session++;
		cleanup();
		state = 'idle';
	}

	return {
		get state() {
			return state;
		},
		get text() {
			return text;
		},
		get partial() {
			return partial;
		},
		get error() {
			return error;
		},
		get modelStatus() {
			return modelStatus.current;
		},
		get elapsedMs() {
			return elapsedMs;
		},
		get isSupported() {
			return supported;
		},
		toggle,
		cancel,
	};
}
|
||||
|
||||
// ─── Helpers ────────────────────────────────────────────────
|
||||
|
||||
/**
 * Converts `input` from `fromRate` to `toRate` by linear interpolation
 * between the two nearest source samples. No low-pass filtering is
 * applied; this is considered sufficient for speech input.
 *
 * @param input Source samples.
 * @param fromRate Sample rate of `input` in Hz.
 * @param toRate Desired sample rate in Hz.
 * @returns `input` itself when both rates are equal, otherwise a new array
 *          of `Math.round(input.length * toRate / fromRate)` samples.
 */
function resample(input: Float32Array, fromRate: number, toRate: number): Float32Array {
	if (fromRate !== toRate) {
		// Distance travelled in the source per output sample.
		const step = fromRate / toRate;
		const resampled = new Float32Array(Math.round(input.length / step));
		const lastSource = input.length - 1;

		let outIndex = 0;
		while (outIndex < resampled.length) {
			const position = outIndex * step;
			const left = Math.floor(position);
			// Clamp the right neighbour so the final sample interpolates with itself.
			const right = left + 1 > lastSource ? lastSource : left + 1;
			const weight = position - left;
			resampled[outIndex] = input[left] * (1 - weight) + input[right] * weight;
			outIndex += 1;
		}
		return resampled;
	}
	return input;
}
|
||||
|
||||
/**
 * Maps a microphone-related failure to a user-facing (German) message.
 *
 * Known causes are recognised either by the error's `name` or by a
 * keyword in its message; the first matching cause wins. Anything else
 * falls back to a generic message containing the message text, the error
 * name, or `'Unbekannt'`, in that order of preference.
 *
 * @param e The caught value; non-`Error` values are stringified.
 * @returns The message to display.
 */
function explainMicError(e: unknown): string {
	const failure = e instanceof Error ? e : new Error(String(e));
	const kind = failure.name || '';
	const detail = failure.message || '';

	const knownCauses: ReadonlyArray<{ name: string; pattern: RegExp; text: string }> = [
		{
			name: 'NotAllowedError',
			pattern: /denied|permission/i,
			text: 'Mikrofon-Zugriff verweigert. Erlaube den Zugriff in deinen Browser-Einstellungen.',
		},
		{
			name: 'NotFoundError',
			pattern: /not.?found|no.?device/i,
			text: 'Kein Mikrofon gefunden.',
		},
		{
			name: 'NotReadableError',
			pattern: /in use|busy/i,
			text: 'Mikrofon ist gerade belegt.',
		},
	];

	const match = knownCauses.find((cause) => kind === cause.name || cause.pattern.test(detail));
	if (match) {
		return match.text;
	}
	return `Mikrofon-Fehler: ${detail || kind || 'Unbekannt'}`;
}
|
||||
|
|
@ -210,14 +210,17 @@
|
|||
let aiTierItems = $derived<PillDropdownItem[]>([
|
||||
// Tier toggles — browser tier item and its model-status buddy share a
|
||||
// group so PillDropdownBar renders them as a paired pill.
|
||||
...TIER_TOGGLE_LIST.filter((t) => t.tier !== 'browser' || webgpuSupported).map((t) => ({
|
||||
id: `ai-tier-${t.tier}`,
|
||||
label: t.shortLabel,
|
||||
icon: t.icon,
|
||||
active: llmSettings.allowedTiers.includes(t.tier),
|
||||
onClick: () => toggleAiTier(t.tier),
|
||||
...(t.tier === 'browser' ? { group: 'local-llm' } : {}),
|
||||
})),
|
||||
...TIER_TOGGLE_LIST.filter((t) => t.tier !== 'browser' || webgpuSupported).map((t) => {
|
||||
const isActive = llmSettings.allowedTiers.includes(t.tier);
|
||||
return {
|
||||
id: `ai-tier-${t.tier}`,
|
||||
label: t.shortLabel,
|
||||
icon: isActive ? 'checkCircle' : t.icon,
|
||||
active: isActive,
|
||||
onClick: () => toggleAiTier(t.tier),
|
||||
...(t.tier === 'browser' ? { group: 'local-llm' } : {}),
|
||||
};
|
||||
}),
|
||||
// Browser model status / load button (grouped with the "Lokal" toggle).
|
||||
// Handles all LoadingStatus states so the user sees feedback during
|
||||
// download, initialization, and on error (e.g. worker crash).
|
||||
|
|
@ -234,7 +237,7 @@
|
|||
switch (state) {
|
||||
case 'ready':
|
||||
label = 'Geladen';
|
||||
icon = 'check';
|
||||
icon = 'checkCircle';
|
||||
disabled = true;
|
||||
break;
|
||||
case 'downloading':
|
||||
|
|
@ -280,16 +283,19 @@
|
|||
// STT model selector — each model is a pill, active = currently selected
|
||||
...(sttSupported
|
||||
? (Object.entries(STT_MODELS) as [SttModelKey, (typeof STT_MODELS)[SttModelKey]][]).map(
|
||||
([key, model]) => ({
|
||||
id: `stt-model-${key}`,
|
||||
label: model.displayName,
|
||||
icon: 'mic' as const,
|
||||
active: selectedSttModel === key,
|
||||
onClick: () => {
|
||||
selectedSttModel = key;
|
||||
void loadLocalStt(key);
|
||||
},
|
||||
})
|
||||
([key, model]) => {
|
||||
const isSelected = selectedSttModel === key;
|
||||
return {
|
||||
id: `stt-model-${key}`,
|
||||
label: model.displayName,
|
||||
icon: isSelected ? 'checkCircle' : 'mic',
|
||||
active: isSelected,
|
||||
onClick: () => {
|
||||
selectedSttModel = key;
|
||||
void loadLocalStt(key);
|
||||
},
|
||||
};
|
||||
}
|
||||
)
|
||||
: []),
|
||||
// STT model status (grouped with selected model)
|
||||
|
|
@ -306,7 +312,7 @@
|
|||
switch (state) {
|
||||
case 'ready':
|
||||
label = 'STT bereit';
|
||||
icon = 'check';
|
||||
icon = 'checkCircle';
|
||||
disabled = true;
|
||||
break;
|
||||
case 'downloading':
|
||||
|
|
|
|||
|
|
@ -54,6 +54,14 @@ export default defineConfig({
|
|||
server: {
|
||||
port: 5173,
|
||||
strictPort: true,
|
||||
fs: {
|
||||
// Allow serving files from the monorepo root so that workspace
|
||||
// packages (e.g. @mana/local-llm's Web Worker entry) can be
|
||||
// resolved by Vite's dev server. Without this, worker.ts in
|
||||
// packages/local-llm triggers "request url is outside of Vite
|
||||
// serving allow list".
|
||||
allow: ['../../../..'],
|
||||
},
|
||||
},
|
||||
preview: {
|
||||
port: 4173,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue