mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 18:41:08 +02:00
feat(profile): voice interview with pre-rendered TTS audio + Orpheus/Zonos backends
Voice-based interview for the profile module — users choose between text, voice (question read aloud + mic for answer), or conversation mode (fully automatic flow with auto-save).

Interview audio:
- 92 pre-rendered MP3 files (23 questions × 4 voices) via Edge TTS
- Voices: Seraphina (DE-f), Florian (DE-m), Leni (CH-f), Jan (CH-m)
- User picks voice via dropdown, persisted in localStorage
- Web Speech API fallback for missing audio files

Profile UI:
- Interview hero block on overview with 3 start modes (text/voice/conversation)
- Voice/conversation toggle + voice picker in interview view
- Mic button on text/textarea/tags inputs for per-question voice input
- Conversation mode: auto-save + auto-advance after STT transcription
- Recording/transcribing/speaking state indicators

mana-tts service:
- New Orpheus TTS backend (German finetune, SNAC codec)
- New Zonos TTS backend (Zyphra, 200k hours, emotion control)
- Endpoints: POST /synthesize/orpheus, POST /synthesize/zonos
- espeak-ng installed on GPU server for Zonos phonemizer
- Compare script for side-by-side voice quality testing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
786ffd771b
commit
8823cc0bf0
101 changed files with 1597 additions and 31 deletions
|
|
@@ -2,7 +2,7 @@
|
|||
Context Interview — Guided question flow that populates userContext.
|
||||
-->
|
||||
<script lang="ts">
|
||||
import { onMount } from 'svelte';
|
||||
import { onMount, onDestroy } from 'svelte';
|
||||
import { useUserContext } from './queries';
|
||||
import { userContextStore } from './stores/user-context.svelte';
|
||||
import {
|
||||
|
|
@@ -11,14 +11,19 @@
|
|||
getProgress,
|
||||
type ContextCategory,
|
||||
type ContextQuestion,
|
||||
type QuestionInputType,
|
||||
} from './questions';
|
||||
import { useInterviewTts, VOICES } from './use-interview-tts.svelte';
|
||||
import { useLocalStt } from '$lib/components/voice/use-local-stt.svelte';
|
||||
|
||||
interface Props {
|
||||
limitCategories?: ContextCategory[];
|
||||
compact?: boolean;
|
||||
/** If set, auto-start this voice level on mount. */
|
||||
initialVoiceLevel?: 'voice' | 'conversation';
|
||||
}
|
||||
|
||||
let { limitCategories, compact = false }: Props = $props();
|
||||
let { limitCategories, compact = false, initialVoiceLevel }: Props = $props();
|
||||
|
||||
let ctx$ = useUserContext();
|
||||
let ctx = $derived(ctx$.value);
|
||||
|
|
@@ -29,8 +34,27 @@
|
|||
let saving = $state(false);
|
||||
let tagInput = $state('');
|
||||
|
||||
// ── Voice mode ──────────────────────────────────────
|
||||
// 'off' = text only, 'voice' = TTS+STT per question, 'conversation' = auto-save + auto-advance
|
||||
type VoiceLevel = 'off' | 'voice' | 'conversation';
|
||||
const tts = useInterviewTts();
|
||||
const stt = useLocalStt({ language: 'de' });
|
||||
let voiceLevel = $state<VoiceLevel>('off');
|
||||
let voiceMode = $derived(voiceLevel !== 'off');
|
||||
let conversationMode = $derived(voiceLevel === 'conversation');
|
||||
let voiceFlowActive = $state(false);
|
||||
const VOICE_INPUT_TYPES: QuestionInputType[] = ['text', 'textarea', 'tags'];
|
||||
|
||||
onMount(() => {
|
||||
void userContextStore.ensureDoc();
|
||||
if (initialVoiceLevel) {
|
||||
voiceLevel = initialVoiceLevel;
|
||||
}
|
||||
});
|
||||
|
||||
onDestroy(() => {
|
||||
tts.stop();
|
||||
if (stt.state === 'recording') stt.cancel();
|
||||
});
|
||||
|
||||
let categories = $derived(
|
||||
|
|
@@ -40,6 +64,9 @@
|
|||
let currentQuestion = $derived(
|
||||
categoryQuestions[currentQuestionIdx] as ContextQuestion | undefined
|
||||
);
|
||||
let currentSupportsVoice = $derived(
|
||||
currentQuestion ? VOICE_INPUT_TYPES.includes(currentQuestion.inputType) : false
|
||||
);
|
||||
let progress = $derived(getProgress(ctx?.interview?.answeredIds ?? []));
|
||||
let answeredSet = $derived(new Set(ctx?.interview?.answeredIds ?? []));
|
||||
let categoryProgress = $derived.by(() => {
|
||||
|
|
@@ -71,10 +98,85 @@
|
|||
}
|
||||
|
||||
function selectCategory(key: ContextCategory) {
|
||||
cancelVoiceFlow();
|
||||
activeCategory = key;
|
||||
currentQuestionIdx = 0;
|
||||
}
|
||||
|
||||
// ── Voice flow: TTS → STT → fill input ──────────────
|
||||
async function runVoiceFlow() {
|
||||
if (!currentQuestion || !currentSupportsVoice) return;
|
||||
voiceFlowActive = true;
|
||||
|
||||
// Step 1: Play pre-rendered question audio (falls back to Web Speech API)
|
||||
await tts.speak(currentQuestion.id, currentQuestion.question);
|
||||
|
||||
// Step 2: Start mic recording (STT)
|
||||
if (!voiceFlowActive) return; // cancelled during TTS
|
||||
stt.toggle(); // starts recording
|
||||
}
|
||||
|
||||
// Watch STT text — when transcription completes, fill the input.
|
||||
// In conversation mode: auto-save + auto-advance to next question.
|
||||
$effect(() => {
|
||||
if (stt.state === 'idle' && stt.text && voiceFlowActive) {
|
||||
applyVoiceTranscript(stt.text);
|
||||
voiceFlowActive = false;
|
||||
if (conversationMode) {
|
||||
// Auto-save and advance after a brief pause so the user sees the transcript
|
||||
setTimeout(() => handleAnswer(), 600);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Auto-start voice flow when question changes in voice mode.
|
||||
// Track only the question id to avoid re-triggering when ctx data updates.
|
||||
let prevVoiceQuestionId = $state('');
|
||||
$effect(() => {
|
||||
const qid = currentQuestion?.id ?? '';
|
||||
const shouldRun = voiceMode && currentSupportsVoice && qid && qid !== prevVoiceQuestionId;
|
||||
if (shouldRun) {
|
||||
prevVoiceQuestionId = qid;
|
||||
const timeout = setTimeout(() => runVoiceFlow(), 300);
|
||||
return () => clearTimeout(timeout);
|
||||
}
|
||||
});
|
||||
|
||||
function applyVoiceTranscript(transcript: string) {
|
||||
if (!currentQuestion) return;
|
||||
if (currentQuestion.inputType === 'tags') {
|
||||
// Split transcript into tags by comma, "und", or line breaks
|
||||
const parts = transcript
|
||||
.split(/[,\n]|\bund\b/i)
|
||||
.map((s) => s.trim())
|
||||
.filter(Boolean);
|
||||
const current = Array.isArray(inputValue) ? (inputValue as string[]) : [];
|
||||
const merged = [...current];
|
||||
for (const part of parts) {
|
||||
if (!merged.includes(part)) merged.push(part);
|
||||
}
|
||||
inputValue = merged;
|
||||
} else {
|
||||
// text / textarea — replace content
|
||||
inputValue = transcript;
|
||||
}
|
||||
}
|
||||
|
||||
function toggleMicForCurrentQuestion() {
|
||||
if (stt.state === 'recording') {
|
||||
stt.toggle(); // stop → transcribe
|
||||
} else if (stt.state === 'idle') {
|
||||
voiceFlowActive = true;
|
||||
stt.toggle(); // start recording
|
||||
}
|
||||
}
|
||||
|
||||
function cancelVoiceFlow() {
|
||||
voiceFlowActive = false;
|
||||
tts.stop();
|
||||
if (stt.state === 'recording') stt.cancel();
|
||||
}
|
||||
|
||||
async function handleAnswer() {
|
||||
if (!currentQuestion) return;
|
||||
saving = true;
|
||||
|
|
@@ -94,6 +196,7 @@
|
|||
}
|
||||
|
||||
function advanceQuestion() {
|
||||
cancelVoiceFlow();
|
||||
if (currentQuestionIdx < categoryQuestions.length - 1) {
|
||||
currentQuestionIdx++;
|
||||
} else {
|
||||
|
|
@@ -150,7 +253,73 @@
|
|||
<div class="progress-bar">
|
||||
<div class="progress-fill" style:width="{progress.percent}%"></div>
|
||||
</div>
|
||||
<p class="progress-text">{progress.answered} von {progress.total} Fragen beantwortet</p>
|
||||
<div class="progress-row">
|
||||
<p class="progress-text">{progress.answered} von {progress.total} Fragen beantwortet</p>
|
||||
{#if tts.isSupported}
|
||||
<div class="voice-controls">
|
||||
<div class="voice-toggles">
|
||||
<button
|
||||
class="voice-toggle"
|
||||
class:active={voiceLevel === 'voice'}
|
||||
onclick={() => {
|
||||
voiceLevel = voiceLevel === 'voice' ? 'off' : 'voice';
|
||||
if (voiceLevel === 'off') cancelVoiceFlow();
|
||||
}}
|
||||
title="Voice-Modus: Fragen werden vorgelesen, Antworten per Sprache"
|
||||
>
|
||||
<svg
|
||||
width="14"
|
||||
height="14"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
</svg>
|
||||
<span>Voice</span>
|
||||
</button>
|
||||
<button
|
||||
class="voice-toggle"
|
||||
class:active={voiceLevel === 'conversation'}
|
||||
onclick={() => {
|
||||
voiceLevel = voiceLevel === 'conversation' ? 'off' : 'conversation';
|
||||
if (voiceLevel === 'off') cancelVoiceFlow();
|
||||
}}
|
||||
title="Gesprächs-Modus: Fließendes Interview — Antworten werden automatisch gespeichert"
|
||||
>
|
||||
<svg
|
||||
width="14"
|
||||
height="14"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path d="M21 15a2 2 0 0 1-2 2H7l-4 4V5a2 2 0 0 1 2-2h14a2 2 0 0 1 2 2z"></path>
|
||||
</svg>
|
||||
<span>Gespräch</span>
|
||||
</button>
|
||||
</div>
|
||||
{#if voiceMode}
|
||||
<select
|
||||
class="voice-picker"
|
||||
value={tts.voice}
|
||||
onchange={(e) => tts.setVoice(e.currentTarget.value as any)}
|
||||
>
|
||||
{#each VOICES as v (v.key)}
|
||||
<option value={v.key}>{v.label}</option>
|
||||
{/each}
|
||||
</select>
|
||||
{/if}
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
<div class="categories">
|
||||
|
|
@@ -167,29 +336,133 @@
|
|||
{/each}
|
||||
</div>
|
||||
|
||||
{#if conversationMode}
|
||||
<div class="conversation-banner">
|
||||
<span>Gesprächs-Modus aktiv — Antworten werden automatisch gespeichert</span>
|
||||
<button
|
||||
class="banner-stop"
|
||||
onclick={() => {
|
||||
voiceLevel = 'off';
|
||||
cancelVoiceFlow();
|
||||
}}>Beenden</button
|
||||
>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
{#if currentQuestion}
|
||||
<div class="question-card">
|
||||
<h3 class="question-text">{currentQuestion.question}</h3>
|
||||
<div class="question-header">
|
||||
<h3 class="question-text">{currentQuestion.question}</h3>
|
||||
{#if tts.speaking}
|
||||
<span class="voice-indicator speaking" title="Liest vor...">
|
||||
<svg
|
||||
width="18"
|
||||
height="18"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5"></polygon>
|
||||
<path d="M15.54 8.46a5 5 0 0 1 0 7.07"></path>
|
||||
</svg>
|
||||
</span>
|
||||
{/if}
|
||||
</div>
|
||||
{#if currentQuestion.hint}<p class="question-hint">{currentQuestion.hint}</p>{/if}
|
||||
|
||||
{#if stt.state === 'recording'}
|
||||
<div class="voice-status recording">
|
||||
<span class="rec-dot"></span>
|
||||
Aufnahme läuft... ({Math.floor(stt.elapsedMs / 1000)}s)
|
||||
<button class="voice-stop-btn" onclick={() => stt.toggle()}>Stopp</button>
|
||||
</div>
|
||||
{:else if stt.state === 'transcribing'}
|
||||
<div class="voice-status transcribing">
|
||||
<span class="spinner-small"></span>
|
||||
Transkribiere...
|
||||
</div>
|
||||
{:else if stt.state === 'loading'}
|
||||
<div class="voice-status loading">
|
||||
<span class="spinner-small"></span>
|
||||
Lade Sprachmodell...
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
<div class="input-area">
|
||||
{#if currentQuestion.inputType === 'text'}
|
||||
<input
|
||||
type="text"
|
||||
class="text-input"
|
||||
bind:value={inputValue}
|
||||
placeholder={currentQuestion.hint ?? ''}
|
||||
disabled={saving}
|
||||
onkeydown={(e) => e.key === 'Enter' && handleAnswer()}
|
||||
/>
|
||||
<div class="input-with-mic">
|
||||
<input
|
||||
type="text"
|
||||
class="text-input"
|
||||
bind:value={inputValue}
|
||||
placeholder={currentQuestion.hint ?? ''}
|
||||
disabled={saving}
|
||||
onkeydown={(e) => e.key === 'Enter' && handleAnswer()}
|
||||
/>
|
||||
{#if stt.isSupported}
|
||||
<button
|
||||
class="mic-btn"
|
||||
class:recording={stt.state === 'recording'}
|
||||
onclick={toggleMicForCurrentQuestion}
|
||||
disabled={saving || stt.state === 'transcribing' || stt.state === 'loading'}
|
||||
title={stt.state === 'recording' ? 'Aufnahme stoppen' : 'Per Sprache antworten'}
|
||||
>
|
||||
<svg
|
||||
width="16"
|
||||
height="16"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
<line x1="12" y1="19" x2="12" y2="23"></line>
|
||||
<line x1="8" y1="23" x2="16" y2="23"></line>
|
||||
</svg>
|
||||
</button>
|
||||
{/if}
|
||||
</div>
|
||||
{:else if currentQuestion.inputType === 'textarea'}
|
||||
<textarea
|
||||
class="textarea-input"
|
||||
bind:value={inputValue}
|
||||
placeholder={currentQuestion.hint ?? ''}
|
||||
disabled={saving}
|
||||
rows="3"
|
||||
></textarea>
|
||||
<div class="textarea-with-mic">
|
||||
<textarea
|
||||
class="textarea-input"
|
||||
bind:value={inputValue}
|
||||
placeholder={currentQuestion.hint ?? ''}
|
||||
disabled={saving}
|
||||
rows="3"
|
||||
></textarea>
|
||||
{#if stt.isSupported}
|
||||
<button
|
||||
class="mic-btn textarea-mic"
|
||||
class:recording={stt.state === 'recording'}
|
||||
onclick={toggleMicForCurrentQuestion}
|
||||
disabled={saving || stt.state === 'transcribing' || stt.state === 'loading'}
|
||||
title={stt.state === 'recording' ? 'Aufnahme stoppen' : 'Per Sprache antworten'}
|
||||
>
|
||||
<svg
|
||||
width="16"
|
||||
height="16"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
<line x1="12" y1="19" x2="12" y2="23"></line>
|
||||
<line x1="8" y1="23" x2="16" y2="23"></line>
|
||||
</svg>
|
||||
</button>
|
||||
{/if}
|
||||
</div>
|
||||
{:else if currentQuestion.inputType === 'time'}
|
||||
<input type="time" class="time-input" bind:value={inputValue} disabled={saving} />
|
||||
{:else if currentQuestion.inputType === 'choice'}
|
||||
|
|
@@ -214,15 +487,42 @@
|
|||
>{/each}
|
||||
</div>
|
||||
{/if}
|
||||
<input
|
||||
type="text"
|
||||
class="text-input"
|
||||
bind:value={tagInput}
|
||||
placeholder={currentQuestion.hint ?? 'Eingabe + Enter'}
|
||||
disabled={saving}
|
||||
onkeydown={handleTagKeydown}
|
||||
onblur={addTag}
|
||||
/>
|
||||
<div class="input-with-mic">
|
||||
<input
|
||||
type="text"
|
||||
class="text-input"
|
||||
bind:value={tagInput}
|
||||
placeholder={currentQuestion.hint ?? 'Eingabe + Enter'}
|
||||
disabled={saving}
|
||||
onkeydown={handleTagKeydown}
|
||||
onblur={addTag}
|
||||
/>
|
||||
{#if stt.isSupported}
|
||||
<button
|
||||
class="mic-btn"
|
||||
class:recording={stt.state === 'recording'}
|
||||
onclick={toggleMicForCurrentQuestion}
|
||||
disabled={saving || stt.state === 'transcribing' || stt.state === 'loading'}
|
||||
title={stt.state === 'recording' ? 'Aufnahme stoppen' : 'Per Sprache antworten'}
|
||||
>
|
||||
<svg
|
||||
width="16"
|
||||
height="16"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="2"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
<line x1="12" y1="19" x2="12" y2="23"></line>
|
||||
<line x1="8" y1="23" x2="16" y2="23"></line>
|
||||
</svg>
|
||||
</button>
|
||||
{/if}
|
||||
</div>
|
||||
</div>
|
||||
{:else if currentQuestion.inputType === 'weekdays'}
|
||||
<div class="weekdays">
|
||||
|
|
@@ -564,4 +864,217 @@
|
|||
border: none;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
/* ── Voice mode ────────────────────────────── */
|
||||
.progress-row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
.voice-controls {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.375rem;
|
||||
}
|
||||
.voice-toggles {
|
||||
display: flex;
|
||||
gap: 0.25rem;
|
||||
}
|
||||
.voice-picker {
|
||||
padding: 0.25rem 0.5rem;
|
||||
border: 1px solid hsl(var(--color-border));
|
||||
border-radius: 999px;
|
||||
background: transparent;
|
||||
color: hsl(var(--color-foreground));
|
||||
font-size: 0.6875rem;
|
||||
outline: none;
|
||||
cursor: pointer;
|
||||
}
|
||||
.voice-picker:focus {
|
||||
border-color: hsl(var(--color-primary));
|
||||
}
|
||||
.voice-toggle {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 0.375rem;
|
||||
padding: 0.25rem 0.625rem;
|
||||
border: 1px solid hsl(var(--color-border));
|
||||
border-radius: 999px;
|
||||
background: transparent;
|
||||
color: hsl(var(--color-muted-foreground));
|
||||
font-size: 0.6875rem;
|
||||
cursor: pointer;
|
||||
transition:
|
||||
background 0.15s,
|
||||
border-color 0.15s,
|
||||
color 0.15s;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.voice-toggle:hover {
|
||||
background: hsl(var(--color-surface-hover));
|
||||
}
|
||||
.voice-toggle.active {
|
||||
background: hsl(var(--color-primary) / 0.1);
|
||||
border-color: hsl(var(--color-primary));
|
||||
color: hsl(var(--color-primary));
|
||||
}
|
||||
.question-header {
|
||||
display: flex;
|
||||
align-items: flex-start;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
.question-header .question-text {
|
||||
flex: 1;
|
||||
}
|
||||
.voice-indicator {
|
||||
flex-shrink: 0;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
color: hsl(var(--color-primary));
|
||||
}
|
||||
.voice-indicator.speaking {
|
||||
animation: pulse-voice 1s ease-in-out infinite;
|
||||
}
|
||||
@keyframes pulse-voice {
|
||||
0%,
|
||||
100% {
|
||||
opacity: 1;
|
||||
}
|
||||
50% {
|
||||
opacity: 0.4;
|
||||
}
|
||||
}
|
||||
.voice-status {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
padding: 0.5rem 0.75rem;
|
||||
border-radius: 0.5rem;
|
||||
font-size: 0.8125rem;
|
||||
}
|
||||
.voice-status.recording {
|
||||
background: hsl(0 70% 50% / 0.08);
|
||||
color: hsl(0 70% 45%);
|
||||
}
|
||||
.voice-status.transcribing,
|
||||
.voice-status.loading {
|
||||
background: hsl(var(--color-primary) / 0.08);
|
||||
color: hsl(var(--color-primary));
|
||||
}
|
||||
.rec-dot {
|
||||
width: 0.5rem;
|
||||
height: 0.5rem;
|
||||
border-radius: 50%;
|
||||
background: hsl(0 70% 50%);
|
||||
animation: pulse-rec 1s ease-in-out infinite;
|
||||
}
|
||||
@keyframes pulse-rec {
|
||||
0%,
|
||||
100% {
|
||||
opacity: 1;
|
||||
transform: scale(1);
|
||||
}
|
||||
50% {
|
||||
opacity: 0.5;
|
||||
transform: scale(1.3);
|
||||
}
|
||||
}
|
||||
.voice-stop-btn {
|
||||
margin-left: auto;
|
||||
padding: 0.25rem 0.625rem;
|
||||
border: 1px solid currentColor;
|
||||
border-radius: 999px;
|
||||
background: transparent;
|
||||
color: inherit;
|
||||
font-size: 0.75rem;
|
||||
cursor: pointer;
|
||||
}
|
||||
.spinner-small {
|
||||
width: 0.875rem;
|
||||
height: 0.875rem;
|
||||
border: 2px solid currentColor;
|
||||
border-top-color: transparent;
|
||||
border-radius: 50%;
|
||||
animation: spin 0.6s linear infinite;
|
||||
}
|
||||
@keyframes spin {
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
.input-with-mic {
|
||||
display: flex;
|
||||
gap: 0.375rem;
|
||||
align-items: center;
|
||||
}
|
||||
.input-with-mic .text-input {
|
||||
flex: 1;
|
||||
}
|
||||
.textarea-with-mic {
|
||||
position: relative;
|
||||
}
|
||||
.textarea-with-mic .textarea-input {
|
||||
width: 100%;
|
||||
}
|
||||
.mic-btn {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
width: 2.25rem;
|
||||
height: 2.25rem;
|
||||
border: 1px solid hsl(var(--color-border));
|
||||
border-radius: 0.5rem;
|
||||
background: transparent;
|
||||
color: hsl(var(--color-muted-foreground));
|
||||
cursor: pointer;
|
||||
flex-shrink: 0;
|
||||
transition:
|
||||
background 0.15s,
|
||||
border-color 0.15s,
|
||||
color 0.15s;
|
||||
}
|
||||
.mic-btn:hover:not(:disabled) {
|
||||
background: hsl(var(--color-surface-hover));
|
||||
color: hsl(var(--color-foreground));
|
||||
}
|
||||
.mic-btn.recording {
|
||||
background: hsl(0 70% 50% / 0.1);
|
||||
border-color: hsl(0 70% 50%);
|
||||
color: hsl(0 70% 45%);
|
||||
animation: pulse-rec 1s ease-in-out infinite;
|
||||
}
|
||||
.mic-btn:disabled {
|
||||
opacity: 0.4;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
.textarea-mic {
|
||||
position: absolute;
|
||||
right: 0.375rem;
|
||||
bottom: 0.375rem;
|
||||
}
|
||||
.conversation-banner {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: 0.5rem;
|
||||
padding: 0.5rem 0.75rem;
|
||||
border-radius: 0.5rem;
|
||||
background: hsl(var(--color-primary) / 0.08);
|
||||
color: hsl(var(--color-primary));
|
||||
font-size: 0.75rem;
|
||||
}
|
||||
.banner-stop {
|
||||
padding: 0.25rem 0.625rem;
|
||||
border: 1px solid hsl(var(--color-primary) / 0.3);
|
||||
border-radius: 999px;
|
||||
background: transparent;
|
||||
color: hsl(var(--color-primary));
|
||||
font-size: 0.6875rem;
|
||||
cursor: pointer;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.banner-stop:hover {
|
||||
background: hsl(var(--color-primary) / 0.1);
|
||||
}
|
||||
</style>
|
||||
|
|
|
|||
|
|
@@ -15,17 +15,25 @@
|
|||
import ContextOverview from './ContextOverview.svelte';
|
||||
import ContextInterview from './ContextInterview.svelte';
|
||||
import ContextFreeform from './ContextFreeform.svelte';
|
||||
import { useUserContext } from './queries';
|
||||
import { getProgress } from './questions';
|
||||
|
||||
type Tab = 'overview' | 'interview' | 'freeform' | 'account';
|
||||
type InterviewStartMode = 'text' | 'voice' | 'conversation';
|
||||
|
||||
let apiProfile = $state<ApiUserProfile | null>(null);
|
||||
let loading = $state(true);
|
||||
let activeTab = $state<Tab>('overview');
|
||||
let interviewStartMode = $state<InterviewStartMode | null>(null);
|
||||
|
||||
let showEditModal = $state(false);
|
||||
let showPasswordModal = $state(false);
|
||||
let showDeleteModal = $state(false);
|
||||
|
||||
let ctx$ = useUserContext();
|
||||
let ctx = $derived(ctx$.value);
|
||||
let progress = $derived(getProgress(ctx?.interview?.answeredIds ?? []));
|
||||
|
||||
onMount(async () => {
|
||||
try {
|
||||
apiProfile = await profileService.getProfile();
|
||||
|
|
@@ -43,6 +51,11 @@
|
|||
{ key: 'account', label: 'Konto' },
|
||||
];
|
||||
|
||||
function startInterview(mode: InterviewStartMode) {
|
||||
interviewStartMode = mode;
|
||||
activeTab = 'interview';
|
||||
}
|
||||
|
||||
function handleProfileUpdate(user: ApiUserProfile) {
|
||||
apiProfile = user;
|
||||
toast.success('Profil erfolgreich aktualisiert');
|
||||
|
|
@@ -71,7 +84,10 @@
|
|||
<button
|
||||
class="tab-btn"
|
||||
class:active={activeTab === tab.key}
|
||||
onclick={() => (activeTab = tab.key)}
|
||||
onclick={() => {
|
||||
activeTab = tab.key;
|
||||
if (tab.key !== 'interview') interviewStartMode = null;
|
||||
}}
|
||||
>
|
||||
{tab.label}
|
||||
</button>
|
||||
|
|
@@ -81,9 +97,99 @@
|
|||
<!-- Tab content -->
|
||||
<div class="tab-content">
|
||||
{#if activeTab === 'overview'}
|
||||
<ContextOverview user={apiProfile} onStartInterview={() => (activeTab = 'interview')} />
|
||||
<ContextOverview user={apiProfile} onStartInterview={() => startInterview('text')} />
|
||||
|
||||
<!-- Interview start hero -->
|
||||
<div class="interview-hero">
|
||||
<div class="hero-header">
|
||||
<h3 class="hero-title">Interview starten</h3>
|
||||
<p class="hero-subtitle">
|
||||
{#if progress.percent > 0}
|
||||
{progress.answered} von {progress.total} Fragen beantwortet — mach weiter!
|
||||
{:else}
|
||||
Erzähl Mana mehr über dich, damit die App besser zu dir passt.
|
||||
{/if}
|
||||
</p>
|
||||
{#if progress.percent > 0}
|
||||
<div class="hero-progress">
|
||||
<div class="hero-progress-fill" style:width="{progress.percent}%"></div>
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
<div class="hero-options">
|
||||
<button class="hero-option" onclick={() => startInterview('text')}>
|
||||
<span class="hero-option-icon">
|
||||
<svg
|
||||
width="24"
|
||||
height="24"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="1.5"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path d="M17 3a2.85 2.83 0 1 1 4 4L7.5 20.5 2 22l1.5-5.5Z"></path>
|
||||
</svg>
|
||||
</span>
|
||||
<span class="hero-option-text">
|
||||
<strong>Per Text</strong>
|
||||
<span>Fragen lesen und tippen</span>
|
||||
</span>
|
||||
</button>
|
||||
<button class="hero-option voice" onclick={() => startInterview('voice')}>
|
||||
<span class="hero-option-icon">
|
||||
<svg
|
||||
width="24"
|
||||
height="24"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="1.5"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
|
||||
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
|
||||
<line x1="12" y1="19" x2="12" y2="23"></line>
|
||||
<line x1="8" y1="23" x2="16" y2="23"></line>
|
||||
</svg>
|
||||
</span>
|
||||
<span class="hero-option-text">
|
||||
<strong>Per Sprache</strong>
|
||||
<span>Fragen hören und sprechen</span>
|
||||
</span>
|
||||
</button>
|
||||
<button class="hero-option conversation" onclick={() => startInterview('conversation')}>
|
||||
<span class="hero-option-icon">
|
||||
<svg
|
||||
width="24"
|
||||
height="24"
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-width="1.5"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
>
|
||||
<path d="M21 15a2 2 0 0 1-2 2H7l-4 4V5a2 2 0 0 1 2-2h14a2 2 0 0 1 2 2z"></path>
|
||||
</svg>
|
||||
</span>
|
||||
<span class="hero-option-text">
|
||||
<strong>Als Gespräch</strong>
|
||||
<span>Fließend — Antworten werden automatisch gespeichert</span>
|
||||
</span>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
{:else if activeTab === 'interview'}
|
||||
<ContextInterview />
|
||||
<ContextInterview
|
||||
initialVoiceLevel={interviewStartMode === 'conversation'
|
||||
? 'conversation'
|
||||
: interviewStartMode === 'voice'
|
||||
? 'voice'
|
||||
: undefined}
|
||||
/>
|
||||
{:else if activeTab === 'freeform'}
|
||||
<ContextFreeform />
|
||||
{:else if activeTab === 'account'}
|
||||
|
|
@@ -280,4 +386,95 @@
|
|||
.account-btn.danger:hover {
|
||||
background: hsl(var(--color-destructive, 0 84% 60%) / 0.08);
|
||||
}
|
||||
|
||||
/* ── Interview hero ──────────────────────── */
|
||||
.interview-hero {
|
||||
margin-top: 1rem;
|
||||
border: 1px solid hsl(var(--color-border));
|
||||
border-radius: 0.75rem;
|
||||
background: hsl(var(--color-card));
|
||||
overflow: hidden;
|
||||
}
|
||||
.hero-header {
|
||||
padding: 1.25rem 1.25rem 1rem;
|
||||
}
|
||||
.hero-title {
|
||||
margin: 0;
|
||||
font-size: 1.0625rem;
|
||||
font-weight: 600;
|
||||
}
|
||||
.hero-subtitle {
|
||||
margin: 0.25rem 0 0;
|
||||
font-size: 0.8125rem;
|
||||
color: hsl(var(--color-muted-foreground));
|
||||
}
|
||||
.hero-progress {
|
||||
height: 4px;
|
||||
margin-top: 0.75rem;
|
||||
background: hsl(var(--color-border));
|
||||
border-radius: 2px;
|
||||
overflow: hidden;
|
||||
}
|
||||
.hero-progress-fill {
|
||||
height: 100%;
|
||||
background: hsl(var(--color-primary));
|
||||
border-radius: 2px;
|
||||
transition: width 0.3s ease;
|
||||
}
|
||||
.hero-options {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
border-top: 1px solid hsl(var(--color-border));
|
||||
}
|
||||
.hero-option {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.875rem;
|
||||
padding: 1rem 1.25rem;
|
||||
border: none;
|
||||
border-bottom: 1px solid hsl(var(--color-border));
|
||||
background: transparent;
|
||||
color: hsl(var(--color-foreground));
|
||||
cursor: pointer;
|
||||
text-align: left;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.hero-option:last-child {
|
||||
border-bottom: none;
|
||||
}
|
||||
.hero-option:hover {
|
||||
background: hsl(var(--color-surface-hover));
|
||||
}
|
||||
.hero-option-icon {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
width: 2.5rem;
|
||||
height: 2.5rem;
|
||||
border-radius: 0.625rem;
|
||||
background: hsl(var(--color-muted) / 0.5);
|
||||
color: hsl(var(--color-muted-foreground));
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.hero-option.voice .hero-option-icon {
|
||||
background: hsl(var(--color-primary) / 0.1);
|
||||
color: hsl(var(--color-primary));
|
||||
}
|
||||
.hero-option.conversation .hero-option-icon {
|
||||
background: hsl(142 71% 45% / 0.1);
|
||||
color: hsl(142 71% 35%);
|
||||
}
|
||||
.hero-option-text {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.125rem;
|
||||
}
|
||||
.hero-option-text strong {
|
||||
font-size: 0.875rem;
|
||||
font-weight: 600;
|
||||
}
|
||||
.hero-option-text span {
|
||||
font-size: 0.75rem;
|
||||
color: hsl(var(--color-muted-foreground));
|
||||
}
|
||||
</style>
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,159 @@
|
|||
/**
|
||||
* useInterviewTts() — Plays pre-rendered interview question audio.
|
||||
*
|
||||
* Audio files live in /audio/interview/{voiceKey}/{questionId}.mp3
|
||||
* where voiceKey is one of: de-f, de-m, ch-f, ch-m
|
||||
*
|
||||
* Falls back to Web Speech API if the audio file is missing.
|
||||
*/
|
||||
|
||||
export type VoiceKey = 'de-f' | 'de-m' | 'ch-f' | 'ch-m';
|
||||
|
||||
export interface VoiceMeta {
|
||||
key: VoiceKey;
|
||||
label: string;
|
||||
lang: string;
|
||||
gender: string;
|
||||
}
|
||||
|
||||
export const VOICES: VoiceMeta[] = [
|
||||
{ key: 'de-f', label: 'Seraphina (DE)', lang: 'Deutsch', gender: 'Weiblich' },
|
||||
{ key: 'de-m', label: 'Florian (DE)', lang: 'Deutsch', gender: 'Männlich' },
|
||||
{ key: 'ch-f', label: 'Leni (CH)', lang: 'Schweizerdeutsch', gender: 'Weiblich' },
|
||||
{ key: 'ch-m', label: 'Jan (CH)', lang: 'Schweizerdeutsch', gender: 'Männlich' },
|
||||
];
|
||||
|
||||
const STORAGE_KEY = 'mana.interview.voice';
|
||||
const DEFAULT_VOICE: VoiceKey = 'de-f';
|
||||
|
||||
export interface InterviewTtsHandle {
|
||||
/** Whether audio is currently playing */
|
||||
readonly speaking: boolean;
|
||||
/** Always true — we have pre-rendered audio */
|
||||
readonly isSupported: boolean;
|
||||
/** Currently selected voice */
|
||||
readonly voice: VoiceKey;
|
||||
/** Set the voice */
|
||||
setVoice: (key: VoiceKey) => void;
|
||||
/** Play the audio for a question. Resolves when done. */
|
||||
speak: (questionId: string, fallbackText?: string) => Promise<void>;
|
||||
/** Stop playback immediately. */
|
||||
stop: () => void;
|
||||
}
|
||||
|
||||
export function useInterviewTts(): InterviewTtsHandle {
|
||||
let speaking = $state(false);
|
||||
let voice = $state<VoiceKey>(loadVoice());
|
||||
let currentAudio: HTMLAudioElement | null = null;
let resolveCurrent: (() => void) | null = null; // lets stop() settle a pending speak() promise
|
||||
|
||||
function loadVoice(): VoiceKey {
|
||||
if (typeof window === 'undefined') return DEFAULT_VOICE;
|
||||
const stored = localStorage.getItem(STORAGE_KEY);
|
||||
if (stored && VOICES.some((v) => v.key === stored)) return stored as VoiceKey;
|
||||
return DEFAULT_VOICE;
|
||||
}
|
||||
|
||||
function setVoice(key: VoiceKey) {
|
||||
voice = key;
|
||||
if (typeof window !== 'undefined') {
|
||||
localStorage.setItem(STORAGE_KEY, key);
|
||||
}
|
||||
}
|
||||
|
||||
function speak(questionId: string, fallbackText?: string): Promise<void> {
|
||||
stop();
|
||||
|
||||
const audioUrl = `/audio/interview/${voice}/${questionId}.mp3`;
|
||||
|
||||
return new Promise<void>((resolve) => {
|
||||
const audio = new Audio(audioUrl);
|
||||
currentAudio = audio;
resolveCurrent = resolve;
|
||||
|
||||
audio.addEventListener(
|
||||
'canplaythrough',
|
||||
() => {
|
||||
speaking = true;
|
||||
audio.play().catch(() => {
|
||||
// Autoplay blocked — try Web Speech API fallback
|
||||
speaking = false;
|
||||
if (fallbackText) {
|
||||
speakFallback(fallbackText).then(resolve);
|
||||
} else {
|
||||
resolve();
|
||||
}
|
||||
});
|
||||
},
|
||||
{ once: true }
|
||||
);
|
||||
|
||||
audio.addEventListener(
|
||||
'ended',
|
||||
() => {
|
||||
speaking = false;
|
||||
currentAudio = null;
|
||||
resolve();
|
||||
},
|
||||
{ once: true }
|
||||
);
|
||||
|
||||
audio.addEventListener(
|
||||
'error',
|
||||
() => {
|
||||
// File not found — fallback to Web Speech API
|
||||
speaking = false;
|
||||
currentAudio = null;
|
||||
if (fallbackText) {
|
||||
speakFallback(fallbackText).then(resolve);
|
||||
} else {
|
||||
resolve();
|
||||
}
|
||||
},
|
||||
{ once: true }
|
||||
);
|
||||
|
||||
audio.load();
|
||||
});
|
||||
}
|
||||
|
||||
function stop() {
|
||||
if (currentAudio) {
|
||||
currentAudio.pause();
|
||||
currentAudio.src = '';
|
||||
currentAudio = null;
|
||||
}
|
||||
speaking = false;
// Also cancel a running Web Speech fallback and settle any pending speak() promise
// so awaiting callers (e.g. runVoiceFlow) don't hang after stop().
if (typeof window !== 'undefined' && 'speechSynthesis' in window) speechSynthesis.cancel();
resolveCurrent?.();
resolveCurrent = null;
|
||||
}
|
||||
|
||||
return {
|
||||
get speaking() {
|
||||
return speaking;
|
||||
},
|
||||
get isSupported() {
|
||||
return true;
|
||||
},
|
||||
get voice() {
|
||||
return voice;
|
||||
},
|
||||
setVoice,
|
||||
speak,
|
||||
stop,
|
||||
};
|
||||
}
|
||||
|
||||
/** Web Speech API fallback for missing audio files. */
|
||||
function speakFallback(text: string): Promise<void> {
|
||||
if (typeof window === 'undefined' || !('speechSynthesis' in window)) {
|
||||
return Promise.resolve();
|
||||
}
|
||||
|
||||
speechSynthesis.cancel();
|
||||
|
||||
return new Promise<void>((resolve) => {
|
||||
const utterance = new SpeechSynthesisUtterance(text);
|
||||
utterance.lang = 'de-DE';
|
||||
utterance.rate = 0.92;
|
||||
utterance.onend = () => resolve();
|
||||
utterance.onerror = () => resolve();
|
||||
speechSynthesis.speak(utterance);
|
||||
});
|
||||
}
|
||||
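A minimal consumption sketch (editor's addition, not part of the commit) showing how a component can sequence TTS before STT; the question id matches the pre-rendered files below, the fallback string is illustrative:

// Editor's sketch — not part of the commit.
import { useInterviewTts } from './use-interview-tts.svelte';

const tts = useInterviewTts();
tts.setVoice('ch-f'); // persisted to localStorage under 'mana.interview.voice'

async function readQuestion() {
  // Resolves when the MP3 ends, or when the Web Speech fallback finishes —
  // so the caller can start mic recording immediately afterwards.
  await tts.speak('about.bio', 'Erzähl mir etwas über dich.'); // fallback text is illustrative
}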
BIN apps/mana/apps/web/static/audio/interview/{ch-f,ch-m,de-f,de-m}/*.mp3 — new binary files (not shown)
Pre-rendered question audio, one MP3 per question id and voice (e.g. about.bio.mp3, goals.current.mp3, goals.focus.mp3, goals.learn.mp3, interests.mp3, social.living.mp3, social.pets.mp3, ...).
|
|
@@ -16,6 +16,8 @@ Text-to-Speech microservice. Wraps Kokoro (English presets), Piper (German, loca
|
|||
| **Framework** | FastAPI |
|
||||
| **English (preset)** | Kokoro-82M (`kokoro_service.py`) |
|
||||
| **German (local)** | Piper ONNX with `kerstin_low.onnx` and `thorsten_medium.onnx` voices (`piper_service.py`) |
|
||||
| **German (high-quality)** | Orpheus-3B German finetune (`orpheus_service.py`) — best for pre-generation |
|
||||
| **Multilingual (expressive)** | Zonos v0.1 by Zyphra (`zonos_service.py`) — emotion control, 200k hours training |
|
||||
| **Voice cloning** | F5-TTS on CUDA (`f5_service.py`) |
|
||||
| **Audio I/O** | `soundfile`, `pydub` |
|
||||
| **Auth** | Per-key + internal-key API auth (`auth.py`) + JWT via mana-auth (`external_auth.py`) |
|
||||
|
|
@@ -43,6 +45,8 @@ Public URL: `https://gpu-tts.mana.how`.
|
|||
| DELETE | `/voices/{voice_id}` | Delete a custom voice |
|
||||
| POST | `/synthesize/kokoro` | Kokoro synthesis (English presets) |
|
||||
| POST | `/synthesize` | F5-TTS voice cloning |
|
||||
| POST | `/synthesize/orpheus` | Orpheus synthesis (German, high-quality, pre-generation) |
|
||||
| POST | `/synthesize/zonos` | Zonos synthesis (multilingual, expressive, emotion control) |
|
||||
| POST | `/synthesize/auto` | Routing helper — picks the right backend for the requested voice |
|
||||
|
||||
All non-health endpoints require `Authorization: Bearer <token>` (per-app key, internal key, or mana-auth JWT).
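For orientation, a hedged request sketch (editor's addition, not in the commit); the fields match the OrpheusRequest model in main.py further down, and the token value is a placeholder:

// Editor's sketch — run from any async context.
const res = await fetch('https://gpu-tts.mana.how/synthesize/orpheus', {
  method: 'POST',
  headers: {
    'Content-Type': 'application/json',
    Authorization: 'Bearer <token>', // per-app key, internal key, or mana-auth JWT
  },
  body: JSON.stringify({
    text: 'Was möchtest du diese Woche erreichen?', // illustrative German prompt
    voice: 'tara',        // Orpheus built-in speaker (service default)
    output_format: 'mp3',
    temperature: 0.6,
  }),
});
if (!res.ok) throw new Error(`TTS request failed: ${res.status}`);
const audioBytes = await res.arrayBuffer(); // duration and sample rate arrive in X-Duration / X-Sample-Rate headers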
|
||||
|
|
@@ -59,6 +63,12 @@ All non-health endpoints require `Authorization: Bearer <token>` (per-app key, i
|
|||
|
||||
Falls back to Edge TTS cloud voices if Piper isn't loaded.
|
||||
|
||||
### Orpheus-3B German (high-quality pre-generation)
|
||||
~8 GB VRAM. German finetune (`Kartoffel/Orpheus-3B_german_natural-v0.1`). Natural intonation, built-in speaker voices (tara, leo, emma, ...). Best quality for pre-generating static audio files. Not real-time.
|
||||
|
||||
### Zonos v0.1 (expressive multilingual)
|
||||
~5 GB VRAM. By Zyphra, trained on 200k hours. Explicit German support. Fine-grained control: emotion (neutral/friendly/warm/curious), speaking rate, pitch variation. Can clone voices from 5s reference audio.
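These controls map one-to-one onto the request body of `/synthesize/zonos`. A sketch (editor's addition — the interface itself is illustrative; field names, defaults, and ranges come from the ZonosRequest model further down):

// Editor's sketch of the /synthesize/zonos JSON payload (mirrors ZonosRequest in main.py).
interface ZonosPayload {
  text: string;                                          // max 5000 characters
  language: string;                                      // e.g. 'de'
  emotion: 'neutral' | 'friendly' | 'warm' | 'curious';  // preset name
  speaking_rate: number;                                 // phonemes/second, 5.0–25.0 (default 13.0)
  pitch_std: number;                                     // pitch variation in Hz, 5.0–50.0 (default 20.0)
  output_format: 'wav' | 'mp3';
}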
|
||||
|
||||
### F5-TTS (voice cloning)
|
||||
~6 GB VRAM. Requires reference audio + transcript. Higher quality, slower. Custom voices live in `voices/` (reference audio + transcript per voice ID).
|
||||
|
||||
|
|
@@ -84,6 +94,8 @@ services/mana-tts/
|
|||
│ ├── kokoro_service.py # Kokoro (English presets)
|
||||
│ ├── piper_service.py # Piper (German, local ONNX)
|
||||
│ ├── f5_service.py # F5-TTS (voice cloning, CUDA)
|
||||
│ ├── orpheus_service.py # Orpheus-3B German (high-quality)
|
||||
│ ├── zonos_service.py # Zonos v0.1 (expressive multilingual)
|
||||
│ ├── voice_manager.py # Custom voice registry
|
||||
│ ├── audio_utils.py # Format conversion, resampling
|
||||
│ ├── auth.py # API-key auth
|
||||
|
|
|
|||
|
|
@@ -42,6 +42,17 @@ from .piper_service import (
|
|||
PIPER_VOICES,
|
||||
is_piper_loaded,
|
||||
)
|
||||
from .orpheus_service import (
|
||||
synthesize_orpheus,
|
||||
is_orpheus_loaded,
|
||||
ORPHEUS_VOICES,
|
||||
DEFAULT_VOICE as DEFAULT_ORPHEUS_VOICE,
|
||||
)
|
||||
from .zonos_service import (
|
||||
synthesize_zonos,
|
||||
is_zonos_loaded,
|
||||
EMOTION_PRESETS as ZONOS_EMOTIONS,
|
||||
)
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
|
|
@@ -203,6 +214,8 @@ async def health_check():
|
|||
models_loaded={
|
||||
"kokoro": is_kokoro_loaded(),
|
||||
"f5": is_f5_loaded(),
|
||||
"orpheus": is_orpheus_loaded(),
|
||||
"zonos": is_zonos_loaded(),
|
||||
},
|
||||
auth_required=REQUIRE_AUTH,
|
||||
)
|
||||
|
|
@@ -528,6 +541,160 @@ async def synthesize_with_f5(
|
|||
cleanup_temp_file(temp_file_path)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Orpheus TTS Endpoint (German, high-quality)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class OrpheusRequest(BaseModel):
|
||||
"""Request for Orpheus TTS synthesis."""
|
||||
|
||||
text: str = Field(..., description="Text to synthesize (German)", max_length=5000)
|
||||
voice: str = Field(DEFAULT_ORPHEUS_VOICE, description="Speaker voice")
|
||||
output_format: str = Field("wav", description="Output format (wav, mp3)")
|
||||
temperature: float = Field(0.6, ge=0.1, le=1.5, description="Sampling temperature")
|
||||
|
||||
|
||||
@app.post("/synthesize/orpheus")
|
||||
async def synthesize_with_orpheus(
|
||||
request: OrpheusRequest,
|
||||
auth: AuthResult = Depends(verify_api_key),
|
||||
):
|
||||
"""
|
||||
Synthesize German speech using Orpheus TTS.
|
||||
|
||||
High-quality German synthesis with natural intonation.
|
||||
Not optimized for real-time — designed for pre-generation.
|
||||
"""
|
||||
if not request.text.strip():
|
||||
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
||||
|
||||
if len(request.text) > MAX_TEXT_LENGTH:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Text exceeds maximum length of {MAX_TEXT_LENGTH} characters",
|
||||
)
|
||||
|
||||
output_format = request.output_format.lower()
|
||||
if output_format not in SUPPORTED_FORMATS:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported format. Use one of: {SUPPORTED_FORMATS}",
|
||||
)
|
||||
|
||||
try:
|
||||
result = await synthesize_orpheus(
|
||||
text=request.text,
|
||||
voice=request.voice,
|
||||
temperature=request.temperature,
|
||||
)
|
||||
|
||||
audio_bytes, content_type = convert_audio(
|
||||
result.audio,
|
||||
result.sample_rate,
|
||||
output_format,
|
||||
)
|
||||
|
||||
return Response(
|
||||
content=audio_bytes,
|
||||
media_type=content_type,
|
||||
headers={
|
||||
"X-Model": "orpheus-german",
|
||||
"X-Voice": result.voice,
|
||||
"X-Duration": str(result.duration),
|
||||
"X-Sample-Rate": str(result.sample_rate),
|
||||
},
|
||||
)
|
||||
|
||||
except RuntimeError as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
except Exception as e:
|
||||
logger.error(f"Orpheus synthesis error: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Orpheus synthesis failed: {e}")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Zonos TTS Endpoint (Multilingual, expressive)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class ZonosRequest(BaseModel):
|
||||
"""Request for Zonos TTS synthesis."""
|
||||
|
||||
text: str = Field(..., description="Text to synthesize", max_length=5000)
|
||||
language: str = Field("de", description="Language code")
|
||||
emotion: str = Field("friendly", description="Emotion preset: neutral, friendly, warm, curious")
|
||||
speaking_rate: float = Field(13.0, ge=5.0, le=25.0, description="Phonemes per second")
|
||||
pitch_std: float = Field(20.0, ge=5.0, le=50.0, description="Pitch variation in Hz")
|
||||
output_format: str = Field("wav", description="Output format (wav, mp3)")
|
||||
|
||||
|
||||
@app.post("/synthesize/zonos")
|
||||
async def synthesize_with_zonos(
|
||||
request: ZonosRequest,
|
||||
auth: AuthResult = Depends(verify_api_key),
|
||||
):
|
||||
"""
|
||||
Synthesize speech using Zonos TTS by Zyphra.
|
||||
|
||||
Expressive multilingual synthesis with emotion control.
|
||||
Trained on 200k hours — explicit German support.
|
||||
"""
|
||||
if not request.text.strip():
|
||||
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
||||
|
||||
if len(request.text) > MAX_TEXT_LENGTH:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Text exceeds maximum length of {MAX_TEXT_LENGTH} characters",
|
||||
)
|
||||
|
||||
output_format = request.output_format.lower()
|
||||
if output_format not in SUPPORTED_FORMATS:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported format. Use one of: {SUPPORTED_FORMATS}",
|
||||
)
|
||||
|
||||
if request.emotion not in ZONOS_EMOTIONS:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unknown emotion. Use one of: {list(ZONOS_EMOTIONS.keys())}",
|
||||
)
|
||||
|
||||
try:
|
||||
result = await synthesize_zonos(
|
||||
text=request.text,
|
||||
language=request.language,
|
||||
emotion=request.emotion,
|
||||
speaking_rate=request.speaking_rate,
|
||||
pitch_std=request.pitch_std,
|
||||
)
|
||||
|
||||
audio_bytes, content_type = convert_audio(
|
||||
result.audio,
|
||||
result.sample_rate,
|
||||
output_format,
|
||||
)
|
||||
|
||||
return Response(
|
||||
content=audio_bytes,
|
||||
media_type=content_type,
|
||||
headers={
|
||||
"X-Model": "zonos-v0.1",
|
||||
"X-Emotion": result.emotion,
|
||||
"X-Duration": str(result.duration),
|
||||
"X-Sample-Rate": str(result.sample_rate),
|
||||
},
|
||||
)
|
||||
|
||||
except RuntimeError as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
except Exception as e:
|
||||
logger.error(f"Zonos synthesis error: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Zonos synthesis failed: {e}")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Auto-Selection Endpoint
|
||||
# ============================================================================
|
||||
|
|
|
|||
229
services/mana-tts/app/orpheus_service.py
Normal file
|
|
@@ -0,0 +1,229 @@
|
|||
"""
|
||||
Orpheus TTS — High-quality German speech synthesis.
|
||||
|
||||
Uses the Orpheus-TTS model with German finetune for natural-sounding
|
||||
interview question generation. Not optimized for real-time — quality first.
|
||||
|
||||
Model: Kartoffel_Orpheus-3B_german_natural-v0.1 (HuggingFace)
|
||||
VRAM: ~8 GB (fits comfortably on RTX 3090 alongside other models)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Lazy-loaded model state
|
||||
_model = None
|
||||
_tokenizer = None
|
||||
_loaded = False
|
||||
|
||||
MODEL_ID = "Vishalshendge3198/orpheus-3b-tts-german-emotional-merged"
|
||||
SAMPLE_RATE = 24000
|
||||
|
||||
# Available voices (Orpheus built-in speaker tags)
|
||||
ORPHEUS_VOICES = {
|
||||
"tara": "Female, warm and clear (default)",
|
||||
"leah": "Female, soft and friendly",
|
||||
"jess": "Female, energetic",
|
||||
"leo": "Male, calm and professional",
|
||||
"dan": "Male, deep and warm",
|
||||
"mia": "Female, young and bright",
|
||||
"zac": "Male, confident",
|
||||
"emma": "Female, neutral",
|
||||
}
|
||||
|
||||
DEFAULT_VOICE = "tara"
|
||||
|
||||
|
||||
@dataclass
|
||||
class OrpheusResult:
|
||||
audio: np.ndarray
|
||||
sample_rate: int
|
||||
duration: float
|
||||
voice: str
|
||||
|
||||
|
||||
def is_orpheus_loaded() -> bool:
|
||||
return _loaded
|
||||
|
||||
|
||||
def get_orpheus_model():
|
||||
"""Load the Orpheus German model (lazy, first call only)."""
|
||||
global _model, _tokenizer, _loaded
|
||||
|
||||
if _loaded:
|
||||
return _model, _tokenizer
|
||||
|
||||
logger.info(f"Loading Orpheus German model: {MODEL_ID}")
|
||||
|
||||
try:
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
import torch
|
||||
|
||||
_tokenizer = AutoTokenizer.from_pretrained(
|
||||
MODEL_ID,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
_model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID,
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
_model.eval()
|
||||
_loaded = True
|
||||
logger.info("Orpheus German model loaded successfully")
|
||||
return _model, _tokenizer
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load Orpheus model: {e}")
|
||||
raise RuntimeError(f"Failed to load Orpheus model: {e}")
|
||||
|
||||
|
||||
def unload_orpheus():
|
||||
"""Free VRAM by unloading the model."""
|
||||
global _model, _tokenizer, _loaded
|
||||
import torch
|
||||
|
||||
if _model is not None:
|
||||
del _model
|
||||
_model = None
|
||||
if _tokenizer is not None:
|
||||
del _tokenizer
|
||||
_tokenizer = None
|
||||
_loaded = False
|
||||
torch.cuda.empty_cache()
|
||||
logger.info("Orpheus model unloaded")
|
||||
|
||||
|
||||
async def synthesize_orpheus(
|
||||
text: str,
|
||||
voice: str = DEFAULT_VOICE,
|
||||
temperature: float = 0.6,
|
||||
top_p: float = 0.95,
|
||||
max_new_tokens: int = 4096,
|
||||
) -> OrpheusResult:
|
||||
"""
|
||||
Synthesize German speech using Orpheus TTS.
|
||||
|
||||
Returns OrpheusResult with audio as numpy float32 array.
|
||||
"""
|
||||
loop = asyncio.get_running_loop()  # get_event_loop() is deprecated inside coroutines
|
||||
return await loop.run_in_executor(
|
||||
None,
|
||||
_synthesize_sync,
|
||||
text,
|
||||
voice,
|
||||
temperature,
|
||||
top_p,
|
||||
max_new_tokens,
|
||||
)
|
||||
|
||||
|
||||
def _synthesize_sync(
|
||||
text: str,
|
||||
voice: str,
|
||||
temperature: float,
|
||||
top_p: float,
|
||||
max_new_tokens: int,
|
||||
) -> OrpheusResult:
|
||||
"""Synchronous synthesis (runs in thread pool)."""
|
||||
import torch
|
||||
|
||||
model, tokenizer = get_orpheus_model()
|
||||
|
||||
# Orpheus uses a specific prompt format with speaker tags
|
||||
prompt = f"<|speaker:{voice}|>{text}"
|
||||
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=max_new_tokens,
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
do_sample=True,
|
||||
)
|
||||
|
||||
# Extract audio tokens (model-specific decoding)
|
||||
audio_tokens = outputs[0][inputs["input_ids"].shape[1]:]
|
||||
|
||||
# Decode audio tokens to waveform
|
||||
# Orpheus uses a SNAC-based codec — tokens map to audio via the model's decode method
|
||||
if hasattr(model, "decode_audio"):
|
||||
audio_np = model.decode_audio(audio_tokens).cpu().numpy().flatten()
|
||||
else:
|
||||
# Fallback: use the tokenizer's decode if model doesn't have decode_audio
|
||||
# This handles different Orpheus model versions
|
||||
audio_np = _decode_orpheus_tokens(audio_tokens, model)
|
||||
|
||||
duration = len(audio_np) / SAMPLE_RATE
|
||||
|
||||
return OrpheusResult(
|
||||
audio=audio_np,
|
||||
sample_rate=SAMPLE_RATE,
|
||||
duration=duration,
|
||||
voice=voice,
|
||||
)
|
||||
|
||||
|
||||
def _decode_orpheus_tokens(tokens, model) -> np.ndarray:
|
||||
"""
|
||||
Decode Orpheus audio tokens using SNAC codec.
|
||||
|
||||
Orpheus generates special audio tokens that need to be decoded
|
||||
through the SNAC vocoder to produce the final waveform.
|
||||
"""
|
||||
import torch
|
||||
|
||||
try:
|
||||
from snac import SNAC
|
||||
|
||||
snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(model.device)
|
||||
|
||||
# Filter to audio-only tokens (above text vocab range)
|
||||
audio_token_ids = tokens[tokens >= 128256].tolist()
|
||||
|
||||
if not audio_token_ids:
|
||||
logger.warning("No audio tokens generated")
|
||||
return np.zeros(SAMPLE_RATE, dtype=np.float32) # 1s silence
|
||||
|
||||
# Orpheus interleaves 3 codebook levels: [c1, c2, c3, c1, c2, c3, ...]
|
||||
# Redistribute into separate codebook tensors
|
||||
codes_0, codes_1, codes_2 = [], [], []
|
||||
for i, token_id in enumerate(audio_token_ids):
|
||||
# Offset tokens back to codebook range
|
||||
code = token_id - 128256
|
||||
level = i % 3
|
||||
if level == 0:
|
||||
codes_0.append(code)
|
||||
elif level == 1:
|
||||
codes_1.append(code)
|
||||
else:
|
||||
codes_2.append(code)
|
||||
|
||||
# Trim to equal lengths
|
||||
min_len = min(len(codes_0), len(codes_1), len(codes_2))
|
||||
if min_len == 0:
|
||||
return np.zeros(SAMPLE_RATE, dtype=np.float32)
|
||||
|
||||
codes = [
|
||||
torch.tensor(codes_0[:min_len], device=model.device).unsqueeze(0),
|
||||
torch.tensor(codes_1[:min_len], device=model.device).unsqueeze(0),
|
||||
torch.tensor(codes_2[:min_len], device=model.device).unsqueeze(0),
|
||||
]
|
||||
|
||||
with torch.no_grad():
|
||||
audio = snac.decode(codes).squeeze().cpu().numpy()
|
||||
|
||||
return audio.astype(np.float32)
|
||||
|
||||
except ImportError:
|
||||
logger.error("snac package not installed — pip install snac")
|
||||
raise RuntimeError("snac package required for Orpheus audio decoding")
|
||||
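For a quick end-to-end check of the Orpheus path, a smoke test along these lines can be run on the GPU host. It is an illustrative sketch, not part of this commit: the module path app.orpheus_service, the voice name, and the soundfile dependency are assumptions.

# Smoke test for the Orpheus backend (illustrative sketch, not in this diff).
# Assumes the module lives at app/orpheus_service.py next to zonos_service.py,
# that soundfile is installed, and that "jana" is replaced by a voice name
# the German finetune actually ships.
import asyncio

import soundfile as sf

from app.orpheus_service import synthesize_orpheus


async def main() -> None:
    result = await synthesize_orpheus(
        "Erzähl mir von deinem beruflichen Hintergrund.",
        voice="jana",  # placeholder voice name
    )
    # OrpheusResult.audio is float32 PCM at result.sample_rate
    sf.write("orpheus_sample.wav", result.audio, result.sample_rate)
    print(f"Wrote {result.duration:.1f}s of audio")


asyncio.run(main())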
205 services/mana-tts/app/zonos_service.py Normal file

@@ -0,0 +1,205 @@
"""
|
||||
Zonos TTS — Expressive multilingual speech synthesis by Zyphra.
|
||||
|
||||
Trained on 200k hours of speech data with explicit German support.
|
||||
Fine-grained control over pitch, speaking rate, and emotions.
|
||||
|
||||
Model: Zyphra/Zonos-v0.1-transformer (HuggingFace)
|
||||
VRAM: ~5 GB (fits comfortably on RTX 3090)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Disable torch.compile (requires MSVC cl.exe on Windows which we don't have)
|
||||
os.environ["TORCHDYNAMO_DISABLE"] = "1"
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Lazy-loaded model state
|
||||
_model = None
|
||||
_loaded = False
|
||||
|
||||
MODEL_ID = "Zyphra/Zonos-v0.1-transformer"
|
||||
SAMPLE_RATE = 44100 # Zonos outputs 44.1 kHz audio
|
||||
|
||||
# Emotion presets for the interview context
|
||||
EMOTION_PRESETS = {
|
||||
"neutral": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # neutral dominant
|
||||
"friendly": [0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5], # happiness + neutral
|
||||
"warm": [0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7], # slight warmth
|
||||
"curious": [0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7], # interested
|
||||
}
|
||||
|
||||
DEFAULT_EMOTION = "friendly"
|
||||
|
||||
|
||||
@dataclass
|
||||
class ZonosResult:
|
||||
audio: np.ndarray
|
||||
sample_rate: int
|
||||
duration: float
|
||||
emotion: str
|
||||
|
||||
|
||||
def is_zonos_loaded() -> bool:
|
||||
return _loaded
|
||||
|
||||
|
||||
def get_zonos_model():
|
||||
"""Load the Zonos model (lazy, first call only)."""
|
||||
global _model, _loaded
|
||||
|
||||
if _loaded:
|
||||
return _model
|
||||
|
||||
logger.info(f"Loading Zonos model: {MODEL_ID}")
|
||||
|
||||
try:
|
||||
import torch
|
||||
|
||||
# Zonos provides its own loader
|
||||
# Try the official zonos package first, fall back to transformers
|
||||
try:
|
||||
from zonos.model import Zonos
|
||||
|
||||
_model = Zonos.from_pretrained(MODEL_ID, device="cuda")
|
||||
except ImportError:
|
||||
# If zonos package not installed, use transformers
|
||||
logger.info("zonos package not found, trying transformers loading")
|
||||
from transformers import AutoModel
|
||||
|
||||
_model = AutoModel.from_pretrained(
|
||||
MODEL_ID,
|
||||
torch_dtype=torch.float32,
|
||||
trust_remote_code=True,
|
||||
).to("cuda")
|
||||
|
||||
_loaded = True
|
||||
logger.info("Zonos model loaded successfully")
|
||||
return _model
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load Zonos model: {e}")
|
||||
raise RuntimeError(f"Failed to load Zonos model: {e}")
|
||||
|
||||
|
||||
def unload_zonos():
|
||||
"""Free VRAM by unloading the model."""
|
||||
global _model, _loaded
|
||||
import torch
|
||||
|
||||
if _model is not None:
|
||||
del _model
|
||||
_model = None
|
||||
_loaded = False
|
||||
torch.cuda.empty_cache()
|
||||
logger.info("Zonos model unloaded")
|
||||
|
||||
|
||||
async def synthesize_zonos(
|
||||
text: str,
|
||||
language: str = "de",
|
||||
emotion: str = DEFAULT_EMOTION,
|
||||
speaking_rate: float = 13.0,
|
||||
pitch_std: float = 20.0,
|
||||
speaker_audio: Optional[bytes] = None,
|
||||
) -> ZonosResult:
|
||||
"""
|
||||
Synthesize speech using Zonos TTS.
|
||||
|
||||
Args:
|
||||
text: Text to synthesize
|
||||
language: Language code (default: 'de' for German)
|
||||
emotion: Emotion preset name or custom emotion vector
|
||||
speaking_rate: Speaking rate in phonemes/sec (default 13.0, range ~8-20)
|
||||
pitch_std: Pitch variation in Hz (default 20.0, range ~5-50)
|
||||
speaker_audio: Optional reference audio bytes for voice cloning
|
||||
|
||||
Returns ZonosResult with audio as numpy float32 array.
|
||||
"""
|
||||
loop = asyncio.get_event_loop()
|
||||
return await loop.run_in_executor(
|
||||
None,
|
||||
_synthesize_sync,
|
||||
text,
|
||||
language,
|
||||
emotion,
|
||||
speaking_rate,
|
||||
pitch_std,
|
||||
speaker_audio,
|
||||
)
|
||||
|
||||
|
||||
def _synthesize_sync(
|
||||
text: str,
|
||||
language: str,
|
||||
emotion: str,
|
||||
speaking_rate: float,
|
||||
pitch_std: float,
|
||||
speaker_audio: Optional[bytes],
|
||||
) -> ZonosResult:
|
||||
"""Synchronous synthesis (runs in thread pool)."""
|
||||
import torch
|
||||
from zonos.conditioning import make_cond_dict
|
||||
|
||||
model = get_zonos_model()
|
||||
|
||||
# Resolve emotion preset
|
||||
emotion_values = EMOTION_PRESETS.get(emotion, EMOTION_PRESETS["friendly"])
|
||||
|
||||
# Build speaker embedding if reference audio provided
|
||||
speaker_embedding = None
|
||||
if speaker_audio:
|
||||
speaker_embedding = _embed_speaker(speaker_audio, model)
|
||||
|
||||
# Map language codes: Zonos expects espeak language codes like 'de' or 'en-us'
|
||||
lang_map = {"de": "de", "en": "en-us", "fr": "fr-fr", "es": "es", "it": "it"}
|
||||
espeak_lang = lang_map.get(language, language)
|
||||
|
||||
# Build conditioning using Zonos's own helper
|
||||
cond = make_cond_dict(
|
||||
text=text,
|
||||
language=espeak_lang,
|
||||
emotion=emotion_values,
|
||||
speaking_rate=speaking_rate,
|
||||
pitch_std=pitch_std,
|
||||
speaker=speaker_embedding,
|
||||
)
|
||||
|
||||
# Generate
|
||||
with torch.no_grad():
|
||||
conditioning = model.prepare_conditioning(cond)
|
||||
codes = model.generate(conditioning)
|
||||
audio = model.autoencoder.decode(codes).squeeze().cpu().numpy()
|
||||
|
||||
audio = audio.astype(np.float32)
|
||||
duration = len(audio) / SAMPLE_RATE
|
||||
|
||||
return ZonosResult(
|
||||
audio=audio,
|
||||
sample_rate=SAMPLE_RATE,
|
||||
duration=duration,
|
||||
emotion=emotion,
|
||||
)
|
||||
|
||||
|
||||
def _embed_speaker(audio_bytes: bytes, model) -> "torch.Tensor":
|
||||
"""Create speaker embedding from reference audio bytes."""
|
||||
import torch
|
||||
import io
|
||||
import soundfile as sf
|
||||
|
||||
audio_data, sr = sf.read(io.BytesIO(audio_bytes))
|
||||
|
||||
if len(audio_data.shape) > 1:
|
||||
audio_data = audio_data.mean(axis=1) # mono
|
||||
|
||||
audio_tensor = torch.tensor(audio_data, dtype=torch.float32, device="cuda").unsqueeze(0)
|
||||
|
||||
return model.make_speaker_embedding(audio_tensor, sr)
|
||||
|
|
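The emotion presets above are 8-slot vectors; going by the upstream Zonos conditioning code, the slots correspond to (happiness, sadness, disgust, fear, surprise, anger, other, neutral), which is why "friendly" blends slot 0 with slot 7. Treat that ordering as an assumption about the upstream package. A matching smoke-test sketch, with placeholder file paths:

# Smoke test for the Zonos backend (illustrative sketch, not in this diff).
# Requires the zonos package, espeak-ng, and a CUDA GPU; paths are placeholders.
import asyncio

import soundfile as sf

from app.zonos_service import synthesize_zonos


async def main() -> None:
    # Optional voice cloning: raw bytes of a short, clean reference recording
    with open("reference_speaker.wav", "rb") as f:
        ref = f.read()

    result = await synthesize_zonos(
        "Was sind deine wichtigsten Ziele für dieses Jahr?",
        language="de",
        emotion="curious",   # one of the EMOTION_PRESETS keys
        speaking_rate=12.0,  # slightly slower than the 13.0 default
        speaker_audio=ref,
    )
    sf.write("zonos_sample.wav", result.audio, result.sample_rate)  # 44.1 kHz output
    print(f"Wrote {result.duration:.1f}s, emotion={result.emotion}")


asyncio.run(main())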
@@ -23,3 +23,13 @@ aiofiles>=24.1.0
# External Auth (mana-core-auth integration)
httpx>=0.27.0

# ── Orpheus TTS (German high-quality) ──
# Uses transformers + SNAC codec for audio decoding
transformers>=4.44.0
snac>=1.2.0
torch>=2.1.0

# ── Zonos TTS (expressive multilingual by Zyphra) ──
# Install via: pip install git+https://github.com/Zyphra/Zonos.git
# (the 'zonos' package pulls its own deps including torch, encodec, etc.)
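A natural way to expose these backends from the mana-tts service is one small async HTTP route per backend that returns the synthesized audio as WAV. The route code itself is among the files not shown in this diff; the following is only a sketch under stated assumptions (FastAPI as the web framework, the request shape, the route path, the module path, and the port are all assumptions):

# Illustrative route sketch; the real endpoint code is not shown in this diff.
# Converts the float32 numpy audio into an in-memory WAV response.
import io

import soundfile as sf
from fastapi import FastAPI
from fastapi.responses import Response
from pydantic import BaseModel

from app.zonos_service import synthesize_zonos

app = FastAPI()


class ZonosRequest(BaseModel):
    text: str
    language: str = "de"
    emotion: str = "friendly"


@app.post("/synthesize/zonos")
async def synthesize_zonos_endpoint(req: ZonosRequest) -> Response:
    result = await synthesize_zonos(req.text, language=req.language, emotion=req.emotion)
    buf = io.BytesIO()
    sf.write(buf, result.audio, result.sample_rate, format="WAV")
    return Response(content=buf.getvalue(), media_type="audio/wav")

Called, for example, as (host and port assumed):

curl -X POST http://localhost:8000/synthesize/zonos \
  -H 'Content-Type: application/json' \
  -d '{"text": "Hallo und willkommen!", "emotion": "warm"}' \
  -o out.wav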
Some files were not shown because too many files have changed in this diff.