feat(profile): voice interview with pre-rendered TTS audio + Orpheus/Zonos backends

Voice-based interview for the profile module — users choose between text,
voice (question read aloud + mic for answer), or conversation mode (fully
automatic flow with auto-save).

Interview audio:
- 92 pre-rendered MP3 files (23 questions × 4 voices) via Edge TTS (see sketch below)
- Voices: Seraphina (DE-f), Florian (DE-m), Leni (CH-f), Jan (CH-m)
- User picks voice via dropdown, persisted in localStorage
- Web Speech API fallback for missing audio files
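
Pre-rendering is a one-off batch job. A minimal sketch, assuming the `edge-tts` Python package, an output directory under `static/`, and a voice-key mapping inferred from the frontend's `VoiceKey` type (the exact Edge voice names are assumptions based on the Seraphina/Florian/Leni/Jan description above):

```python
# pregenerate_interview_audio.py (hypothetical sketch; paths and voice names are assumptions)
import asyncio
import os

import edge_tts

# Assumed mapping from app voice keys to Edge TTS voices; not confirmed by the diff.
EDGE_VOICES = {
    "de-f": "de-DE-SeraphinaMultilingualNeural",
    "de-m": "de-DE-FlorianMultilingualNeural",
    "ch-f": "de-CH-LeniNeural",
    "ch-m": "de-CH-JanNeural",
}

async def render_all(questions: dict[str, str]) -> None:
    for key, voice in EDGE_VOICES.items():
        os.makedirs(f"static/audio/interview/{key}", exist_ok=True)
        for question_id, text in questions.items():
            out = f"static/audio/interview/{key}/{question_id}.mp3"
            await edge_tts.Communicate(text, voice).save(out)

# asyncio.run(render_all(load_questions()))  # load_questions() is hypothetical
```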

Profile UI:
- Interview hero block on overview with 3 start modes (text/voice/conversation)
- Voice/conversation toggle + voice picker in interview view
- Mic button on text/textarea/tags inputs for per-question voice input
- Conversation mode: auto-save + auto-advance after STT transcription
- Recording/transcribing/speaking state indicators

mana-tts service:
- New Orpheus TTS backend (German finetune, SNAC codec)
- New Zonos TTS backend (Zyphra, 200k hours, emotion control)
- Endpoints: POST /synthesize/orpheus, POST /synthesize/zonos
- espeak-ng installed on GPU server for Zonos phonemizer
- Compare script for side-by-side voice quality testing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Till JS 2026-04-17 15:22:52 +02:00
parent 786ffd771b
commit 8823cc0bf0
101 changed files with 1597 additions and 31 deletions


@@ -2,7 +2,7 @@
Context Interview — Guided question flow that populates userContext.
-->
<script lang="ts">
import { onMount } from 'svelte';
import { onMount, onDestroy } from 'svelte';
import { useUserContext } from './queries';
import { userContextStore } from './stores/user-context.svelte';
import {
@@ -11,14 +11,19 @@
getProgress,
type ContextCategory,
type ContextQuestion,
type QuestionInputType,
} from './questions';
import { useInterviewTts, VOICES } from './use-interview-tts.svelte';
import { useLocalStt } from '$lib/components/voice/use-local-stt.svelte';
interface Props {
limitCategories?: ContextCategory[];
compact?: boolean;
/** If set, auto-start this voice level on mount. */
initialVoiceLevel?: 'voice' | 'conversation';
}
let { limitCategories, compact = false }: Props = $props();
let { limitCategories, compact = false, initialVoiceLevel }: Props = $props();
let ctx$ = useUserContext();
let ctx = $derived(ctx$.value);
@@ -29,8 +34,27 @@
let saving = $state(false);
let tagInput = $state('');
// ── Voice mode ──────────────────────────────────────
// 'off' = text only, 'voice' = TTS+STT per question, 'conversation' = auto-save + auto-advance
type VoiceLevel = 'off' | 'voice' | 'conversation';
const tts = useInterviewTts();
const stt = useLocalStt({ language: 'de' });
let voiceLevel = $state<VoiceLevel>('off');
let voiceMode = $derived(voiceLevel !== 'off');
let conversationMode = $derived(voiceLevel === 'conversation');
let voiceFlowActive = $state(false);
const VOICE_INPUT_TYPES: QuestionInputType[] = ['text', 'textarea', 'tags'];
onMount(() => {
void userContextStore.ensureDoc();
if (initialVoiceLevel) {
voiceLevel = initialVoiceLevel;
}
});
onDestroy(() => {
tts.stop();
if (stt.state === 'recording') stt.cancel();
});
let categories = $derived(
@@ -40,6 +64,9 @@
let currentQuestion = $derived(
categoryQuestions[currentQuestionIdx] as ContextQuestion | undefined
);
let currentSupportsVoice = $derived(
currentQuestion ? VOICE_INPUT_TYPES.includes(currentQuestion.inputType) : false
);
let progress = $derived(getProgress(ctx?.interview?.answeredIds ?? []));
let answeredSet = $derived(new Set(ctx?.interview?.answeredIds ?? []));
let categoryProgress = $derived.by(() => {
@@ -71,10 +98,85 @@
}
function selectCategory(key: ContextCategory) {
cancelVoiceFlow();
activeCategory = key;
currentQuestionIdx = 0;
}
// ── Voice flow: TTS → STT → fill input ──────────────
async function runVoiceFlow() {
if (!currentQuestion || !currentSupportsVoice) return;
voiceFlowActive = true;
// Step 1: Play pre-rendered question audio (falls back to Web Speech API)
await tts.speak(currentQuestion.id, currentQuestion.question);
// Step 2: Start mic recording (STT)
if (!voiceFlowActive) return; // cancelled during TTS
stt.toggle(); // starts recording
}
// Watch STT text — when transcription completes, fill the input.
// In conversation mode: auto-save + auto-advance to next question.
$effect(() => {
if (stt.state === 'idle' && stt.text && voiceFlowActive) {
applyVoiceTranscript(stt.text);
voiceFlowActive = false;
if (conversationMode) {
// Auto-save and advance after a brief pause so the user sees the transcript
setTimeout(() => handleAnswer(), 600);
}
}
});
// Auto-start voice flow when question changes in voice mode.
// Track only the question id to avoid re-triggering when ctx data updates.
let prevVoiceQuestionId = $state('');
$effect(() => {
const qid = currentQuestion?.id ?? '';
const shouldRun = voiceMode && currentSupportsVoice && qid && qid !== prevVoiceQuestionId;
if (shouldRun) {
prevVoiceQuestionId = qid;
const timeout = setTimeout(() => runVoiceFlow(), 300);
return () => clearTimeout(timeout);
}
});
function applyVoiceTranscript(transcript: string) {
if (!currentQuestion) return;
if (currentQuestion.inputType === 'tags') {
// Split transcript into tags by comma, "und", or line breaks
const parts = transcript
.split(/[,\n]|\bund\b/i)
.map((s) => s.trim())
.filter(Boolean);
const current = Array.isArray(inputValue) ? (inputValue as string[]) : [];
const merged = [...current];
for (const part of parts) {
if (!merged.includes(part)) merged.push(part);
}
inputValue = merged;
} else {
// text / textarea — replace content
inputValue = transcript;
}
}
function toggleMicForCurrentQuestion() {
if (stt.state === 'recording') {
stt.toggle(); // stop → transcribe
} else if (stt.state === 'idle') {
voiceFlowActive = true;
stt.toggle(); // start recording
}
}
function cancelVoiceFlow() {
voiceFlowActive = false;
tts.stop();
if (stt.state === 'recording') stt.cancel();
}
async function handleAnswer() {
if (!currentQuestion) return;
saving = true;
@@ -94,6 +196,7 @@
}
function advanceQuestion() {
cancelVoiceFlow();
if (currentQuestionIdx < categoryQuestions.length - 1) {
currentQuestionIdx++;
} else {
@@ -150,7 +253,73 @@
<div class="progress-bar">
<div class="progress-fill" style:width="{progress.percent}%"></div>
</div>
<p class="progress-text">{progress.answered} von {progress.total} Fragen beantwortet</p>
<div class="progress-row">
<p class="progress-text">{progress.answered} von {progress.total} Fragen beantwortet</p>
{#if tts.isSupported}
<div class="voice-controls">
<div class="voice-toggles">
<button
class="voice-toggle"
class:active={voiceLevel === 'voice'}
onclick={() => {
voiceLevel = voiceLevel === 'voice' ? 'off' : 'voice';
if (voiceLevel === 'off') cancelVoiceFlow();
}}
title="Voice-Modus: Fragen werden vorgelesen, Antworten per Sprache"
>
<svg
width="14"
height="14"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
</svg>
<span>Voice</span>
</button>
<button
class="voice-toggle"
class:active={voiceLevel === 'conversation'}
onclick={() => {
voiceLevel = voiceLevel === 'conversation' ? 'off' : 'conversation';
if (voiceLevel === 'off') cancelVoiceFlow();
}}
title="Gesprächs-Modus: Fließendes Interview — Antworten werden automatisch gespeichert"
>
<svg
width="14"
height="14"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M21 15a2 2 0 0 1-2 2H7l-4 4V5a2 2 0 0 1 2-2h14a2 2 0 0 1 2 2z"></path>
</svg>
<span>Gespräch</span>
</button>
</div>
{#if voiceMode}
<select
class="voice-picker"
value={tts.voice}
onchange={(e) => tts.setVoice(e.currentTarget.value as any)}
>
{#each VOICES as v (v.key)}
<option value={v.key}>{v.label}</option>
{/each}
</select>
{/if}
</div>
{/if}
</div>
{/if}
<div class="categories">
@@ -167,29 +336,133 @@
{/each}
</div>
{#if conversationMode}
<div class="conversation-banner">
<span>Gesprächs-Modus aktiv — Antworten werden automatisch gespeichert</span>
<button
class="banner-stop"
onclick={() => {
voiceLevel = 'off';
cancelVoiceFlow();
}}>Beenden</button
>
</div>
{/if}
{#if currentQuestion}
<div class="question-card">
<h3 class="question-text">{currentQuestion.question}</h3>
<div class="question-header">
<h3 class="question-text">{currentQuestion.question}</h3>
{#if tts.speaking}
<span class="voice-indicator speaking" title="Liest vor...">
<svg
width="18"
height="18"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5"></polygon>
<path d="M15.54 8.46a5 5 0 0 1 0 7.07"></path>
</svg>
</span>
{/if}
</div>
{#if currentQuestion.hint}<p class="question-hint">{currentQuestion.hint}</p>{/if}
{#if stt.state === 'recording'}
<div class="voice-status recording">
<span class="rec-dot"></span>
Aufnahme läuft... ({Math.floor(stt.elapsedMs / 1000)}s)
<button class="voice-stop-btn" onclick={() => stt.toggle()}>Stopp</button>
</div>
{:else if stt.state === 'transcribing'}
<div class="voice-status transcribing">
<span class="spinner-small"></span>
Transkribiere...
</div>
{:else if stt.state === 'loading'}
<div class="voice-status loading">
<span class="spinner-small"></span>
Lade Sprachmodell...
</div>
{/if}
<div class="input-area">
{#if currentQuestion.inputType === 'text'}
<input
type="text"
class="text-input"
bind:value={inputValue}
placeholder={currentQuestion.hint ?? ''}
disabled={saving}
onkeydown={(e) => e.key === 'Enter' && handleAnswer()}
/>
<div class="input-with-mic">
<input
type="text"
class="text-input"
bind:value={inputValue}
placeholder={currentQuestion.hint ?? ''}
disabled={saving}
onkeydown={(e) => e.key === 'Enter' && handleAnswer()}
/>
{#if stt.isSupported}
<button
class="mic-btn"
class:recording={stt.state === 'recording'}
onclick={toggleMicForCurrentQuestion}
disabled={saving || stt.state === 'transcribing' || stt.state === 'loading'}
title={stt.state === 'recording' ? 'Aufnahme stoppen' : 'Per Sprache antworten'}
>
<svg
width="16"
height="16"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>
</button>
{/if}
</div>
{:else if currentQuestion.inputType === 'textarea'}
<textarea
class="textarea-input"
bind:value={inputValue}
placeholder={currentQuestion.hint ?? ''}
disabled={saving}
rows="3"
></textarea>
<div class="textarea-with-mic">
<textarea
class="textarea-input"
bind:value={inputValue}
placeholder={currentQuestion.hint ?? ''}
disabled={saving}
rows="3"
></textarea>
{#if stt.isSupported}
<button
class="mic-btn textarea-mic"
class:recording={stt.state === 'recording'}
onclick={toggleMicForCurrentQuestion}
disabled={saving || stt.state === 'transcribing' || stt.state === 'loading'}
title={stt.state === 'recording' ? 'Aufnahme stoppen' : 'Per Sprache antworten'}
>
<svg
width="16"
height="16"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>
</button>
{/if}
</div>
{:else if currentQuestion.inputType === 'time'}
<input type="time" class="time-input" bind:value={inputValue} disabled={saving} />
{:else if currentQuestion.inputType === 'choice'}
@@ -214,15 +487,42 @@
>{/each}
</div>
{/if}
<input
type="text"
class="text-input"
bind:value={tagInput}
placeholder={currentQuestion.hint ?? 'Eingabe + Enter'}
disabled={saving}
onkeydown={handleTagKeydown}
onblur={addTag}
/>
<div class="input-with-mic">
<input
type="text"
class="text-input"
bind:value={tagInput}
placeholder={currentQuestion.hint ?? 'Eingabe + Enter'}
disabled={saving}
onkeydown={handleTagKeydown}
onblur={addTag}
/>
{#if stt.isSupported}
<button
class="mic-btn"
class:recording={stt.state === 'recording'}
onclick={toggleMicForCurrentQuestion}
disabled={saving || stt.state === 'transcribing' || stt.state === 'loading'}
title={stt.state === 'recording' ? 'Aufnahme stoppen' : 'Per Sprache antworten'}
>
<svg
width="16"
height="16"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>
</button>
{/if}
</div>
</div>
{:else if currentQuestion.inputType === 'weekdays'}
<div class="weekdays">
@@ -564,4 +864,217 @@
border: none;
padding: 0;
}
/* ── Voice mode ────────────────────────────── */
.progress-row {
display: flex;
align-items: center;
justify-content: space-between;
gap: 0.5rem;
}
.voice-controls {
display: flex;
align-items: center;
gap: 0.375rem;
}
.voice-toggles {
display: flex;
gap: 0.25rem;
}
.voice-picker {
padding: 0.25rem 0.5rem;
border: 1px solid hsl(var(--color-border));
border-radius: 999px;
background: transparent;
color: hsl(var(--color-foreground));
font-size: 0.6875rem;
outline: none;
cursor: pointer;
}
.voice-picker:focus {
border-color: hsl(var(--color-primary));
}
.voice-toggle {
display: inline-flex;
align-items: center;
gap: 0.375rem;
padding: 0.25rem 0.625rem;
border: 1px solid hsl(var(--color-border));
border-radius: 999px;
background: transparent;
color: hsl(var(--color-muted-foreground));
font-size: 0.6875rem;
cursor: pointer;
transition:
background 0.15s,
border-color 0.15s,
color 0.15s;
white-space: nowrap;
}
.voice-toggle:hover {
background: hsl(var(--color-surface-hover));
}
.voice-toggle.active {
background: hsl(var(--color-primary) / 0.1);
border-color: hsl(var(--color-primary));
color: hsl(var(--color-primary));
}
.question-header {
display: flex;
align-items: flex-start;
gap: 0.5rem;
}
.question-header .question-text {
flex: 1;
}
.voice-indicator {
flex-shrink: 0;
display: flex;
align-items: center;
color: hsl(var(--color-primary));
}
.voice-indicator.speaking {
animation: pulse-voice 1s ease-in-out infinite;
}
@keyframes pulse-voice {
0%,
100% {
opacity: 1;
}
50% {
opacity: 0.4;
}
}
.voice-status {
display: flex;
align-items: center;
gap: 0.5rem;
padding: 0.5rem 0.75rem;
border-radius: 0.5rem;
font-size: 0.8125rem;
}
.voice-status.recording {
background: hsl(0 70% 50% / 0.08);
color: hsl(0 70% 45%);
}
.voice-status.transcribing,
.voice-status.loading {
background: hsl(var(--color-primary) / 0.08);
color: hsl(var(--color-primary));
}
.rec-dot {
width: 0.5rem;
height: 0.5rem;
border-radius: 50%;
background: hsl(0 70% 50%);
animation: pulse-rec 1s ease-in-out infinite;
}
@keyframes pulse-rec {
0%,
100% {
opacity: 1;
transform: scale(1);
}
50% {
opacity: 0.5;
transform: scale(1.3);
}
}
.voice-stop-btn {
margin-left: auto;
padding: 0.25rem 0.625rem;
border: 1px solid currentColor;
border-radius: 999px;
background: transparent;
color: inherit;
font-size: 0.75rem;
cursor: pointer;
}
.spinner-small {
width: 0.875rem;
height: 0.875rem;
border: 2px solid currentColor;
border-top-color: transparent;
border-radius: 50%;
animation: spin 0.6s linear infinite;
}
@keyframes spin {
to {
transform: rotate(360deg);
}
}
.input-with-mic {
display: flex;
gap: 0.375rem;
align-items: center;
}
.input-with-mic .text-input {
flex: 1;
}
.textarea-with-mic {
position: relative;
}
.textarea-with-mic .textarea-input {
width: 100%;
}
.mic-btn {
display: flex;
align-items: center;
justify-content: center;
width: 2.25rem;
height: 2.25rem;
border: 1px solid hsl(var(--color-border));
border-radius: 0.5rem;
background: transparent;
color: hsl(var(--color-muted-foreground));
cursor: pointer;
flex-shrink: 0;
transition:
background 0.15s,
border-color 0.15s,
color 0.15s;
}
.mic-btn:hover:not(:disabled) {
background: hsl(var(--color-surface-hover));
color: hsl(var(--color-foreground));
}
.mic-btn.recording {
background: hsl(0 70% 50% / 0.1);
border-color: hsl(0 70% 50%);
color: hsl(0 70% 45%);
animation: pulse-rec 1s ease-in-out infinite;
}
.mic-btn:disabled {
opacity: 0.4;
cursor: not-allowed;
}
.textarea-mic {
position: absolute;
right: 0.375rem;
bottom: 0.375rem;
}
.conversation-banner {
display: flex;
align-items: center;
justify-content: space-between;
gap: 0.5rem;
padding: 0.5rem 0.75rem;
border-radius: 0.5rem;
background: hsl(var(--color-primary) / 0.08);
color: hsl(var(--color-primary));
font-size: 0.75rem;
}
.banner-stop {
padding: 0.25rem 0.625rem;
border: 1px solid hsl(var(--color-primary) / 0.3);
border-radius: 999px;
background: transparent;
color: hsl(var(--color-primary));
font-size: 0.6875rem;
cursor: pointer;
white-space: nowrap;
}
.banner-stop:hover {
background: hsl(var(--color-primary) / 0.1);
}
</style>


@@ -15,17 +15,25 @@
import ContextOverview from './ContextOverview.svelte';
import ContextInterview from './ContextInterview.svelte';
import ContextFreeform from './ContextFreeform.svelte';
import { useUserContext } from './queries';
import { getProgress } from './questions';
type Tab = 'overview' | 'interview' | 'freeform' | 'account';
type InterviewStartMode = 'text' | 'voice' | 'conversation';
let apiProfile = $state<ApiUserProfile | null>(null);
let loading = $state(true);
let activeTab = $state<Tab>('overview');
let interviewStartMode = $state<InterviewStartMode | null>(null);
let showEditModal = $state(false);
let showPasswordModal = $state(false);
let showDeleteModal = $state(false);
let ctx$ = useUserContext();
let ctx = $derived(ctx$.value);
let progress = $derived(getProgress(ctx?.interview?.answeredIds ?? []));
onMount(async () => {
try {
apiProfile = await profileService.getProfile();
@@ -43,6 +51,11 @@
{ key: 'account', label: 'Konto' },
];
function startInterview(mode: InterviewStartMode) {
interviewStartMode = mode;
activeTab = 'interview';
}
function handleProfileUpdate(user: ApiUserProfile) {
apiProfile = user;
toast.success('Profil erfolgreich aktualisiert');
@@ -71,7 +84,10 @@
<button
class="tab-btn"
class:active={activeTab === tab.key}
onclick={() => (activeTab = tab.key)}
onclick={() => {
activeTab = tab.key;
if (tab.key !== 'interview') interviewStartMode = null;
}}
>
{tab.label}
</button>
@@ -81,9 +97,99 @@
<!-- Tab content -->
<div class="tab-content">
{#if activeTab === 'overview'}
<ContextOverview user={apiProfile} onStartInterview={() => (activeTab = 'interview')} />
<ContextOverview user={apiProfile} onStartInterview={() => startInterview('text')} />
<!-- Interview start hero -->
<div class="interview-hero">
<div class="hero-header">
<h3 class="hero-title">Interview starten</h3>
<p class="hero-subtitle">
{#if progress.percent > 0}
{progress.answered} von {progress.total} Fragen beantwortet — mach weiter!
{:else}
Erzähl Mana mehr über dich, damit die App besser zu dir passt.
{/if}
</p>
{#if progress.percent > 0}
<div class="hero-progress">
<div class="hero-progress-fill" style:width="{progress.percent}%"></div>
</div>
{/if}
</div>
<div class="hero-options">
<button class="hero-option" onclick={() => startInterview('text')}>
<span class="hero-option-icon">
<svg
width="24"
height="24"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="1.5"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M17 3a2.85 2.83 0 1 1 4 4L7.5 20.5 2 22l1.5-5.5Z"></path>
</svg>
</span>
<span class="hero-option-text">
<strong>Per Text</strong>
<span>Fragen lesen und tippen</span>
</span>
</button>
<button class="hero-option voice" onclick={() => startInterview('voice')}>
<span class="hero-option-icon">
<svg
width="24"
height="24"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="1.5"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>
</span>
<span class="hero-option-text">
<strong>Per Sprache</strong>
<span>Fragen hören und sprechen</span>
</span>
</button>
<button class="hero-option conversation" onclick={() => startInterview('conversation')}>
<span class="hero-option-icon">
<svg
width="24"
height="24"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="1.5"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M21 15a2 2 0 0 1-2 2H7l-4 4V5a2 2 0 0 1 2-2h14a2 2 0 0 1 2 2z"></path>
</svg>
</span>
<span class="hero-option-text">
<strong>Als Gespräch</strong>
<span>Fließend — Antworten werden automatisch gespeichert</span>
</span>
</button>
</div>
</div>
{:else if activeTab === 'interview'}
<ContextInterview />
<ContextInterview
initialVoiceLevel={interviewStartMode === 'conversation'
? 'conversation'
: interviewStartMode === 'voice'
? 'voice'
: undefined}
/>
{:else if activeTab === 'freeform'}
<ContextFreeform />
{:else if activeTab === 'account'}
@@ -280,4 +386,95 @@
.account-btn.danger:hover {
background: hsl(var(--color-destructive, 0 84% 60%) / 0.08);
}
/* ── Interview hero ──────────────────────── */
.interview-hero {
margin-top: 1rem;
border: 1px solid hsl(var(--color-border));
border-radius: 0.75rem;
background: hsl(var(--color-card));
overflow: hidden;
}
.hero-header {
padding: 1.25rem 1.25rem 1rem;
}
.hero-title {
margin: 0;
font-size: 1.0625rem;
font-weight: 600;
}
.hero-subtitle {
margin: 0.25rem 0 0;
font-size: 0.8125rem;
color: hsl(var(--color-muted-foreground));
}
.hero-progress {
height: 4px;
margin-top: 0.75rem;
background: hsl(var(--color-border));
border-radius: 2px;
overflow: hidden;
}
.hero-progress-fill {
height: 100%;
background: hsl(var(--color-primary));
border-radius: 2px;
transition: width 0.3s ease;
}
.hero-options {
display: flex;
flex-direction: column;
border-top: 1px solid hsl(var(--color-border));
}
.hero-option {
display: flex;
align-items: center;
gap: 0.875rem;
padding: 1rem 1.25rem;
border: none;
border-bottom: 1px solid hsl(var(--color-border));
background: transparent;
color: hsl(var(--color-foreground));
cursor: pointer;
text-align: left;
transition: background 0.15s;
}
.hero-option:last-child {
border-bottom: none;
}
.hero-option:hover {
background: hsl(var(--color-surface-hover));
}
.hero-option-icon {
display: flex;
align-items: center;
justify-content: center;
width: 2.5rem;
height: 2.5rem;
border-radius: 0.625rem;
background: hsl(var(--color-muted) / 0.5);
color: hsl(var(--color-muted-foreground));
flex-shrink: 0;
}
.hero-option.voice .hero-option-icon {
background: hsl(var(--color-primary) / 0.1);
color: hsl(var(--color-primary));
}
.hero-option.conversation .hero-option-icon {
background: hsl(142 71% 45% / 0.1);
color: hsl(142 71% 35%);
}
.hero-option-text {
display: flex;
flex-direction: column;
gap: 0.125rem;
}
.hero-option-text strong {
font-size: 0.875rem;
font-weight: 600;
}
.hero-option-text span {
font-size: 0.75rem;
color: hsl(var(--color-muted-foreground));
}
</style>


@@ -0,0 +1,159 @@
/**
* useInterviewTts(): plays pre-rendered interview question audio.
*
* Audio files live in /audio/interview/{voiceKey}/{questionId}.mp3
* where voiceKey is one of: de-f, de-m, ch-f, ch-m
*
* Falls back to Web Speech API if the audio file is missing.
*/
export type VoiceKey = 'de-f' | 'de-m' | 'ch-f' | 'ch-m';
export interface VoiceMeta {
key: VoiceKey;
label: string;
lang: string;
gender: string;
}
export const VOICES: VoiceMeta[] = [
{ key: 'de-f', label: 'Seraphina (DE)', lang: 'Deutsch', gender: 'Weiblich' },
{ key: 'de-m', label: 'Florian (DE)', lang: 'Deutsch', gender: 'Männlich' },
{ key: 'ch-f', label: 'Leni (CH)', lang: 'Schweizerdeutsch', gender: 'Weiblich' },
{ key: 'ch-m', label: 'Jan (CH)', lang: 'Schweizerdeutsch', gender: 'Männlich' },
];
const STORAGE_KEY = 'mana.interview.voice';
const DEFAULT_VOICE: VoiceKey = 'de-f';
export interface InterviewTtsHandle {
/** Whether audio is currently playing */
readonly speaking: boolean;
/** Always true — we have pre-rendered audio */
readonly isSupported: boolean;
/** Currently selected voice */
readonly voice: VoiceKey;
/** Set the voice */
setVoice: (key: VoiceKey) => void;
/** Play the audio for a question. Resolves when done. */
speak: (questionId: string, fallbackText?: string) => Promise<void>;
/** Stop playback immediately. */
stop: () => void;
}
export function useInterviewTts(): InterviewTtsHandle {
let speaking = $state(false);
let voice = $state<VoiceKey>(loadVoice());
let currentAudio: HTMLAudioElement | null = null;
function loadVoice(): VoiceKey {
if (typeof window === 'undefined') return DEFAULT_VOICE;
const stored = localStorage.getItem(STORAGE_KEY);
if (stored && VOICES.some((v) => v.key === stored)) return stored as VoiceKey;
return DEFAULT_VOICE;
}
function setVoice(key: VoiceKey) {
voice = key;
if (typeof window !== 'undefined') {
localStorage.setItem(STORAGE_KEY, key);
}
}
function speak(questionId: string, fallbackText?: string): Promise<void> {
stop();
const audioUrl = `/audio/interview/${voice}/${questionId}.mp3`;
return new Promise<void>((resolve) => {
const audio = new Audio(audioUrl);
currentAudio = audio;
audio.addEventListener(
'canplaythrough',
() => {
speaking = true;
audio.play().catch(() => {
// Autoplay blocked — try Web Speech API fallback
speaking = false;
if (fallbackText) {
speakFallback(fallbackText).then(resolve);
} else {
resolve();
}
});
},
{ once: true }
);
audio.addEventListener(
'ended',
() => {
speaking = false;
currentAudio = null;
resolve();
},
{ once: true }
);
audio.addEventListener(
'error',
() => {
// File not found — fallback to Web Speech API
speaking = false;
currentAudio = null;
if (fallbackText) {
speakFallback(fallbackText).then(resolve);
} else {
resolve();
}
},
{ once: true }
);
audio.load();
});
}
function stop() {
if (currentAudio) {
currentAudio.pause();
currentAudio.src = '';
currentAudio = null;
}
speaking = false;
}
return {
get speaking() {
return speaking;
},
get isSupported() {
return true;
},
get voice() {
return voice;
},
setVoice,
speak,
stop,
};
}
/** Web Speech API fallback for missing audio files. */
function speakFallback(text: string): Promise<void> {
if (typeof window === 'undefined' || !('speechSynthesis' in window)) {
return Promise.resolve();
}
speechSynthesis.cancel();
return new Promise<void>((resolve) => {
const utterance = new SpeechSynthesisUtterance(text);
utterance.lang = 'de-DE';
utterance.rate = 0.92;
utterance.onend = () => resolve();
utterance.onerror = () => resolve();
speechSynthesis.speak(utterance);
});
}


@@ -16,6 +16,8 @@ Text-to-Speech microservice. Wraps Kokoro (English presets), Piper (German, loca
| **Framework** | FastAPI |
| **English (preset)** | Kokoro-82M (`kokoro_service.py`) |
| **German (local)** | Piper ONNX with `kerstin_low.onnx` and `thorsten_medium.onnx` voices (`piper_service.py`) |
| **German (high-quality)** | Orpheus-3B German finetune (`orpheus_service.py`) — best for pre-generation |
| **Multilingual (expressive)** | Zonos v0.1 by Zyphra (`zonos_service.py`) — emotion control, 200k hours training |
| **Voice cloning** | F5-TTS on CUDA (`f5_service.py`) |
| **Audio I/O** | `soundfile`, `pydub` |
| **Auth** | Per-key + internal-key API auth (`auth.py`) + JWT via mana-auth (`external_auth.py`) |
@@ -43,6 +45,8 @@ Public URL: `https://gpu-tts.mana.how`.
| DELETE | `/voices/{voice_id}` | Delete a custom voice |
| POST | `/synthesize/kokoro` | Kokoro synthesis (English presets) |
| POST | `/synthesize` | F5-TTS voice cloning |
| POST | `/synthesize/orpheus` | Orpheus synthesis (German, high-quality, pre-generation) |
| POST | `/synthesize/zonos` | Zonos synthesis (multilingual, expressive, emotion control) |
| POST | `/synthesize/auto` | Routing helper — picks the right backend for the requested voice |
All non-health endpoints require `Authorization: Bearer <token>` (per-app key, internal key, or mana-auth JWT).
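
For illustration, a call against the new Orpheus endpoint from Python might look like this (a sketch; the bearer token and text are placeholders, and the field names follow the `OrpheusRequest` model shown later in this diff):

```python
# Hypothetical client call; token and text are placeholders.
import httpx

resp = httpx.post(
    "https://gpu-tts.mana.how/synthesize/orpheus",
    json={"text": "Wie sieht dein Morgen aus?", "voice": "tara", "output_format": "mp3"},
    headers={"Authorization": "Bearer <token>"},
    timeout=120.0,  # synthesis is not real-time
)
resp.raise_for_status()
with open("question.mp3", "wb") as f:
    f.write(resp.content)
```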
@@ -59,6 +63,12 @@ All non-health endpoints require `Authorization: Bearer <token>` (per-app key, i
Fallback to Edge TTS cloud voices if Piper isn't loaded.
### Orpheus-3B German (high-quality pre-generation)
~8 GB VRAM. German finetune (`Kartoffel/Orpheus-3B_german_natural-v0.1`). Natural intonation, built-in speaker voices (tara, leo, emma, ...). Best quality for pre-generating static audio files. Not real-time.
### Zonos v0.1 (expressive multilingual)
~5 GB VRAM. By Zyphra, trained on 200k hours. Explicit German support. Fine-grained control: emotion (neutral/friendly/warm/curious), speaking rate, pitch variation. Can clone voices from 5s reference audio.
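
A Zonos request exposes the knobs described above; a sketch (values illustrative, fields per the `ZonosRequest` model later in this diff):

```python
# Hypothetical Zonos request exercising emotion, rate, and pitch control.
import httpx

resp = httpx.post(
    "https://gpu-tts.mana.how/synthesize/zonos",
    json={
        "text": "Erzähl mir von deinem Tag.",
        "language": "de",
        "emotion": "warm",      # one of: neutral, friendly, warm, curious
        "speaking_rate": 13.0,  # phonemes per second
        "pitch_std": 20.0,      # pitch variation in Hz
        "output_format": "wav",
    },
    headers={"Authorization": "Bearer <token>"},
    timeout=120.0,
)
```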
### F5-TTS (voice cloning)
~6 GB. Requires reference audio + transcript. Higher quality, slower. Custom voices live in `voices/` (reference audio + transcript per voice ID).
@@ -84,6 +94,8 @@ services/mana-tts/
│ ├── kokoro_service.py # Kokoro (English presets)
│ ├── piper_service.py # Piper (German, local ONNX)
│ ├── f5_service.py # F5-TTS (voice cloning, CUDA)
│ ├── orpheus_service.py # Orpheus-3B German (high-quality)
│ ├── zonos_service.py # Zonos v0.1 (expressive multilingual)
│ ├── voice_manager.py # Custom voice registry
│ ├── audio_utils.py # Format conversion, resampling
│ ├── auth.py # API-key auth


@@ -42,6 +42,17 @@ from .piper_service import (
PIPER_VOICES,
is_piper_loaded,
)
from .orpheus_service import (
synthesize_orpheus,
is_orpheus_loaded,
ORPHEUS_VOICES,
DEFAULT_VOICE as DEFAULT_ORPHEUS_VOICE,
)
from .zonos_service import (
synthesize_zonos,
is_zonos_loaded,
EMOTION_PRESETS as ZONOS_EMOTIONS,
)
# Configure logging
logging.basicConfig(
@@ -203,6 +214,8 @@ async def health_check():
models_loaded={
"kokoro": is_kokoro_loaded(),
"f5": is_f5_loaded(),
"orpheus": is_orpheus_loaded(),
"zonos": is_zonos_loaded(),
},
auth_required=REQUIRE_AUTH,
)
@@ -528,6 +541,160 @@ async def synthesize_with_f5(
cleanup_temp_file(temp_file_path)
# ============================================================================
# Orpheus TTS Endpoint (German, high-quality)
# ============================================================================
class OrpheusRequest(BaseModel):
"""Request for Orpheus TTS synthesis."""
text: str = Field(..., description="Text to synthesize (German)", max_length=5000)
voice: str = Field(DEFAULT_ORPHEUS_VOICE, description="Speaker voice")
output_format: str = Field("wav", description="Output format (wav, mp3)")
temperature: float = Field(0.6, ge=0.1, le=1.5, description="Sampling temperature")
@app.post("/synthesize/orpheus")
async def synthesize_with_orpheus(
request: OrpheusRequest,
auth: AuthResult = Depends(verify_api_key),
):
"""
Synthesize German speech using Orpheus TTS.
High-quality German synthesis with natural intonation.
Not optimized for real-time; designed for pre-generation.
"""
if not request.text.strip():
raise HTTPException(status_code=400, detail="Text cannot be empty")
if len(request.text) > MAX_TEXT_LENGTH:
raise HTTPException(
status_code=400,
detail=f"Text exceeds maximum length of {MAX_TEXT_LENGTH} characters",
)
output_format = request.output_format.lower()
if output_format not in SUPPORTED_FORMATS:
raise HTTPException(
status_code=400,
detail=f"Unsupported format. Use one of: {SUPPORTED_FORMATS}",
)
try:
result = await synthesize_orpheus(
text=request.text,
voice=request.voice,
temperature=request.temperature,
)
audio_bytes, content_type = convert_audio(
result.audio,
result.sample_rate,
output_format,
)
return Response(
content=audio_bytes,
media_type=content_type,
headers={
"X-Model": "orpheus-german",
"X-Voice": result.voice,
"X-Duration": str(result.duration),
"X-Sample-Rate": str(result.sample_rate),
},
)
except RuntimeError as e:
raise HTTPException(status_code=500, detail=str(e))
except Exception as e:
logger.error(f"Orpheus synthesis error: {e}")
raise HTTPException(status_code=500, detail=f"Orpheus synthesis failed: {e}")
# ============================================================================
# Zonos TTS Endpoint (Multilingual, expressive)
# ============================================================================
class ZonosRequest(BaseModel):
"""Request for Zonos TTS synthesis."""
text: str = Field(..., description="Text to synthesize", max_length=5000)
language: str = Field("de", description="Language code")
emotion: str = Field("friendly", description="Emotion preset: neutral, friendly, warm, curious")
speaking_rate: float = Field(13.0, ge=5.0, le=25.0, description="Phonemes per second")
pitch_std: float = Field(20.0, ge=5.0, le=50.0, description="Pitch variation in Hz")
output_format: str = Field("wav", description="Output format (wav, mp3)")
@app.post("/synthesize/zonos")
async def synthesize_with_zonos(
request: ZonosRequest,
auth: AuthResult = Depends(verify_api_key),
):
"""
Synthesize speech using Zonos TTS by Zyphra.
Expressive multilingual synthesis with emotion control.
Trained on 200k hours; explicit German support.
"""
if not request.text.strip():
raise HTTPException(status_code=400, detail="Text cannot be empty")
if len(request.text) > MAX_TEXT_LENGTH:
raise HTTPException(
status_code=400,
detail=f"Text exceeds maximum length of {MAX_TEXT_LENGTH} characters",
)
output_format = request.output_format.lower()
if output_format not in SUPPORTED_FORMATS:
raise HTTPException(
status_code=400,
detail=f"Unsupported format. Use one of: {SUPPORTED_FORMATS}",
)
if request.emotion not in ZONOS_EMOTIONS:
raise HTTPException(
status_code=400,
detail=f"Unknown emotion. Use one of: {list(ZONOS_EMOTIONS.keys())}",
)
try:
result = await synthesize_zonos(
text=request.text,
language=request.language,
emotion=request.emotion,
speaking_rate=request.speaking_rate,
pitch_std=request.pitch_std,
)
audio_bytes, content_type = convert_audio(
result.audio,
result.sample_rate,
output_format,
)
return Response(
content=audio_bytes,
media_type=content_type,
headers={
"X-Model": "zonos-v0.1",
"X-Emotion": result.emotion,
"X-Duration": str(result.duration),
"X-Sample-Rate": str(result.sample_rate),
},
)
except RuntimeError as e:
raise HTTPException(status_code=500, detail=str(e))
except Exception as e:
logger.error(f"Zonos synthesis error: {e}")
raise HTTPException(status_code=500, detail=f"Zonos synthesis failed: {e}")
# ============================================================================
# Auto-Selection Endpoint
# ============================================================================


@@ -0,0 +1,229 @@
"""
Orpheus TTS: high-quality German speech synthesis.
Uses the Orpheus-TTS model with a German finetune for natural-sounding
interview question generation. Not optimized for real-time; quality first.
Model: Kartoffel/Orpheus-3B_german_natural-v0.1 (HuggingFace)
VRAM: ~8 GB (fits comfortably on RTX 3090 alongside other models)
"""
import logging
import asyncio
from dataclasses import dataclass
from typing import Optional
import numpy as np
logger = logging.getLogger(__name__)
# Lazy-loaded model state
_model = None
_tokenizer = None
_loaded = False
MODEL_ID = "Vishalshendge3198/orpheus-3b-tts-german-emotional-merged"
SAMPLE_RATE = 24000
# Available voices (Orpheus built-in speaker tags)
ORPHEUS_VOICES = {
"tara": "Female, warm and clear (default)",
"leah": "Female, soft and friendly",
"jess": "Female, energetic",
"leo": "Male, calm and professional",
"dan": "Male, deep and warm",
"mia": "Female, young and bright",
"zac": "Male, confident",
"emma": "Female, neutral",
}
DEFAULT_VOICE = "tara"
@dataclass
class OrpheusResult:
audio: np.ndarray
sample_rate: int
duration: float
voice: str
def is_orpheus_loaded() -> bool:
return _loaded
def get_orpheus_model():
"""Load the Orpheus German model (lazy, first call only)."""
global _model, _tokenizer, _loaded
if _loaded:
return _model, _tokenizer
logger.info(f"Loading Orpheus German model: {MODEL_ID}")
try:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
_tokenizer = AutoTokenizer.from_pretrained(
MODEL_ID,
trust_remote_code=True,
)
_model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
torch_dtype=torch.bfloat16,
device_map="cuda",
trust_remote_code=True,
)
_model.eval()
_loaded = True
logger.info("Orpheus German model loaded successfully")
return _model, _tokenizer
except Exception as e:
logger.error(f"Failed to load Orpheus model: {e}")
raise RuntimeError(f"Failed to load Orpheus model: {e}")
def unload_orpheus():
"""Free VRAM by unloading the model."""
global _model, _tokenizer, _loaded
import torch
if _model is not None:
del _model
_model = None
if _tokenizer is not None:
del _tokenizer
_tokenizer = None
_loaded = False
torch.cuda.empty_cache()
logger.info("Orpheus model unloaded")
async def synthesize_orpheus(
text: str,
voice: str = DEFAULT_VOICE,
temperature: float = 0.6,
top_p: float = 0.95,
max_new_tokens: int = 4096,
) -> OrpheusResult:
"""
Synthesize German speech using Orpheus TTS.
Returns OrpheusResult with audio as numpy float32 array.
"""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(
None,
_synthesize_sync,
text,
voice,
temperature,
top_p,
max_new_tokens,
)
def _synthesize_sync(
text: str,
voice: str,
temperature: float,
top_p: float,
max_new_tokens: int,
) -> OrpheusResult:
"""Synchronous synthesis (runs in thread pool)."""
import torch
model, tokenizer = get_orpheus_model()
# Orpheus uses a specific prompt format with speaker tags
prompt = f"<|speaker:{voice}|>{text}"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
do_sample=True,
)
# Extract audio tokens (model-specific decoding)
audio_tokens = outputs[0][inputs["input_ids"].shape[1]:]
# Decode audio tokens to waveform
# Orpheus uses a SNAC-based codec — tokens map to audio via the model's decode method
if hasattr(model, "decode_audio"):
audio_np = model.decode_audio(audio_tokens).cpu().numpy().flatten()
else:
# Fallback: decode via the SNAC codec when the model doesn't expose decode_audio.
# This handles different Orpheus model versions.
audio_np = _decode_orpheus_tokens(audio_tokens, model)
duration = len(audio_np) / SAMPLE_RATE
return OrpheusResult(
audio=audio_np,
sample_rate=SAMPLE_RATE,
duration=duration,
voice=voice,
)
def _decode_orpheus_tokens(tokens, model) -> np.ndarray:
"""
Decode Orpheus audio tokens using SNAC codec.
Orpheus generates special audio tokens that need to be decoded
through the SNAC vocoder to produce the final waveform.
"""
import torch
try:
from snac import SNAC
snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(model.device)
# Filter to audio-only tokens (above text vocab range)
audio_token_ids = tokens[tokens >= 128256].tolist()
if not audio_token_ids:
logger.warning("No audio tokens generated")
return np.zeros(SAMPLE_RATE, dtype=np.float32) # 1s silence
# Orpheus interleaves 3 codebook levels: [c1, c2, c3, c1, c2, c3, ...]
# Redistribute into separate codebook tensors
codes_0, codes_1, codes_2 = [], [], []
for i, token_id in enumerate(audio_token_ids):
# Offset tokens back to codebook range
code = token_id - 128256
level = i % 3
if level == 0:
codes_0.append(code)
elif level == 1:
codes_1.append(code)
else:
codes_2.append(code)
# Trim to equal lengths
min_len = min(len(codes_0), len(codes_1), len(codes_2))
if min_len == 0:
return np.zeros(SAMPLE_RATE, dtype=np.float32)
codes = [
torch.tensor(codes_0[:min_len], device=model.device).unsqueeze(0),
torch.tensor(codes_1[:min_len], device=model.device).unsqueeze(0),
torch.tensor(codes_2[:min_len], device=model.device).unsqueeze(0),
]
with torch.no_grad():
audio = snac.decode(codes).squeeze().cpu().numpy()
return audio.astype(np.float32)
except ImportError:
logger.error("snac package not installed — pip install snac")
raise RuntimeError("snac package required for Orpheus audio decoding")
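
A quick smoke test for this module might look like the following sketch (import path and sample text are assumptions; requires the GPU host with the model downloaded):

```python
# Hypothetical smoke test; adjust the import to the real package layout.
import asyncio

import soundfile as sf

from orpheus_service import synthesize_orpheus  # import path is an assumption

async def main() -> None:
    result = await synthesize_orpheus("Guten Morgen! Wie hast du geschlafen?", voice="tara")
    # OrpheusResult carries a float32 waveform at 24 kHz.
    sf.write("orpheus_test.wav", result.audio, result.sample_rate)

asyncio.run(main())
```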


@@ -0,0 +1,205 @@
"""
Zonos TTS: expressive multilingual speech synthesis by Zyphra.
Trained on 200k hours of speech data with explicit German support.
Fine-grained control over pitch, speaking rate, and emotions.
Model: Zyphra/Zonos-v0.1-transformer (HuggingFace)
VRAM: ~5 GB (fits comfortably on RTX 3090)
"""
import logging
import asyncio
import os
from dataclasses import dataclass
from typing import Optional
import numpy as np
# Disable torch.compile (requires MSVC cl.exe on Windows which we don't have)
os.environ["TORCHDYNAMO_DISABLE"] = "1"
logger = logging.getLogger(__name__)
# Lazy-loaded model state
_model = None
_loaded = False
MODEL_ID = "Zyphra/Zonos-v0.1-transformer"
SAMPLE_RATE = 44100 # Zonos outputs 44.1 kHz audio
# Emotion presets for the interview context
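# Assumed component order, per Zonos's make_cond_dict (not confirmed by this diff):
# [happiness, sadness, disgust, fear, surprise, anger, other, neutral]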
EMOTION_PRESETS = {
"neutral": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # neutral dominant
"friendly": [0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5], # happiness + neutral
"warm": [0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7], # slight warmth
"curious": [0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7], # interested
}
DEFAULT_EMOTION = "friendly"
@dataclass
class ZonosResult:
audio: np.ndarray
sample_rate: int
duration: float
emotion: str
def is_zonos_loaded() -> bool:
return _loaded
def get_zonos_model():
"""Load the Zonos model (lazy, first call only)."""
global _model, _loaded
if _loaded:
return _model
logger.info(f"Loading Zonos model: {MODEL_ID}")
try:
import torch
# Zonos provides its own loader
# Try the official zonos package first, fall back to transformers
try:
from zonos.model import Zonos
_model = Zonos.from_pretrained(MODEL_ID, device="cuda")
except ImportError:
# If zonos package not installed, use transformers
logger.info("zonos package not found, trying transformers loading")
from transformers import AutoModel
_model = AutoModel.from_pretrained(
MODEL_ID,
torch_dtype=torch.float32,
trust_remote_code=True,
).to("cuda")
_loaded = True
logger.info("Zonos model loaded successfully")
return _model
except Exception as e:
logger.error(f"Failed to load Zonos model: {e}")
raise RuntimeError(f"Failed to load Zonos model: {e}")
def unload_zonos():
"""Free VRAM by unloading the model."""
global _model, _loaded
import torch
if _model is not None:
del _model
_model = None
_loaded = False
torch.cuda.empty_cache()
logger.info("Zonos model unloaded")
async def synthesize_zonos(
text: str,
language: str = "de",
emotion: str = DEFAULT_EMOTION,
speaking_rate: float = 13.0,
pitch_std: float = 20.0,
speaker_audio: Optional[bytes] = None,
) -> ZonosResult:
"""
Synthesize speech using Zonos TTS.
Args:
text: Text to synthesize
language: Language code (default: 'de' for German)
emotion: Emotion preset name or custom emotion vector
speaking_rate: Speaking rate in phonemes/sec (default 13.0, range ~8-20)
pitch_std: Pitch variation in Hz (default 20.0, range ~5-50)
speaker_audio: Optional reference audio bytes for voice cloning
Returns ZonosResult with audio as numpy float32 array.
"""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(
None,
_synthesize_sync,
text,
language,
emotion,
speaking_rate,
pitch_std,
speaker_audio,
)
def _synthesize_sync(
text: str,
language: str,
emotion: str,
speaking_rate: float,
pitch_std: float,
speaker_audio: Optional[bytes],
) -> ZonosResult:
"""Synchronous synthesis (runs in thread pool)."""
import torch
from zonos.conditioning import make_cond_dict
model = get_zonos_model()
# Resolve emotion preset
emotion_values = EMOTION_PRESETS.get(emotion, EMOTION_PRESETS["friendly"])
# Build speaker embedding if reference audio provided
speaker_embedding = None
if speaker_audio:
speaker_embedding = _embed_speaker(speaker_audio, model)
# Map language codes: Zonos expects espeak language codes like 'de' or 'en-us'
lang_map = {"de": "de", "en": "en-us", "fr": "fr-fr", "es": "es", "it": "it"}
espeak_lang = lang_map.get(language, language)
# Build conditioning using Zonos's own helper
cond = make_cond_dict(
text=text,
language=espeak_lang,
emotion=emotion_values,
speaking_rate=speaking_rate,
pitch_std=pitch_std,
speaker=speaker_embedding,
)
# Generate
with torch.no_grad():
conditioning = model.prepare_conditioning(cond)
codes = model.generate(conditioning)
audio = model.autoencoder.decode(codes).squeeze().cpu().numpy()
audio = audio.astype(np.float32)
duration = len(audio) / SAMPLE_RATE
return ZonosResult(
audio=audio,
sample_rate=SAMPLE_RATE,
duration=duration,
emotion=emotion,
)
def _embed_speaker(audio_bytes: bytes, model) -> "torch.Tensor":
"""Create speaker embedding from reference audio bytes."""
import torch
import io
import soundfile as sf
audio_data, sr = sf.read(io.BytesIO(audio_bytes))
if len(audio_data.shape) > 1:
audio_data = audio_data.mean(axis=1) # mono
audio_tensor = torch.tensor(audio_data, dtype=torch.float32, device="cuda").unsqueeze(0)
return model.make_speaker_embedding(audio_tensor, sr)
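
The compare script mentioned in the commit message is not shown in this diff; a minimal sketch of a side-by-side comparison under the same assumptions (module import paths are illustrative):

```python
# Hypothetical side-by-side comparison of the two new backends.
import asyncio

import soundfile as sf

from orpheus_service import synthesize_orpheus  # import paths are assumptions
from zonos_service import synthesize_zonos

async def compare(text: str) -> None:
    orpheus = await synthesize_orpheus(text, voice="tara")
    zonos = await synthesize_zonos(text, language="de", emotion="friendly")
    sf.write("compare_orpheus.wav", orpheus.audio, orpheus.sample_rate)
    sf.write("compare_zonos.wav", zonos.audio, zonos.sample_rate)

asyncio.run(compare("Was gibt dir im Alltag Energie?"))
```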


@@ -23,3 +23,13 @@ aiofiles>=24.1.0
# External Auth (mana-core-auth integration)
httpx>=0.27.0
# ── Orpheus TTS (German high-quality) ──
# Uses transformers + SNAC codec for audio decoding
transformers>=4.44.0
snac>=1.2.0
torch>=2.1.0
# ── Zonos TTS (expressive multilingual by Zyphra) ──
# Install via: pip install git+https://github.com/Zyphra/Zonos.git
# (the 'zonos' package pulls its own deps including torch, encodec, etc.)

Some files were not shown because too many files have changed in this diff.