feat(profile): voice interview with pre-rendered TTS audio + Orpheus/Zonos backends

Voice-based interview for the profile module — users choose between text,
voice (question read aloud + mic for answer), or conversation mode (fully
automatic flow with auto-save).

Interview audio:
- 92 pre-rendered MP3 files (23 questions × 4 voices) via Edge TTS (see sketch below)
- Voices: Seraphina (DE-f), Florian (DE-m), Leni (CH-f), Jan (CH-m)
- User picks voice via dropdown, persisted in localStorage
- Web Speech API fallback for missing audio files
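
Pre-rendering is a one-off batch job. A minimal sketch, assuming the `edge-tts` Python package, an output directory under `static/`, and a voice-key mapping inferred from the frontend's `VoiceKey` type (the exact Edge voice names are assumptions based on the Seraphina/Florian/Leni/Jan description above):

```python
# pregenerate_interview_audio.py (hypothetical sketch; paths and voice names are assumptions)
import asyncio
import os

import edge_tts

# Assumed mapping from app voice keys to Edge TTS voices; not confirmed by the diff.
EDGE_VOICES = {
    "de-f": "de-DE-SeraphinaMultilingualNeural",
    "de-m": "de-DE-FlorianMultilingualNeural",
    "ch-f": "de-CH-LeniNeural",
    "ch-m": "de-CH-JanNeural",
}

async def render_all(questions: dict[str, str]) -> None:
    for key, voice in EDGE_VOICES.items():
        os.makedirs(f"static/audio/interview/{key}", exist_ok=True)
        for question_id, text in questions.items():
            out = f"static/audio/interview/{key}/{question_id}.mp3"
            await edge_tts.Communicate(text, voice).save(out)

# asyncio.run(render_all(load_questions()))  # load_questions() is hypothetical
```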

Profile UI:
- Interview hero block on overview with 3 start modes (text/voice/conversation)
- Voice/conversation toggle + voice picker in interview view
- Mic button on text/textarea/tags inputs for per-question voice input
- Conversation mode: auto-save + auto-advance after STT transcription
- Recording/transcribing/speaking state indicators

mana-tts service:
- New Orpheus TTS backend (German finetune, SNAC codec)
- New Zonos TTS backend (Zyphra, 200k hours, emotion control)
- Endpoints: POST /synthesize/orpheus, POST /synthesize/zonos
- espeak-ng installed on GPU server for Zonos phonemizer
- Compare script for side-by-side voice quality testing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Till JS 2026-04-17 15:22:52 +02:00
parent 786ffd771b
commit 8823cc0bf0
101 changed files with 1597 additions and 31 deletions


@@ -2,7 +2,7 @@
Context Interview — Guided question flow that populates userContext.
-->
<script lang="ts">
import { onMount } from 'svelte';
import { onMount, onDestroy } from 'svelte';
import { useUserContext } from './queries';
import { userContextStore } from './stores/user-context.svelte';
import {
@@ -11,14 +11,19 @@
getProgress,
type ContextCategory,
type ContextQuestion,
type QuestionInputType,
} from './questions';
import { useInterviewTts, VOICES } from './use-interview-tts.svelte';
import { useLocalStt } from '$lib/components/voice/use-local-stt.svelte';
interface Props {
limitCategories?: ContextCategory[];
compact?: boolean;
/** If set, auto-start this voice level on mount. */
initialVoiceLevel?: 'voice' | 'conversation';
}
let { limitCategories, compact = false }: Props = $props();
let { limitCategories, compact = false, initialVoiceLevel }: Props = $props();
let ctx$ = useUserContext();
let ctx = $derived(ctx$.value);
@@ -29,8 +34,27 @@
let saving = $state(false);
let tagInput = $state('');
// ── Voice mode ──────────────────────────────────────
// 'off' = text only, 'voice' = TTS+STT per question, 'conversation' = auto-save + auto-advance
type VoiceLevel = 'off' | 'voice' | 'conversation';
const tts = useInterviewTts();
const stt = useLocalStt({ language: 'de' });
let voiceLevel = $state<VoiceLevel>('off');
let voiceMode = $derived(voiceLevel !== 'off');
let conversationMode = $derived(voiceLevel === 'conversation');
let voiceFlowActive = $state(false);
const VOICE_INPUT_TYPES: QuestionInputType[] = ['text', 'textarea', 'tags'];
onMount(() => {
void userContextStore.ensureDoc();
if (initialVoiceLevel) {
voiceLevel = initialVoiceLevel;
}
});
onDestroy(() => {
tts.stop();
if (stt.state === 'recording') stt.cancel();
});
let categories = $derived(
@@ -40,6 +64,9 @@
let currentQuestion = $derived(
categoryQuestions[currentQuestionIdx] as ContextQuestion | undefined
);
let currentSupportsVoice = $derived(
currentQuestion ? VOICE_INPUT_TYPES.includes(currentQuestion.inputType) : false
);
let progress = $derived(getProgress(ctx?.interview?.answeredIds ?? []));
let answeredSet = $derived(new Set(ctx?.interview?.answeredIds ?? []));
let categoryProgress = $derived.by(() => {
@@ -71,10 +98,85 @@
}
function selectCategory(key: ContextCategory) {
cancelVoiceFlow();
activeCategory = key;
currentQuestionIdx = 0;
}
// ── Voice flow: TTS → STT → fill input ──────────────
async function runVoiceFlow() {
if (!currentQuestion || !currentSupportsVoice) return;
voiceFlowActive = true;
// Step 1: Play pre-rendered question audio (falls back to Web Speech API)
await tts.speak(currentQuestion.id, currentQuestion.question);
// Step 2: Start mic recording (STT)
if (!voiceFlowActive) return; // cancelled during TTS
stt.toggle(); // starts recording
}
// Watch STT text — when transcription completes, fill the input.
// In conversation mode: auto-save + auto-advance to next question.
$effect(() => {
if (stt.state === 'idle' && stt.text && voiceFlowActive) {
applyVoiceTranscript(stt.text);
voiceFlowActive = false;
if (conversationMode) {
// Auto-save and advance after a brief pause so the user sees the transcript
setTimeout(() => handleAnswer(), 600);
}
}
});
// Auto-start voice flow when question changes in voice mode.
// Track only the question id to avoid re-triggering when ctx data updates.
let prevVoiceQuestionId = $state('');
$effect(() => {
const qid = currentQuestion?.id ?? '';
const shouldRun = voiceMode && currentSupportsVoice && qid && qid !== prevVoiceQuestionId;
if (shouldRun) {
prevVoiceQuestionId = qid;
const timeout = setTimeout(() => runVoiceFlow(), 300);
return () => clearTimeout(timeout);
}
});
function applyVoiceTranscript(transcript: string) {
if (!currentQuestion) return;
if (currentQuestion.inputType === 'tags') {
// Split transcript into tags by comma, "und", or line breaks
const parts = transcript
.split(/[,\n]|\bund\b/i)
.map((s) => s.trim())
.filter(Boolean);
const current = Array.isArray(inputValue) ? (inputValue as string[]) : [];
const merged = [...current];
for (const part of parts) {
if (!merged.includes(part)) merged.push(part);
}
inputValue = merged;
} else {
// text / textarea — replace content
inputValue = transcript;
}
}
function toggleMicForCurrentQuestion() {
if (stt.state === 'recording') {
stt.toggle(); // stop → transcribe
} else if (stt.state === 'idle') {
voiceFlowActive = true;
stt.toggle(); // start recording
}
}
function cancelVoiceFlow() {
voiceFlowActive = false;
tts.stop();
if (stt.state === 'recording') stt.cancel();
}
async function handleAnswer() {
if (!currentQuestion) return;
saving = true;
@@ -94,6 +196,7 @@
}
function advanceQuestion() {
cancelVoiceFlow();
if (currentQuestionIdx < categoryQuestions.length - 1) {
currentQuestionIdx++;
} else {
@@ -150,7 +253,73 @@
<div class="progress-bar">
<div class="progress-fill" style:width="{progress.percent}%"></div>
</div>
<p class="progress-text">{progress.answered} von {progress.total} Fragen beantwortet</p>
<div class="progress-row">
<p class="progress-text">{progress.answered} von {progress.total} Fragen beantwortet</p>
{#if tts.isSupported}
<div class="voice-controls">
<div class="voice-toggles">
<button
class="voice-toggle"
class:active={voiceLevel === 'voice'}
onclick={() => {
voiceLevel = voiceLevel === 'voice' ? 'off' : 'voice';
if (voiceLevel === 'off') cancelVoiceFlow();
}}
title="Voice-Modus: Fragen werden vorgelesen, Antworten per Sprache"
>
<svg
width="14"
height="14"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
</svg>
<span>Voice</span>
</button>
<button
class="voice-toggle"
class:active={voiceLevel === 'conversation'}
onclick={() => {
voiceLevel = voiceLevel === 'conversation' ? 'off' : 'conversation';
if (voiceLevel === 'off') cancelVoiceFlow();
}}
title="Gesprächs-Modus: Fließendes Interview — Antworten werden automatisch gespeichert"
>
<svg
width="14"
height="14"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M21 15a2 2 0 0 1-2 2H7l-4 4V5a2 2 0 0 1 2-2h14a2 2 0 0 1 2 2z"></path>
</svg>
<span>Gespräch</span>
</button>
</div>
{#if voiceMode}
<select
class="voice-picker"
value={tts.voice}
onchange={(e) => tts.setVoice(e.currentTarget.value as any)}
>
{#each VOICES as v (v.key)}
<option value={v.key}>{v.label}</option>
{/each}
</select>
{/if}
</div>
{/if}
</div>
{/if}
<div class="categories">
@@ -167,29 +336,133 @@
{/each}
</div>
{#if conversationMode}
<div class="conversation-banner">
<span>Gesprächs-Modus aktiv — Antworten werden automatisch gespeichert</span>
<button
class="banner-stop"
onclick={() => {
voiceLevel = 'off';
cancelVoiceFlow();
}}>Beenden</button
>
</div>
{/if}
{#if currentQuestion}
<div class="question-card">
<h3 class="question-text">{currentQuestion.question}</h3>
<div class="question-header">
<h3 class="question-text">{currentQuestion.question}</h3>
{#if tts.speaking}
<span class="voice-indicator speaking" title="Liest vor...">
<svg
width="18"
height="18"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5"></polygon>
<path d="M15.54 8.46a5 5 0 0 1 0 7.07"></path>
</svg>
</span>
{/if}
</div>
{#if currentQuestion.hint}<p class="question-hint">{currentQuestion.hint}</p>{/if}
{#if stt.state === 'recording'}
<div class="voice-status recording">
<span class="rec-dot"></span>
Aufnahme läuft... ({Math.floor(stt.elapsedMs / 1000)}s)
<button class="voice-stop-btn" onclick={() => stt.toggle()}>Stopp</button>
</div>
{:else if stt.state === 'transcribing'}
<div class="voice-status transcribing">
<span class="spinner-small"></span>
Transkribiere...
</div>
{:else if stt.state === 'loading'}
<div class="voice-status loading">
<span class="spinner-small"></span>
Lade Sprachmodell...
</div>
{/if}
<div class="input-area">
{#if currentQuestion.inputType === 'text'}
<input
type="text"
class="text-input"
bind:value={inputValue}
placeholder={currentQuestion.hint ?? ''}
disabled={saving}
onkeydown={(e) => e.key === 'Enter' && handleAnswer()}
/>
<div class="input-with-mic">
<input
type="text"
class="text-input"
bind:value={inputValue}
placeholder={currentQuestion.hint ?? ''}
disabled={saving}
onkeydown={(e) => e.key === 'Enter' && handleAnswer()}
/>
{#if stt.isSupported}
<button
class="mic-btn"
class:recording={stt.state === 'recording'}
onclick={toggleMicForCurrentQuestion}
disabled={saving || stt.state === 'transcribing' || stt.state === 'loading'}
title={stt.state === 'recording' ? 'Aufnahme stoppen' : 'Per Sprache antworten'}
>
<svg
width="16"
height="16"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>
</button>
{/if}
</div>
{:else if currentQuestion.inputType === 'textarea'}
<textarea
class="textarea-input"
bind:value={inputValue}
placeholder={currentQuestion.hint ?? ''}
disabled={saving}
rows="3"
></textarea>
<div class="textarea-with-mic">
<textarea
class="textarea-input"
bind:value={inputValue}
placeholder={currentQuestion.hint ?? ''}
disabled={saving}
rows="3"
></textarea>
{#if stt.isSupported}
<button
class="mic-btn textarea-mic"
class:recording={stt.state === 'recording'}
onclick={toggleMicForCurrentQuestion}
disabled={saving || stt.state === 'transcribing' || stt.state === 'loading'}
title={stt.state === 'recording' ? 'Aufnahme stoppen' : 'Per Sprache antworten'}
>
<svg
width="16"
height="16"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>
</button>
{/if}
</div>
{:else if currentQuestion.inputType === 'time'}
<input type="time" class="time-input" bind:value={inputValue} disabled={saving} />
{:else if currentQuestion.inputType === 'choice'}
@@ -214,15 +487,42 @@
>{/each}
</div>
{/if}
<input
type="text"
class="text-input"
bind:value={tagInput}
placeholder={currentQuestion.hint ?? 'Eingabe + Enter'}
disabled={saving}
onkeydown={handleTagKeydown}
onblur={addTag}
/>
<div class="input-with-mic">
<input
type="text"
class="text-input"
bind:value={tagInput}
placeholder={currentQuestion.hint ?? 'Eingabe + Enter'}
disabled={saving}
onkeydown={handleTagKeydown}
onblur={addTag}
/>
{#if stt.isSupported}
<button
class="mic-btn"
class:recording={stt.state === 'recording'}
onclick={toggleMicForCurrentQuestion}
disabled={saving || stt.state === 'transcribing' || stt.state === 'loading'}
title={stt.state === 'recording' ? 'Aufnahme stoppen' : 'Per Sprache antworten'}
>
<svg
width="16"
height="16"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>
</button>
{/if}
</div>
</div>
{:else if currentQuestion.inputType === 'weekdays'}
<div class="weekdays">
@@ -564,4 +864,217 @@
border: none;
padding: 0;
}
/* ── Voice mode ────────────────────────────── */
.progress-row {
display: flex;
align-items: center;
justify-content: space-between;
gap: 0.5rem;
}
.voice-controls {
display: flex;
align-items: center;
gap: 0.375rem;
}
.voice-toggles {
display: flex;
gap: 0.25rem;
}
.voice-picker {
padding: 0.25rem 0.5rem;
border: 1px solid hsl(var(--color-border));
border-radius: 999px;
background: transparent;
color: hsl(var(--color-foreground));
font-size: 0.6875rem;
outline: none;
cursor: pointer;
}
.voice-picker:focus {
border-color: hsl(var(--color-primary));
}
.voice-toggle {
display: inline-flex;
align-items: center;
gap: 0.375rem;
padding: 0.25rem 0.625rem;
border: 1px solid hsl(var(--color-border));
border-radius: 999px;
background: transparent;
color: hsl(var(--color-muted-foreground));
font-size: 0.6875rem;
cursor: pointer;
transition:
background 0.15s,
border-color 0.15s,
color 0.15s;
white-space: nowrap;
}
.voice-toggle:hover {
background: hsl(var(--color-surface-hover));
}
.voice-toggle.active {
background: hsl(var(--color-primary) / 0.1);
border-color: hsl(var(--color-primary));
color: hsl(var(--color-primary));
}
.question-header {
display: flex;
align-items: flex-start;
gap: 0.5rem;
}
.question-header .question-text {
flex: 1;
}
.voice-indicator {
flex-shrink: 0;
display: flex;
align-items: center;
color: hsl(var(--color-primary));
}
.voice-indicator.speaking {
animation: pulse-voice 1s ease-in-out infinite;
}
@keyframes pulse-voice {
0%,
100% {
opacity: 1;
}
50% {
opacity: 0.4;
}
}
.voice-status {
display: flex;
align-items: center;
gap: 0.5rem;
padding: 0.5rem 0.75rem;
border-radius: 0.5rem;
font-size: 0.8125rem;
}
.voice-status.recording {
background: hsl(0 70% 50% / 0.08);
color: hsl(0 70% 45%);
}
.voice-status.transcribing,
.voice-status.loading {
background: hsl(var(--color-primary) / 0.08);
color: hsl(var(--color-primary));
}
.rec-dot {
width: 0.5rem;
height: 0.5rem;
border-radius: 50%;
background: hsl(0 70% 50%);
animation: pulse-rec 1s ease-in-out infinite;
}
@keyframes pulse-rec {
0%,
100% {
opacity: 1;
transform: scale(1);
}
50% {
opacity: 0.5;
transform: scale(1.3);
}
}
.voice-stop-btn {
margin-left: auto;
padding: 0.25rem 0.625rem;
border: 1px solid currentColor;
border-radius: 999px;
background: transparent;
color: inherit;
font-size: 0.75rem;
cursor: pointer;
}
.spinner-small {
width: 0.875rem;
height: 0.875rem;
border: 2px solid currentColor;
border-top-color: transparent;
border-radius: 50%;
animation: spin 0.6s linear infinite;
}
@keyframes spin {
to {
transform: rotate(360deg);
}
}
.input-with-mic {
display: flex;
gap: 0.375rem;
align-items: center;
}
.input-with-mic .text-input {
flex: 1;
}
.textarea-with-mic {
position: relative;
}
.textarea-with-mic .textarea-input {
width: 100%;
}
.mic-btn {
display: flex;
align-items: center;
justify-content: center;
width: 2.25rem;
height: 2.25rem;
border: 1px solid hsl(var(--color-border));
border-radius: 0.5rem;
background: transparent;
color: hsl(var(--color-muted-foreground));
cursor: pointer;
flex-shrink: 0;
transition:
background 0.15s,
border-color 0.15s,
color 0.15s;
}
.mic-btn:hover:not(:disabled) {
background: hsl(var(--color-surface-hover));
color: hsl(var(--color-foreground));
}
.mic-btn.recording {
background: hsl(0 70% 50% / 0.1);
border-color: hsl(0 70% 50%);
color: hsl(0 70% 45%);
animation: pulse-rec 1s ease-in-out infinite;
}
.mic-btn:disabled {
opacity: 0.4;
cursor: not-allowed;
}
.textarea-mic {
position: absolute;
right: 0.375rem;
bottom: 0.375rem;
}
.conversation-banner {
display: flex;
align-items: center;
justify-content: space-between;
gap: 0.5rem;
padding: 0.5rem 0.75rem;
border-radius: 0.5rem;
background: hsl(var(--color-primary) / 0.08);
color: hsl(var(--color-primary));
font-size: 0.75rem;
}
.banner-stop {
padding: 0.25rem 0.625rem;
border: 1px solid hsl(var(--color-primary) / 0.3);
border-radius: 999px;
background: transparent;
color: hsl(var(--color-primary));
font-size: 0.6875rem;
cursor: pointer;
white-space: nowrap;
}
.banner-stop:hover {
background: hsl(var(--color-primary) / 0.1);
}
</style>


@@ -15,17 +15,25 @@
import ContextOverview from './ContextOverview.svelte';
import ContextInterview from './ContextInterview.svelte';
import ContextFreeform from './ContextFreeform.svelte';
import { useUserContext } from './queries';
import { getProgress } from './questions';
type Tab = 'overview' | 'interview' | 'freeform' | 'account';
type InterviewStartMode = 'text' | 'voice' | 'conversation';
let apiProfile = $state<ApiUserProfile | null>(null);
let loading = $state(true);
let activeTab = $state<Tab>('overview');
let interviewStartMode = $state<InterviewStartMode | null>(null);
let showEditModal = $state(false);
let showPasswordModal = $state(false);
let showDeleteModal = $state(false);
let ctx$ = useUserContext();
let ctx = $derived(ctx$.value);
let progress = $derived(getProgress(ctx?.interview?.answeredIds ?? []));
onMount(async () => {
try {
apiProfile = await profileService.getProfile();
@@ -43,6 +51,11 @@
{ key: 'account', label: 'Konto' },
];
function startInterview(mode: InterviewStartMode) {
interviewStartMode = mode;
activeTab = 'interview';
}
function handleProfileUpdate(user: ApiUserProfile) {
apiProfile = user;
toast.success('Profil erfolgreich aktualisiert');
@@ -71,7 +84,10 @@
<button
class="tab-btn"
class:active={activeTab === tab.key}
onclick={() => (activeTab = tab.key)}
onclick={() => {
activeTab = tab.key;
if (tab.key !== 'interview') interviewStartMode = null;
}}
>
{tab.label}
</button>
@@ -81,9 +97,99 @@
<!-- Tab content -->
<div class="tab-content">
{#if activeTab === 'overview'}
<ContextOverview user={apiProfile} onStartInterview={() => (activeTab = 'interview')} />
<ContextOverview user={apiProfile} onStartInterview={() => startInterview('text')} />
<!-- Interview start hero -->
<div class="interview-hero">
<div class="hero-header">
<h3 class="hero-title">Interview starten</h3>
<p class="hero-subtitle">
{#if progress.percent > 0}
{progress.answered} von {progress.total} Fragen beantwortet — mach weiter!
{:else}
Erzähl Mana mehr über dich, damit die App besser zu dir passt.
{/if}
</p>
{#if progress.percent > 0}
<div class="hero-progress">
<div class="hero-progress-fill" style:width="{progress.percent}%"></div>
</div>
{/if}
</div>
<div class="hero-options">
<button class="hero-option" onclick={() => startInterview('text')}>
<span class="hero-option-icon">
<svg
width="24"
height="24"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="1.5"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M17 3a2.85 2.83 0 1 1 4 4L7.5 20.5 2 22l1.5-5.5Z"></path>
</svg>
</span>
<span class="hero-option-text">
<strong>Per Text</strong>
<span>Fragen lesen und tippen</span>
</span>
</button>
<button class="hero-option voice" onclick={() => startInterview('voice')}>
<span class="hero-option-icon">
<svg
width="24"
height="24"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="1.5"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
<path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
<line x1="12" y1="19" x2="12" y2="23"></line>
<line x1="8" y1="23" x2="16" y2="23"></line>
</svg>
</span>
<span class="hero-option-text">
<strong>Per Sprache</strong>
<span>Fragen hören und sprechen</span>
</span>
</button>
<button class="hero-option conversation" onclick={() => startInterview('conversation')}>
<span class="hero-option-icon">
<svg
width="24"
height="24"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
stroke-width="1.5"
stroke-linecap="round"
stroke-linejoin="round"
>
<path d="M21 15a2 2 0 0 1-2 2H7l-4 4V5a2 2 0 0 1 2-2h14a2 2 0 0 1 2 2z"></path>
</svg>
</span>
<span class="hero-option-text">
<strong>Als Gespräch</strong>
<span>Fließend — Antworten werden automatisch gespeichert</span>
</span>
</button>
</div>
</div>
{:else if activeTab === 'interview'}
<ContextInterview />
<ContextInterview
initialVoiceLevel={interviewStartMode === 'conversation'
? 'conversation'
: interviewStartMode === 'voice'
? 'voice'
: undefined}
/>
{:else if activeTab === 'freeform'}
<ContextFreeform />
{:else if activeTab === 'account'}
@@ -280,4 +386,95 @@
.account-btn.danger:hover {
background: hsl(var(--color-destructive, 0 84% 60%) / 0.08);
}
/* ── Interview hero ──────────────────────── */
.interview-hero {
margin-top: 1rem;
border: 1px solid hsl(var(--color-border));
border-radius: 0.75rem;
background: hsl(var(--color-card));
overflow: hidden;
}
.hero-header {
padding: 1.25rem 1.25rem 1rem;
}
.hero-title {
margin: 0;
font-size: 1.0625rem;
font-weight: 600;
}
.hero-subtitle {
margin: 0.25rem 0 0;
font-size: 0.8125rem;
color: hsl(var(--color-muted-foreground));
}
.hero-progress {
height: 4px;
margin-top: 0.75rem;
background: hsl(var(--color-border));
border-radius: 2px;
overflow: hidden;
}
.hero-progress-fill {
height: 100%;
background: hsl(var(--color-primary));
border-radius: 2px;
transition: width 0.3s ease;
}
.hero-options {
display: flex;
flex-direction: column;
border-top: 1px solid hsl(var(--color-border));
}
.hero-option {
display: flex;
align-items: center;
gap: 0.875rem;
padding: 1rem 1.25rem;
border: none;
border-bottom: 1px solid hsl(var(--color-border));
background: transparent;
color: hsl(var(--color-foreground));
cursor: pointer;
text-align: left;
transition: background 0.15s;
}
.hero-option:last-child {
border-bottom: none;
}
.hero-option:hover {
background: hsl(var(--color-surface-hover));
}
.hero-option-icon {
display: flex;
align-items: center;
justify-content: center;
width: 2.5rem;
height: 2.5rem;
border-radius: 0.625rem;
background: hsl(var(--color-muted) / 0.5);
color: hsl(var(--color-muted-foreground));
flex-shrink: 0;
}
.hero-option.voice .hero-option-icon {
background: hsl(var(--color-primary) / 0.1);
color: hsl(var(--color-primary));
}
.hero-option.conversation .hero-option-icon {
background: hsl(142 71% 45% / 0.1);
color: hsl(142 71% 35%);
}
.hero-option-text {
display: flex;
flex-direction: column;
gap: 0.125rem;
}
.hero-option-text strong {
font-size: 0.875rem;
font-weight: 600;
}
.hero-option-text span {
font-size: 0.75rem;
color: hsl(var(--color-muted-foreground));
}
</style>


@@ -0,0 +1,159 @@
/**
* useInterviewTts(): plays pre-rendered interview question audio.
*
* Audio files live in /audio/interview/{voiceKey}/{questionId}.mp3
* where voiceKey is one of: de-f, de-m, ch-f, ch-m
*
* Falls back to Web Speech API if the audio file is missing.
*/
export type VoiceKey = 'de-f' | 'de-m' | 'ch-f' | 'ch-m';
export interface VoiceMeta {
key: VoiceKey;
label: string;
lang: string;
gender: string;
}
export const VOICES: VoiceMeta[] = [
{ key: 'de-f', label: 'Seraphina (DE)', lang: 'Deutsch', gender: 'Weiblich' },
{ key: 'de-m', label: 'Florian (DE)', lang: 'Deutsch', gender: 'Männlich' },
{ key: 'ch-f', label: 'Leni (CH)', lang: 'Schweizerdeutsch', gender: 'Weiblich' },
{ key: 'ch-m', label: 'Jan (CH)', lang: 'Schweizerdeutsch', gender: 'Männlich' },
];
const STORAGE_KEY = 'mana.interview.voice';
const DEFAULT_VOICE: VoiceKey = 'de-f';
export interface InterviewTtsHandle {
/** Whether audio is currently playing */
readonly speaking: boolean;
/** Always true — we have pre-rendered audio */
readonly isSupported: boolean;
/** Currently selected voice */
readonly voice: VoiceKey;
/** Set the voice */
setVoice: (key: VoiceKey) => void;
/** Play the audio for a question. Resolves when done. */
speak: (questionId: string, fallbackText?: string) => Promise<void>;
/** Stop playback immediately. */
stop: () => void;
}
export function useInterviewTts(): InterviewTtsHandle {
let speaking = $state(false);
let voice = $state<VoiceKey>(loadVoice());
let currentAudio: HTMLAudioElement | null = null;
function loadVoice(): VoiceKey {
if (typeof window === 'undefined') return DEFAULT_VOICE;
const stored = localStorage.getItem(STORAGE_KEY);
if (stored && VOICES.some((v) => v.key === stored)) return stored as VoiceKey;
return DEFAULT_VOICE;
}
function setVoice(key: VoiceKey) {
voice = key;
if (typeof window !== 'undefined') {
localStorage.setItem(STORAGE_KEY, key);
}
}
function speak(questionId: string, fallbackText?: string): Promise<void> {
stop();
const audioUrl = `/audio/interview/${voice}/${questionId}.mp3`;
return new Promise<void>((resolve) => {
const audio = new Audio(audioUrl);
currentAudio = audio;
audio.addEventListener(
'canplaythrough',
() => {
speaking = true;
audio.play().catch(() => {
// Autoplay blocked — try Web Speech API fallback
speaking = false;
if (fallbackText) {
speakFallback(fallbackText).then(resolve);
} else {
resolve();
}
});
},
{ once: true }
);
audio.addEventListener(
'ended',
() => {
speaking = false;
currentAudio = null;
resolve();
},
{ once: true }
);
audio.addEventListener(
'error',
() => {
// File not found — fallback to Web Speech API
speaking = false;
currentAudio = null;
if (fallbackText) {
speakFallback(fallbackText).then(resolve);
} else {
resolve();
}
},
{ once: true }
);
audio.load();
});
}
function stop() {
if (currentAudio) {
currentAudio.pause();
currentAudio.src = '';
currentAudio = null;
}
speaking = false;
}
return {
get speaking() {
return speaking;
},
get isSupported() {
return true;
},
get voice() {
return voice;
},
setVoice,
speak,
stop,
};
}
/** Web Speech API fallback for missing audio files. */
function speakFallback(text: string): Promise<void> {
if (typeof window === 'undefined' || !('speechSynthesis' in window)) {
return Promise.resolve();
}
speechSynthesis.cancel();
return new Promise<void>((resolve) => {
const utterance = new SpeechSynthesisUtterance(text);
utterance.lang = 'de-DE';
utterance.rate = 0.92;
utterance.onend = () => resolve();
utterance.onerror = () => resolve();
speechSynthesis.speak(utterance);
});
}


@@ -16,6 +16,8 @@ Text-to-Speech microservice. Wraps Kokoro (English presets), Piper (German, loca
| **Framework** | FastAPI |
| **English (preset)** | Kokoro-82M (`kokoro_service.py`) |
| **German (local)** | Piper ONNX with `kerstin_low.onnx` and `thorsten_medium.onnx` voices (`piper_service.py`) |
| **German (high-quality)** | Orpheus-3B German finetune (`orpheus_service.py`) — best for pre-generation |
| **Multilingual (expressive)** | Zonos v0.1 by Zyphra (`zonos_service.py`) — emotion control, 200k hours training |
| **Voice cloning** | F5-TTS on CUDA (`f5_service.py`) |
| **Audio I/O** | `soundfile`, `pydub` |
| **Auth** | Per-key + internal-key API auth (`auth.py`) + JWT via mana-auth (`external_auth.py`) |
@@ -43,6 +45,8 @@ Public URL: `https://gpu-tts.mana.how`.
| DELETE | `/voices/{voice_id}` | Delete a custom voice |
| POST | `/synthesize/kokoro` | Kokoro synthesis (English presets) |
| POST | `/synthesize` | F5-TTS voice cloning |
| POST | `/synthesize/orpheus` | Orpheus synthesis (German, high-quality, pre-generation) |
| POST | `/synthesize/zonos` | Zonos synthesis (multilingual, expressive, emotion control) |
| POST | `/synthesize/auto` | Routing helper — picks the right backend for the requested voice |
All non-health endpoints require `Authorization: Bearer <token>` (per-app key, internal key, or mana-auth JWT).
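
For illustration, a call against the new Orpheus endpoint from Python might look like this (a sketch; the bearer token and text are placeholders, and the field names follow the `OrpheusRequest` model shown later in this diff):

```python
# Hypothetical client call; token and text are placeholders.
import httpx

resp = httpx.post(
    "https://gpu-tts.mana.how/synthesize/orpheus",
    json={"text": "Wie sieht dein Morgen aus?", "voice": "tara", "output_format": "mp3"},
    headers={"Authorization": "Bearer <token>"},
    timeout=120.0,  # synthesis is not real-time
)
resp.raise_for_status()
with open("question.mp3", "wb") as f:
    f.write(resp.content)
```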
@@ -59,6 +63,12 @@ All non-health endpoints require `Authorization: Bearer <token>` (per-app key, i
Fallback to Edge TTS cloud voices if Piper isn't loaded.
### Orpheus-3B German (high-quality pre-generation)
~8 GB VRAM. German finetune (`Kartoffel/Orpheus-3B_german_natural-v0.1`). Natural intonation, built-in speaker voices (tara, leo, emma, ...). Best quality for pre-generating static audio files. Not real-time.
### Zonos v0.1 (expressive multilingual)
~5 GB VRAM. By Zyphra, trained on 200k hours. Explicit German support. Fine-grained control: emotion (neutral/friendly/warm/curious), speaking rate, pitch variation. Can clone voices from 5s reference audio.
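
A Zonos request exposes the knobs described above; a sketch (values illustrative, fields per the `ZonosRequest` model later in this diff):

```python
# Hypothetical Zonos request exercising emotion, rate, and pitch control.
import httpx

resp = httpx.post(
    "https://gpu-tts.mana.how/synthesize/zonos",
    json={
        "text": "Erzähl mir von deinem Tag.",
        "language": "de",
        "emotion": "warm",      # one of: neutral, friendly, warm, curious
        "speaking_rate": 13.0,  # phonemes per second
        "pitch_std": 20.0,      # pitch variation in Hz
        "output_format": "wav",
    },
    headers={"Authorization": "Bearer <token>"},
    timeout=120.0,
)
```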
### F5-TTS (voice cloning)
~6 GB. Requires reference audio + transcript. Higher quality, slower. Custom voices live in `voices/` (reference audio + transcript per voice ID).
@@ -84,6 +94,8 @@ services/mana-tts/
│ ├── kokoro_service.py # Kokoro (English presets)
│ ├── piper_service.py # Piper (German, local ONNX)
│ ├── f5_service.py # F5-TTS (voice cloning, CUDA)
│ ├── orpheus_service.py # Orpheus-3B German (high-quality)
│ ├── zonos_service.py # Zonos v0.1 (expressive multilingual)
│ ├── voice_manager.py # Custom voice registry
│ ├── audio_utils.py # Format conversion, resampling
│ ├── auth.py # API-key auth


@@ -42,6 +42,17 @@ from .piper_service import (
PIPER_VOICES,
is_piper_loaded,
)
from .orpheus_service import (
synthesize_orpheus,
is_orpheus_loaded,
ORPHEUS_VOICES,
DEFAULT_VOICE as DEFAULT_ORPHEUS_VOICE,
)
from .zonos_service import (
synthesize_zonos,
is_zonos_loaded,
EMOTION_PRESETS as ZONOS_EMOTIONS,
)
# Configure logging
logging.basicConfig(
@@ -203,6 +214,8 @@ async def health_check():
models_loaded={
"kokoro": is_kokoro_loaded(),
"f5": is_f5_loaded(),
"orpheus": is_orpheus_loaded(),
"zonos": is_zonos_loaded(),
},
auth_required=REQUIRE_AUTH,
)
@@ -528,6 +541,160 @@ async def synthesize_with_f5(
cleanup_temp_file(temp_file_path)
# ============================================================================
# Orpheus TTS Endpoint (German, high-quality)
# ============================================================================
class OrpheusRequest(BaseModel):
"""Request for Orpheus TTS synthesis."""
text: str = Field(..., description="Text to synthesize (German)", max_length=5000)
voice: str = Field(DEFAULT_ORPHEUS_VOICE, description="Speaker voice")
output_format: str = Field("wav", description="Output format (wav, mp3)")
temperature: float = Field(0.6, ge=0.1, le=1.5, description="Sampling temperature")
@app.post("/synthesize/orpheus")
async def synthesize_with_orpheus(
request: OrpheusRequest,
auth: AuthResult = Depends(verify_api_key),
):
"""
Synthesize German speech using Orpheus TTS.
High-quality German synthesis with natural intonation.
Not optimized for real-time; designed for pre-generation.
"""
if not request.text.strip():
raise HTTPException(status_code=400, detail="Text cannot be empty")
if len(request.text) > MAX_TEXT_LENGTH:
raise HTTPException(
status_code=400,
detail=f"Text exceeds maximum length of {MAX_TEXT_LENGTH} characters",
)
output_format = request.output_format.lower()
if output_format not in SUPPORTED_FORMATS:
raise HTTPException(
status_code=400,
detail=f"Unsupported format. Use one of: {SUPPORTED_FORMATS}",
)
try:
result = await synthesize_orpheus(
text=request.text,
voice=request.voice,
temperature=request.temperature,
)
audio_bytes, content_type = convert_audio(
result.audio,
result.sample_rate,
output_format,
)
return Response(
content=audio_bytes,
media_type=content_type,
headers={
"X-Model": "orpheus-german",
"X-Voice": result.voice,
"X-Duration": str(result.duration),
"X-Sample-Rate": str(result.sample_rate),
},
)
except RuntimeError as e:
raise HTTPException(status_code=500, detail=str(e))
except Exception as e:
logger.error(f"Orpheus synthesis error: {e}")
raise HTTPException(status_code=500, detail=f"Orpheus synthesis failed: {e}")
# ============================================================================
# Zonos TTS Endpoint (Multilingual, expressive)
# ============================================================================
class ZonosRequest(BaseModel):
"""Request for Zonos TTS synthesis."""
text: str = Field(..., description="Text to synthesize", max_length=5000)
language: str = Field("de", description="Language code")
emotion: str = Field("friendly", description="Emotion preset: neutral, friendly, warm, curious")
speaking_rate: float = Field(13.0, ge=5.0, le=25.0, description="Phonemes per second")
pitch_std: float = Field(20.0, ge=5.0, le=50.0, description="Pitch variation in Hz")
output_format: str = Field("wav", description="Output format (wav, mp3)")
@app.post("/synthesize/zonos")
async def synthesize_with_zonos(
request: ZonosRequest,
auth: AuthResult = Depends(verify_api_key),
):
"""
Synthesize speech using Zonos TTS by Zyphra.
Expressive multilingual synthesis with emotion control.
Trained on 200k hours; explicit German support.
"""
if not request.text.strip():
raise HTTPException(status_code=400, detail="Text cannot be empty")
if len(request.text) > MAX_TEXT_LENGTH:
raise HTTPException(
status_code=400,
detail=f"Text exceeds maximum length of {MAX_TEXT_LENGTH} characters",
)
output_format = request.output_format.lower()
if output_format not in SUPPORTED_FORMATS:
raise HTTPException(
status_code=400,
detail=f"Unsupported format. Use one of: {SUPPORTED_FORMATS}",
)
if request.emotion not in ZONOS_EMOTIONS:
raise HTTPException(
status_code=400,
detail=f"Unknown emotion. Use one of: {list(ZONOS_EMOTIONS.keys())}",
)
try:
result = await synthesize_zonos(
text=request.text,
language=request.language,
emotion=request.emotion,
speaking_rate=request.speaking_rate,
pitch_std=request.pitch_std,
)
audio_bytes, content_type = convert_audio(
result.audio,
result.sample_rate,
output_format,
)
return Response(
content=audio_bytes,
media_type=content_type,
headers={
"X-Model": "zonos-v0.1",
"X-Emotion": result.emotion,
"X-Duration": str(result.duration),
"X-Sample-Rate": str(result.sample_rate),
},
)
except RuntimeError as e:
raise HTTPException(status_code=500, detail=str(e))
except Exception as e:
logger.error(f"Zonos synthesis error: {e}")
raise HTTPException(status_code=500, detail=f"Zonos synthesis failed: {e}")
# ============================================================================
# Auto-Selection Endpoint
# ============================================================================


@@ -0,0 +1,229 @@
"""
Orpheus TTS: high-quality German speech synthesis.
Uses the Orpheus-TTS model with a German finetune for natural-sounding
interview question generation. Not optimized for real-time; quality first.
Model: Kartoffel/Orpheus-3B_german_natural-v0.1 (HuggingFace)
VRAM: ~8 GB (fits comfortably on RTX 3090 alongside other models)
"""
import logging
import asyncio
from dataclasses import dataclass
from typing import Optional
import numpy as np
logger = logging.getLogger(__name__)
# Lazy-loaded model state
_model = None
_tokenizer = None
_loaded = False
MODEL_ID = "Vishalshendge3198/orpheus-3b-tts-german-emotional-merged"
SAMPLE_RATE = 24000
# Available voices (Orpheus built-in speaker tags)
ORPHEUS_VOICES = {
"tara": "Female, warm and clear (default)",
"leah": "Female, soft and friendly",
"jess": "Female, energetic",
"leo": "Male, calm and professional",
"dan": "Male, deep and warm",
"mia": "Female, young and bright",
"zac": "Male, confident",
"emma": "Female, neutral",
}
DEFAULT_VOICE = "tara"
@dataclass
class OrpheusResult:
audio: np.ndarray
sample_rate: int
duration: float
voice: str
def is_orpheus_loaded() -> bool:
return _loaded
def get_orpheus_model():
"""Load the Orpheus German model (lazy, first call only)."""
global _model, _tokenizer, _loaded
if _loaded:
return _model, _tokenizer
logger.info(f"Loading Orpheus German model: {MODEL_ID}")
try:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
_tokenizer = AutoTokenizer.from_pretrained(
MODEL_ID,
trust_remote_code=True,
)
_model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
torch_dtype=torch.bfloat16,
device_map="cuda",
trust_remote_code=True,
)
_model.eval()
_loaded = True
logger.info("Orpheus German model loaded successfully")
return _model, _tokenizer
except Exception as e:
logger.error(f"Failed to load Orpheus model: {e}")
raise RuntimeError(f"Failed to load Orpheus model: {e}")
def unload_orpheus():
"""Free VRAM by unloading the model."""
global _model, _tokenizer, _loaded
import torch
if _model is not None:
del _model
_model = None
if _tokenizer is not None:
del _tokenizer
_tokenizer = None
_loaded = False
torch.cuda.empty_cache()
logger.info("Orpheus model unloaded")
async def synthesize_orpheus(
text: str,
voice: str = DEFAULT_VOICE,
temperature: float = 0.6,
top_p: float = 0.95,
max_new_tokens: int = 4096,
) -> OrpheusResult:
"""
Synthesize German speech using Orpheus TTS.
Returns OrpheusResult with audio as numpy float32 array.
"""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(
None,
_synthesize_sync,
text,
voice,
temperature,
top_p,
max_new_tokens,
)
def _synthesize_sync(
text: str,
voice: str,
temperature: float,
top_p: float,
max_new_tokens: int,
) -> OrpheusResult:
"""Synchronous synthesis (runs in thread pool)."""
import torch
model, tokenizer = get_orpheus_model()
# Orpheus uses a specific prompt format with speaker tags
prompt = f"<|speaker:{voice}|>{text}"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
do_sample=True,
)
# Extract audio tokens (model-specific decoding)
audio_tokens = outputs[0][inputs["input_ids"].shape[1]:]
# Decode audio tokens to waveform
# Orpheus uses a SNAC-based codec — tokens map to audio via the model's decode method
if hasattr(model, "decode_audio"):
audio_np = model.decode_audio(audio_tokens).cpu().numpy().flatten()
else:
# Fallback: decode via the SNAC codec when the model doesn't expose decode_audio.
# This handles different Orpheus model versions.
audio_np = _decode_orpheus_tokens(audio_tokens, model)
duration = len(audio_np) / SAMPLE_RATE
return OrpheusResult(
audio=audio_np,
sample_rate=SAMPLE_RATE,
duration=duration,
voice=voice,
)
def _decode_orpheus_tokens(tokens, model) -> np.ndarray:
"""
Decode Orpheus audio tokens using SNAC codec.
Orpheus generates special audio tokens that need to be decoded
through the SNAC vocoder to produce the final waveform.
"""
import torch
try:
from snac import SNAC
snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(model.device)
# Filter to audio-only tokens (above text vocab range)
audio_token_ids = tokens[tokens >= 128256].tolist()
if not audio_token_ids:
logger.warning("No audio tokens generated")
return np.zeros(SAMPLE_RATE, dtype=np.float32) # 1s silence
# Orpheus interleaves 3 codebook levels: [c1, c2, c3, c1, c2, c3, ...]
# Redistribute into separate codebook tensors
codes_0, codes_1, codes_2 = [], [], []
for i, token_id in enumerate(audio_token_ids):
# Offset tokens back to codebook range
code = token_id - 128256
level = i % 3
if level == 0:
codes_0.append(code)
elif level == 1:
codes_1.append(code)
else:
codes_2.append(code)
# Trim to equal lengths
min_len = min(len(codes_0), len(codes_1), len(codes_2))
if min_len == 0:
return np.zeros(SAMPLE_RATE, dtype=np.float32)
codes = [
torch.tensor(codes_0[:min_len], device=model.device).unsqueeze(0),
torch.tensor(codes_1[:min_len], device=model.device).unsqueeze(0),
torch.tensor(codes_2[:min_len], device=model.device).unsqueeze(0),
]
with torch.no_grad():
audio = snac.decode(codes).squeeze().cpu().numpy()
return audio.astype(np.float32)
except ImportError:
logger.error("snac package not installed — pip install snac")
raise RuntimeError("snac package required for Orpheus audio decoding")
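
A quick smoke test for this module might look like the following sketch (import path and sample text are assumptions; requires the GPU host with the model downloaded):

```python
# Hypothetical smoke test; adjust the import to the real package layout.
import asyncio

import soundfile as sf

from orpheus_service import synthesize_orpheus  # import path is an assumption

async def main() -> None:
    result = await synthesize_orpheus("Guten Morgen! Wie hast du geschlafen?", voice="tara")
    # OrpheusResult carries a float32 waveform at 24 kHz.
    sf.write("orpheus_test.wav", result.audio, result.sample_rate)

asyncio.run(main())
```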


@@ -0,0 +1,205 @@
"""
Zonos TTS: expressive multilingual speech synthesis by Zyphra.
Trained on 200k hours of speech data with explicit German support.
Fine-grained control over pitch, speaking rate, and emotions.
Model: Zyphra/Zonos-v0.1-transformer (HuggingFace)
VRAM: ~5 GB (fits comfortably on RTX 3090)
"""
import logging
import asyncio
import os
from dataclasses import dataclass
from typing import Optional
import numpy as np
# Disable torch.compile (requires MSVC cl.exe on Windows which we don't have)
os.environ["TORCHDYNAMO_DISABLE"] = "1"
logger = logging.getLogger(__name__)
# Lazy-loaded model state
_model = None
_loaded = False
MODEL_ID = "Zyphra/Zonos-v0.1-transformer"
SAMPLE_RATE = 44100 # Zonos outputs 44.1 kHz audio
# Emotion presets for the interview context
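# Assumed component order, per Zonos's make_cond_dict (not confirmed by this diff):
# [happiness, sadness, disgust, fear, surprise, anger, other, neutral]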
EMOTION_PRESETS = {
"neutral": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # neutral dominant
"friendly": [0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5], # happiness + neutral
"warm": [0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7], # slight warmth
"curious": [0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7], # interested
}
DEFAULT_EMOTION = "friendly"
@dataclass
class ZonosResult:
audio: np.ndarray
sample_rate: int
duration: float
emotion: str
def is_zonos_loaded() -> bool:
return _loaded
def get_zonos_model():
"""Load the Zonos model (lazy, first call only)."""
global _model, _loaded
if _loaded:
return _model
logger.info(f"Loading Zonos model: {MODEL_ID}")
try:
import torch
# Zonos provides its own loader
# Try the official zonos package first, fall back to transformers
try:
from zonos.model import Zonos
_model = Zonos.from_pretrained(MODEL_ID, device="cuda")
except ImportError:
# If zonos package not installed, use transformers
logger.info("zonos package not found, trying transformers loading")
from transformers import AutoModel
_model = AutoModel.from_pretrained(
MODEL_ID,
torch_dtype=torch.float32,
trust_remote_code=True,
).to("cuda")
_loaded = True
logger.info("Zonos model loaded successfully")
return _model
except Exception as e:
logger.error(f"Failed to load Zonos model: {e}")
raise RuntimeError(f"Failed to load Zonos model: {e}")
def unload_zonos():
"""Free VRAM by unloading the model."""
global _model, _loaded
import torch
if _model is not None:
del _model
_model = None
_loaded = False
torch.cuda.empty_cache()
logger.info("Zonos model unloaded")
async def synthesize_zonos(
text: str,
language: str = "de",
emotion: str = DEFAULT_EMOTION,
speaking_rate: float = 13.0,
pitch_std: float = 20.0,
speaker_audio: Optional[bytes] = None,
) -> ZonosResult:
"""
Synthesize speech using Zonos TTS.
Args:
text: Text to synthesize
language: Language code (default: 'de' for German)
emotion: Emotion preset name or custom emotion vector
speaking_rate: Speaking rate in phonemes/sec (default 13.0, range ~8-20)
pitch_std: Pitch variation in Hz (default 20.0, range ~5-50)
speaker_audio: Optional reference audio bytes for voice cloning
Returns ZonosResult with audio as numpy float32 array.
"""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(
None,
_synthesize_sync,
text,
language,
emotion,
speaking_rate,
pitch_std,
speaker_audio,
)
def _synthesize_sync(
text: str,
language: str,
emotion: str,
speaking_rate: float,
pitch_std: float,
speaker_audio: Optional[bytes],
) -> ZonosResult:
"""Synchronous synthesis (runs in thread pool)."""
import torch
from zonos.conditioning import make_cond_dict
model = get_zonos_model()
# Resolve emotion preset
emotion_values = EMOTION_PRESETS.get(emotion, EMOTION_PRESETS["friendly"])
# Build speaker embedding if reference audio provided
speaker_embedding = None
if speaker_audio:
speaker_embedding = _embed_speaker(speaker_audio, model)
# Map language codes: Zonos expects espeak language codes like 'de' or 'en-us'
lang_map = {"de": "de", "en": "en-us", "fr": "fr-fr", "es": "es", "it": "it"}
espeak_lang = lang_map.get(language, language)
# Build conditioning using Zonos's own helper
cond = make_cond_dict(
text=text,
language=espeak_lang,
emotion=emotion_values,
speaking_rate=speaking_rate,
pitch_std=pitch_std,
speaker=speaker_embedding,
)
# Generate
with torch.no_grad():
conditioning = model.prepare_conditioning(cond)
codes = model.generate(conditioning)
audio = model.autoencoder.decode(codes).squeeze().cpu().numpy()
audio = audio.astype(np.float32)
duration = len(audio) / SAMPLE_RATE
return ZonosResult(
audio=audio,
sample_rate=SAMPLE_RATE,
duration=duration,
emotion=emotion,
)
def _embed_speaker(audio_bytes: bytes, model) -> "torch.Tensor":
"""Create speaker embedding from reference audio bytes."""
import torch
import io
import soundfile as sf
audio_data, sr = sf.read(io.BytesIO(audio_bytes))
if len(audio_data.shape) > 1:
audio_data = audio_data.mean(axis=1) # mono
audio_tensor = torch.tensor(audio_data, dtype=torch.float32, device="cuda").unsqueeze(0)
return model.make_speaker_embedding(audio_tensor, sr)
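
The compare script mentioned in the commit message is not shown in this diff; a minimal sketch of a side-by-side comparison under the same assumptions (module import paths are illustrative):

```python
# Hypothetical side-by-side comparison of the two new backends.
import asyncio

import soundfile as sf

from orpheus_service import synthesize_orpheus  # import paths are assumptions
from zonos_service import synthesize_zonos

async def compare(text: str) -> None:
    orpheus = await synthesize_orpheus(text, voice="tara")
    zonos = await synthesize_zonos(text, language="de", emotion="friendly")
    sf.write("compare_orpheus.wav", orpheus.audio, orpheus.sample_rate)
    sf.write("compare_zonos.wav", zonos.audio, zonos.sample_rate)

asyncio.run(compare("Was gibt dir im Alltag Energie?"))
```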


@@ -23,3 +23,13 @@ aiofiles>=24.1.0
# External Auth (mana-core-auth integration)
httpx>=0.27.0
# ── Orpheus TTS (German high-quality) ──
# Uses transformers + SNAC codec for audio decoding
transformers>=4.44.0
snac>=1.2.0
torch>=2.1.0
# ── Zonos TTS (expressive multilingual by Zyphra) ──
# Install via: pip install git+https://github.com/Zyphra/Zonos.git
# (the 'zonos' package pulls its own deps including torch, encodec, etc.)

Some files were not shown because too many files have changed in this diff.