mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-17 18:49:39 +02:00
The Windows GPU server has been the actual production home for these
services for some time, and the running code there has drifted ahead of
the repo. This sync pulls the live versions back into the repo so the
Windows box is no longer the only place those changes exist.
Pulled from C:\mana\services\* on mana-server-gpu (192.168.178.11):
mana-llm:
- src/main.py, src/config.py — small fixes (auth wiring, config tweaks)
- src/api_auth.py — NEW (cross-service GPU_API_KEY validator)
- service.pyw — Windows runner used by the ManaLLM scheduled task
(sets up logging redirect, loads .env, calls uvicorn)
mana-stt:
- app/main.py — substantial cleanup (684→392 lines), drops the
whisperx-as-separate-backend branching now that whisper_service.py
rolls whisperx in directly
- app/whisper_service.py — full CUDA + whisperx rewrite (158→358 lines)
- app/auth.py + external_auth.py — significantly expanded auth
- app/vram_manager.py — NEW (shared VRAM accounting helper)
- service.pyw — Windows runner with CUDA pre-init, FFmpeg PATH
injection, .env loading
- removed: app/whisper_service_cuda.py (folded into whisper_service.py)
- removed: app/whisperx_service.py (folded into whisper_service.py)
mana-tts:
- app/auth.py, external_auth.py — same auth expansion as stt
- app/f5_service.py, kokoro_service.py — Windows tweaks
- app/vram_manager.py — NEW (same shared helper as stt)
- service.pyw — Windows runner
mana-video-gen:
- service.pyw — Windows runner (no other changes; the .py code on the
GPU box is byte-identical to what's already in the repo)
The service.pyw files contain absolute Windows paths
(C:\mana\services\<svc>) and a hardcoded FFmpeg PATH for the tills user
profile. Kept as-is intentionally — they exist to be deployed to that
one machine and any abstraction layer would just hide what's actually
happening. Anyone redeploying to a different layout will need to edit
the path strings, which is a known and obvious change.
Mac-Mini infrastructure for these services (launchd plists, install
scripts, scripts/mac-mini/setup-{stt,tts}.sh, the Mac-flux2c image-gen
implementation) is still on disk and will be removed in a follow-up
commit, along with replacing mana-image-gen with the Windows
diffusers+CUDA implementation. This commit is just the live-code sync.
165 lines
5.2 KiB
Python
165 lines
5.2 KiB
Python
"""
Kokoro TTS Service for fast preset voice synthesis.

CUDA version using kokoro PyTorch package.
"""

import logging
from dataclasses import dataclass
from typing import Optional

import numpy as np

# Module-level logger; handlers/levels are configured by the hosting service.
logger = logging.getLogger(__name__)

# Global singleton for lazy initialization (populated by get_kokoro_model()).
_kokoro_pipeline = None

# Default model
DEFAULT_KOKORO_MODEL = "hexgrad/Kokoro-82M"

# Available Kokoro voices (American Female/Male, British Female/Male).
# Keys are the voice IDs accepted by synthesize_kokoro(); values are
# human-readable descriptions.
KOKORO_VOICES = {
    # American Female voices
    "af_heart": "American Female - Heart (warm, emotional)",
    "af_alloy": "American Female - Alloy (neutral, professional)",
    "af_aoede": "American Female - Aoede (clear, articulate)",
    "af_bella": "American Female - Bella (friendly, approachable)",
    "af_jessica": "American Female - Jessica (confident, clear)",
    "af_kore": "American Female - Kore (calm, measured)",
    "af_nicole": "American Female - Nicole (bright, energetic)",
    "af_nova": "American Female - Nova (modern, dynamic)",
    "af_river": "American Female - River (smooth, flowing)",
    "af_sarah": "American Female - Sarah (warm, conversational)",
    "af_sky": "American Female - Sky (light, airy)",
    # American Male voices
    "am_adam": "American Male - Adam (deep, authoritative)",
    "am_echo": "American Male - Echo (resonant, clear)",
    "am_eric": "American Male - Eric (professional, neutral)",
    "am_fenrir": "American Male - Fenrir (strong, commanding)",
    "am_liam": "American Male - Liam (friendly, casual)",
    "am_michael": "American Male - Michael (warm, trustworthy)",
    "am_onyx": "American Male - Onyx (deep, smooth)",
    "am_puck": "American Male - Puck (playful, light)",
    # British Female voices
    "bf_alice": "British Female - Alice (refined, elegant)",
    "bf_emma": "British Female - Emma (clear, professional)",
    "bf_isabella": "British Female - Isabella (sophisticated, warm)",
    "bf_lily": "British Female - Lily (soft, gentle)",
    # British Male voices
    "bm_daniel": "British Male - Daniel (classic, authoritative)",
    "bm_fable": "British Male - Fable (storyteller, expressive)",
    "bm_george": "British Male - George (traditional, clear)",
    "bm_lewis": "British Male - Lewis (modern, approachable)",
}

# Voice used when the caller passes none or an unknown voice ID.
DEFAULT_VOICE = "af_heart"


@dataclass
class KokoroResult:
    """Result from Kokoro TTS synthesis.

    Bundles the synthesized waveform with the metadata describing it.
    """

    # Synthesized waveform samples (all generated chunks concatenated).
    audio: np.ndarray
    # Sample rate of ``audio`` in Hz.
    sample_rate: int
    # Voice ID that was actually used for synthesis.
    voice: str
    # Length of ``audio`` in seconds (number of samples / sample_rate).
    duration: float


def get_kokoro_model(model_name: str = DEFAULT_KOKORO_MODEL):
    """Get or create Kokoro pipeline instance (singleton pattern).

    The pipeline is built on first call and cached in the module-level
    ``_kokoro_pipeline``; later calls return the cached instance.

    Args:
        model_name: Model identifier. NOTE(review): it is only written to the
            log; the pipeline itself is constructed with just
            ``lang_code="a"``, so passing a different name does not change
            which model is loaded.

    Returns:
        The shared ``kokoro.KPipeline`` instance.

    Raises:
        RuntimeError: If an ImportError occurs while importing or
            constructing the pipeline (chained to the original ImportError).
        Exception: Any other error raised while constructing the pipeline is
            logged and re-raised unchanged.
    """
    global _kokoro_pipeline

    if _kokoro_pipeline is not None:
        return _kokoro_pipeline

    logger.info(f"Loading Kokoro model: {model_name}")

    try:
        # Imported lazily so the module can be imported without kokoro present.
        from kokoro import KPipeline

        _kokoro_pipeline = KPipeline(lang_code="a")  # 'a' for American English
        logger.info("Kokoro pipeline loaded successfully")
        return _kokoro_pipeline

    except ImportError as e:
        logger.error(f"Failed to import kokoro: {e}")
        # Chain the cause so the original ImportError stays in the traceback.
        raise RuntimeError(
            "kokoro not installed. Run: pip install kokoro"
        ) from e
    except Exception as e:
        logger.error(f"Failed to load Kokoro model: {e}")
        raise


def is_kokoro_loaded() -> bool:
    """Check if Kokoro model is currently loaded.

    Returns:
        True when the module-level pipeline singleton has been created,
        False otherwise. Does not trigger loading.
    """
    return _kokoro_pipeline is not None


def get_available_voices() -> dict[str, str]:
    """Get dictionary of available Kokoro voices.

    Returns:
        A shallow copy of ``KOKORO_VOICES`` (voice ID -> description), so
        callers can modify the result without affecting the module table.
    """
    return KOKORO_VOICES.copy()


async def synthesize_kokoro(
    text: str,
    voice: str = DEFAULT_VOICE,
    speed: float = 1.0,
    model_name: str = DEFAULT_KOKORO_MODEL,
) -> KokoroResult:
    """
    Synthesize speech using Kokoro TTS.

    Args:
        text: Text to synthesize
        voice: Voice ID from KOKORO_VOICES; unknown IDs fall back to
            DEFAULT_VOICE with a warning
        speed: Speech speed multiplier, clamped to 0.5-2.0
        model_name: Model identifier, forwarded to get_kokoro_model()

    Returns:
        KokoroResult with audio data

    Raises:
        RuntimeError: If synthesis fails or produces no audio. The original
            exception is attached as ``__cause__``. Errors from
            get_kokoro_model() propagate as raised there.

    NOTE(review): the pipeline is iterated synchronously and this coroutine
    contains no ``await``, so the event loop is blocked for the duration of
    synthesis.
    """
    # Validate voice
    if voice not in KOKORO_VOICES:
        logger.warning(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'")
        voice = DEFAULT_VOICE

    # Clamp speed to valid range
    speed = max(0.5, min(2.0, speed))

    # Get model (outside the try block: load errors are not re-wrapped)
    pipeline = get_kokoro_model(model_name)

    logger.info(f"Synthesizing with Kokoro: voice={voice}, speed={speed}, text_length={len(text)}")

    try:
        sample_rate = 24000  # Kokoro default

        # Generate audio using kokoro pipeline; each result is a
        # KPipelineResult with .audio (tensor) and .graphemes/.phonemes.
        audio_chunks = []
        for result in pipeline(text, voice=voice, speed=speed):
            chunk = result.audio
            if chunk is None:
                # Nothing synthesized for this segment; skip it.
                continue
            # detach()/cpu() make the conversion safe even if the tensor is
            # on a CUDA device or attached to an autograd graph.
            audio_chunks.append(chunk.detach().cpu().numpy())

        if not audio_chunks:
            # Caught by the handler below and reported as a synthesis failure.
            raise RuntimeError("No audio generated")

        # Concatenate all chunks
        full_audio = np.concatenate(audio_chunks)

        # Calculate duration from audio length
        total_duration = len(full_audio) / sample_rate

        logger.info(f"Kokoro synthesis complete: duration={total_duration:.2f}s")

        return KokoroResult(
            audio=full_audio,
            sample_rate=sample_rate,
            voice=voice,
            duration=total_duration,
        )

    except Exception as e:
        logger.error(f"Kokoro synthesis failed: {e}")
        # Chain the cause so the underlying traceback is preserved.
        raise RuntimeError(f"TTS synthesis failed: {e}") from e