From a2233dc36645f682b36ebc63934e88ee39080d76 Mon Sep 17 00:00:00 2001 From: Till-JS <101404291+Till-JS@users.noreply.github.com> Date: Tue, 27 Jan 2026 02:13:34 +0100 Subject: [PATCH] fix(stt): properly encode audio as base64 for Voxtral --- services/mana-stt/app/voxtral_service.py | 49 +++++++++++------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/services/mana-stt/app/voxtral_service.py b/services/mana-stt/app/voxtral_service.py index 00ab0d737..fa7909a3d 100644 --- a/services/mana-stt/app/voxtral_service.py +++ b/services/mana-stt/app/voxtral_service.py @@ -106,29 +106,28 @@ def transcribe_audio( VoxtralTranscriptionResult with transcribed text """ import torch - import soundfile as sf - import numpy as np model, processor = get_voxtral_model(model_name) logger.info(f"Transcribing with Voxtral: {audio_path}") try: - # Load audio - audio_array, sample_rate = sf.read(audio_path) + # Load audio file as bytes and encode to base64 + with open(audio_path, "rb") as f: + audio_bytes = f.read() + audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") - # Convert stereo to mono if needed - if len(audio_array.shape) > 1: - audio_array = np.mean(audio_array, axis=1) - - # Resample to 24kHz (Voxtral's expected sample rate) - target_sr = 24000 - if sample_rate != target_sr: - from scipy import signal - - num_samples = int(len(audio_array) * target_sr / sample_rate) - audio_array = signal.resample(audio_array, num_samples) - sample_rate = target_sr + # Determine audio format from extension + ext = Path(audio_path).suffix.lower() + mime_types = { + ".wav": "audio/wav", + ".mp3": "audio/mpeg", + ".m4a": "audio/m4a", + ".flac": "audio/flac", + ".ogg": "audio/ogg", + ".webm": "audio/webm", + } + mime_type = mime_types.get(ext, "audio/wav") # Language mapping for prompts lang_names = { @@ -143,26 +142,24 @@ def transcribe_audio( } lang_name = lang_names.get(language, "German") - # Create transcription prompt + # Create transcription prompt with base64 audio messages = [ { "role": "user", "content": [ - {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,PLACEHOLDER"}}, + {"type": "audio_url", "audio_url": {"url": f"data:{mime_type};base64,{audio_base64}"}}, {"type": "text", "text": f"Transcribe this audio in {lang_name}. Only output the transcription, nothing else."}, ], } ] # Apply chat template and process inputs - prompt = processor.apply_chat_template(messages, tokenize=False) - - # Process audio with the processor - inputs = processor( - text=prompt, - audios=[audio_array], - sampling_rate=sample_rate, + inputs = processor.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, return_tensors="pt", + return_dict=True, ) # Move to same device as model @@ -178,7 +175,7 @@ def transcribe_audio( ) # Decode only the generated tokens (exclude input) - input_len = inputs.get("input_ids", inputs.get("input_features")).shape[-1] + input_len = inputs["input_ids"].shape[-1] text = processor.batch_decode( generated_ids[:, input_len:], skip_special_tokens=True,