From 7c9c2645e35620756e8a6270068c2b32c399c221 Mon Sep 17 00:00:00 2001
From: Till-JS <101404291+Till-JS@users.noreply.github.com>
Date: Wed, 11 Feb 2026 16:14:14 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20fix(mana-stt):=20adjust=20vLLM?=
 =?UTF-8?q?=20config=20for=20CPU=20mode?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Reduce max-model-len to 4096 for CPU compatibility
- Add max-num-batched-tokens matching the context size
- Add enforce-eager for stable CPU inference

Co-Authored-By: Claude Opus 4.5
---
 services/mana-stt/scripts/start-vllm-voxtral.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/services/mana-stt/scripts/start-vllm-voxtral.sh b/services/mana-stt/scripts/start-vllm-voxtral.sh
index 280ba1970..70259d59a 100755
--- a/services/mana-stt/scripts/start-vllm-voxtral.sh
+++ b/services/mana-stt/scripts/start-vllm-voxtral.sh
@@ -23,14 +23,19 @@ if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then
   exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
     --host 0.0.0.0 \
     --port "$PORT" \
-    --max-model-len 8192
+    --max-model-len 4096 \
+    --max-num-batched-tokens 4096 \
+    --enforce-eager
 else
   echo "Model: Voxtral Mini 3B"
+  # CPU mode needs smaller context and batched tokens
   exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
     --tokenizer_mode mistral \
     --config_format mistral \
     --load_format mistral \
     --host 0.0.0.0 \
     --port "$PORT" \
-    --max-model-len 32768
+    --max-model-len 4096 \
+    --max-num-batched-tokens 4096 \
+    --enforce-eager
 fi