From 7c9c2645e35620756e8a6270068c2b32c399c221 Mon Sep 17 00:00:00 2001
From: Till-JS <101404291+Till-JS@users.noreply.github.com>
Date: Wed, 11 Feb 2026 16:14:14 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20fix(mana-stt):=20adjust=20vLLM?=
 =?UTF-8?q?=20config=20for=20CPU=20mode?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Reduce max-model-len to 4096 for CPU compatibility
- Add max-num-batched-tokens matching the context size
- Add enforce-eager for stable CPU inference

Co-Authored-By: Claude Opus 4.5
---
 services/mana-stt/scripts/start-vllm-voxtral.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/services/mana-stt/scripts/start-vllm-voxtral.sh b/services/mana-stt/scripts/start-vllm-voxtral.sh
index 280ba1970..70259d59a 100755
--- a/services/mana-stt/scripts/start-vllm-voxtral.sh
+++ b/services/mana-stt/scripts/start-vllm-voxtral.sh
@@ -23,14 +23,19 @@ if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then
   exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
     --host 0.0.0.0 \
     --port "$PORT" \
-    --max-model-len 8192
+    --max-model-len 4096 \
+    --max-num-batched-tokens 4096 \
+    --enforce-eager
 else
   echo "Model: Voxtral Mini 3B"
+  # CPU mode needs smaller context and batched tokens
   exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
     --tokenizer_mode mistral \
     --config_format mistral \
     --load_format mistral \
     --host 0.0.0.0 \
     --port "$PORT" \
-    --max-model-len 32768
+    --max-model-len 4096 \
+    --max-num-batched-tokens 4096 \
+    --enforce-eager
 fi