🐛 fix(mana-stt): adjust vLLM config for CPU mode

- Reduce max-model-len to 4096 for CPU compatibility
- Add max-num-batched-tokens matching the context size
- Add enforce-eager for stable CPU inference

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Till-JS 2026-02-11 16:14:14 +01:00
parent 60394076e5
commit 7c9c2645e3

View file

@@ -23,14 +23,19 @@ if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then
exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
--host 0.0.0.0 \
--port "$PORT" \
-  --max-model-len 8192
+  --max-model-len 4096 \
+  --max-num-batched-tokens 4096 \
+  --enforce-eager
else
echo "Model: Voxtral Mini 3B"
+  # CPU mode needs smaller context and batched tokens
exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
--tokenizer_mode mistral \
--config_format mistral \
--load_format mistral \
--host 0.0.0.0 \
--port "$PORT" \
-  --max-model-len 32768
+  --max-model-len 4096 \
+  --max-num-batched-tokens 4096 \
+  --enforce-eager
fi