🐛 fix(mana-stt): adjust vLLM config for CPU mode

- Reduce max-model-len to 4096 for CPU compatibility
- Add max-num-batched-tokens matching the context size
- Add enforce-eager for stable CPU inference

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Till-JS 2026-02-11 16:14:14 +01:00
parent 60394076e5
commit 7c9c2645e3

View file

@@ -23,14 +23,19 @@ if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then
exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
--host 0.0.0.0 \
--port "$PORT" \
-  --max-model-len 8192
+  --max-model-len 4096 \
+  --max-num-batched-tokens 4096 \
+  --enforce-eager
else
echo "Model: Voxtral Mini 3B"
+  # CPU mode needs smaller context and batched tokens
exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
--tokenizer_mode mistral \
--config_format mistral \
--load_format mistral \
--host 0.0.0.0 \
--port "$PORT" \
-  --max-model-len 32768
+  --max-model-len 4096 \
+  --max-num-batched-tokens 4096 \
+  --enforce-eager
fi