mirror of https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 21:41:09 +02:00
🐛 fix(mana-stt): adjust vLLM config for CPU mode
- Reduce max-model-len to 4096 for CPU compatibility
- Add max-num-batched-tokens matching the context size
- Add enforce-eager for stable CPU inference

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
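For context, --max-model-len caps the model's context window, --max-num-batched-tokens caps how many tokens the scheduler may batch per engine step, and --enforce-eager forces eager-mode PyTorch instead of captured graphs. These are standard vLLM engine flags, though names can vary by release; assuming a vLLM build that provides the "vllm serve" CLI, they can be confirmed against the installed version with:

    vllm serve --help | grep -E 'max-model-len|max-num-batched-tokens|enforce-eager'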
This commit is contained in:
parent 60394076e5
commit 7c9c2645e3

1 changed file with 7 additions and 2 deletions
@@ -23,14 +23,19 @@ if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then
     exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
         --host 0.0.0.0 \
         --port "$PORT" \
-        --max-model-len 8192
+        --max-model-len 4096 \
+        --max-num-batched-tokens 4096 \
+        --enforce-eager
 else
     echo "Model: Voxtral Mini 3B"
+    # CPU mode needs smaller context and batched tokens
     exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
         --tokenizer_mode mistral \
         --config_format mistral \
         --load_format mistral \
         --host 0.0.0.0 \
         --port "$PORT" \
-        --max-model-len 32768
+        --max-model-len 4096 \
+        --max-num-batched-tokens 4096 \
+        --enforce-eager
 fi
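For reference, the 3B (CPU) branch as the script should read after this patch, reassembled from the hunk above (the changed file's name is not shown in this view, and the indentation is approximate):

    # CPU mode needs smaller context and batched tokens
    exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
        --tokenizer_mode mistral \
        --config_format mistral \
        --load_format mistral \
        --host 0.0.0.0 \
        --port "$PORT" \
        --max-model-len 4096 \
        --max-num-batched-tokens 4096 \
        --enforce-eager

Setting --max-num-batched-tokens equal to --max-model-len (4096) presumably ensures a full-context request can still be scheduled in a single prefill step.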