mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-17 15:49:40 +02:00
- Reduce max-model-len to 4096 for CPU compatibility
- Add max-num-batched-tokens matching the context size
- Add enforce-eager for stable CPU inference

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
41 lines
1.1 KiB
Bash
Executable file
41 lines
1.1 KiB
Bash
Executable file
#!/bin/bash
# Start vLLM server for Voxtral.
#
# Usage: ./scripts/start-vllm-voxtral.sh [model]
#   model: "3b" (default) or "4b" for Realtime
#
# Environment:
#   VLLM_PORT - listen port (default: 8100)

# Strict mode: exit on error, unset variable use, or pipeline failure.
set -euo pipefail

# Resolve paths relative to this script so it works from any CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SERVICE_DIR="$(dirname "$SCRIPT_DIR")"
VENV_DIR="$SERVICE_DIR/.venv-vllm"
MODEL="${1:-3b}"
PORT="${VLLM_PORT:-8100}"

# Activate venv — check first so a missing venv gives a clear message
# instead of a raw 'source: No such file or directory' failure.
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
  echo "error: vLLM venv not found at $VENV_DIR" >&2
  echo "hint: create it first (python -m venv $VENV_DIR && pip install vllm)" >&2
  exit 1
fi
# shellcheck disable=SC1091 — venv path is resolved at runtime
source "$VENV_DIR/bin/activate"

echo "Starting vLLM Voxtral server..."
echo "Port: $PORT"

# Any argument other than "4b"/"realtime" falls through to the 3B model.
if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then
  echo "Model: Voxtral Mini 4B Realtime"
  # NOTE(review): unlike the 3b branch below, this branch passes no
  # --tokenizer_mode/--config_format/--load_format mistral flags —
  # confirm the Realtime model really does not need them.
  exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
    --host 0.0.0.0 \
    --port "$PORT" \
    --max-model-len 4096 \
    --max-num-batched-tokens 4096 \
    --enforce-eager
else
  echo "Model: Voxtral Mini 3B"
  # CPU mode needs smaller context and batched tokens
  exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
    --tokenizer_mode mistral \
    --config_format mistral \
    --load_format mistral \
    --host 0.0.0.0 \
    --port "$PORT" \
    --max-model-len 4096 \
    --max-num-batched-tokens 4096 \
    --enforce-eager
fi