mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-15 16:39:39 +02:00
- Add vllm_service.py as proxy to vLLM server for Voxtral 3B/4B
- Add voxtral_api_service.py for Mistral API fallback
- Update main.py with /transcribe/voxtral endpoint using vLLM
- Add /transcribe/auto endpoint with automatic fallback chain
- Create setup-vllm.sh and start-vllm-voxtral.sh scripts
- Add launchd plist files for Mac Mini deployment
- Add install-services.sh for automated service installation

Architecture:
- vLLM server runs Voxtral models on port 8100
- mana-stt proxies to vLLM with Mistral API fallback
- Fallback chain: vLLM -> Mistral API

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
83 lines
2.5 KiB
Bash
Executable file
83 lines
2.5 KiB
Bash
Executable file
#!/bin/bash
# Setup vLLM for Voxtral on Mac Mini M4
#
# vLLM runs in CPU mode on macOS (no CUDA), but still provides
# the optimized inference pipeline for Voxtral models.
#
# Usage: ./scripts/setup-vllm.sh

# Strict mode: exit on command failure (-e), error on unset variables (-u),
# and fail a pipeline when any stage fails (pipefail). Plain `set -e` alone
# would let e.g. `python3 --version | awk ...` silently succeed even if
# python3 itself failed.
set -euo pipefail
|
# Resolve an absolute path to the directory holding this script; the
# `--` guards stop dirname/cd from treating odd paths as options.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)

# The service root is one level above scripts/.
SERVICE_DIR=$(dirname -- "$SCRIPT_DIR")

# Dedicated virtualenv location for the vLLM install.
VENV_DIR="${SERVICE_DIR}/.venv-vllm"
|
# Startup banner so log output clearly marks where setup began.
printf '%s\n' \
  "============================================" \
  "vLLM Setup for Voxtral on Mac Mini M4" \
  "============================================" \
  ""
|
# Check Python version
|
|
PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}')
|
|
PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1)
|
|
PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2)
|
|
|
|
if [[ "$PYTHON_MAJOR" -lt 3 ]] || [[ "$PYTHON_MAJOR" -eq 3 && "$PYTHON_MINOR" -lt 10 ]]; then
|
|
echo "Error: Python 3.10+ required (found $PYTHON_VERSION)"
|
|
exit 1
|
|
fi
|
|
echo "Python version: $PYTHON_VERSION"
|
|
|
|
# Create a separate venv for vLLM (to avoid conflicts with whisper).
echo ""
echo "Creating virtual environment for vLLM..."

# Idempotency: only build the venv on first run; re-running the script
# reuses the existing environment instead of recreating it.
if [[ ! -d "$VENV_DIR" ]]; then
  python3 -m venv "$VENV_DIR"
fi

# Activate the venv so the pip/uv commands below install into it.
source "$VENV_DIR/bin/activate"

# Make sure pip itself is current before installing heavy packages.
pip install --upgrade pip --quiet
|
# Install vLLM with audio support
echo ""
echo "Installing vLLM with audio support..."
echo "This may take a few minutes..."

# uv is a much faster pip-compatible installer; used for the large
# vLLM dependency tree below.
pip install uv --quiet

# Prefer the nightly wheel index (best Voxtral support at time of writing);
# if that install fails for any reason, fall back to the stable release.
# NOTE(review): the `2>&1` merges the installer's stderr into stdout but
# does not suppress it — output is still shown.
uv pip install "vllm[audio]>=0.10.0" --extra-index-url https://wheels.vllm.ai/nightly 2>&1 || {
  echo "Nightly install failed, trying stable..."
  uv pip install "vllm[audio]>=0.10.0"
}

# mistral-common provides the tokenizer/audio preprocessing vLLM needs
# for Mistral-format Voxtral models.
uv pip install "mistral-common[audio]>=1.8.1"
|
# Print follow-up instructions: how to launch each Voxtral server and how
# to smoke-test the endpoint. A single unquoted heredoc replaces the long
# run of echo statements; $VENV_DIR still expands, and `\\` renders as a
# literal backslash for copy-pasteable multi-line commands.
cat <<EOF

============================================
Installation complete!
============================================

To start Voxtral Mini 3B server:
 source $VENV_DIR/bin/activate
 vllm serve mistralai/Voxtral-Mini-3B-2507 \\
 --tokenizer_mode mistral \\
 --config_format mistral \\
 --load_format mistral \\
 --host 0.0.0.0 \\
 --port 8100

To start Voxtral Realtime 4B server:
 source $VENV_DIR/bin/activate
 vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \\
 --host 0.0.0.0 \\
 --port 8100

API Endpoint: http://localhost:8100/v1/audio/transcriptions

Test with:
 curl http://localhost:8100/v1/audio/transcriptions \\
 -F file=@test.mp3 \\
 -F model=mistralai/Voxtral-Mini-3B-2507 \\
 -F language=de
EOF