🚀 Running vLLM in the Background: A Complete Guide
⭐ Recommended: Run vLLM in a Screen Session
1. Quick Start (one command)
bash <(curl -s https://raw.githubusercontent.com/your-scripts/vllm-background.sh)  # placeholder URL -- point this at your own copy of the setup script below
2. Manual Screen Setup
sudo apt-get install screen
cat > start_vllm.sh << 'EOF'
#!/bin/bash
# Expose all 8 GPUs (matches --tensor-parallel-size 8 below)
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Reduce CUDA memory fragmentation on a long-running server
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
# Silence tokenizer fork warnings
export TOKENIZERS_PARALLELISM=false

# Launch the server inside a detached screen session named "vllm";
# output is also written to ~/vllm.log via tee
screen -dmS vllm bash -c "
python3 -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-Next-80B-A3B-Instruct \
--served-model-name Qwen3-Next-80B-A3B-Instruct \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 8 \
--gpu-memory-utilization 0.88 \
--max-model-len 32768 \
--max-num-seqs 256 \
--max-num-batched-tokens 32768 \
--trust-remote-code \
--disable-log-requests \
--dtype bfloat16 \
--swap-space 32 \
--enable-chunked-prefill \
--disable-custom-all-reduce \
--block-size 32 2>&1 | tee ~/vllm.log
"
EOF
chmod +x start_vllm.sh
3. Start and Manage
./start_vllm.sh            # launch the server in the background
screen -list               # list screen sessions
screen -r vllm             # attach to the session (Ctrl+A then D to detach)
screen -S vllm -X quit     # stop the server
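An 80B model takes a while to load, so instead of attaching you can follow the log the script writes and poll the health endpoint until the API answers. A minimal readiness check, assuming the port and log path configured above:
tail -f ~/vllm.log                                   # follow startup progress (Ctrl+C to stop)
# or block until the server accepts requests:
until curl -sf http://localhost:8000/health > /dev/null; do
    echo "waiting for vLLM to come up..."
    sleep 10
done
echo "vLLM is ready"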
🛠️ Alternative Approaches
Option 1: nohup (simplest)
nohup python3 -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-Next-80B-A3B-Instruct \
--served-model-name Qwen3-Next-80B-A3B-Instruct \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 8 \
--gpu-memory-utilization 0.88 \
--max-model-len 32768 \
--max-num-seqs 256 \
--max-num-batched-tokens 32768 \
--trust-remote-code \
--disable-log-requests \
--dtype bfloat16 \
--swap-space 32 \
--enable-chunked-prefill \
--disable-custom-all-reduce \
--block-size 32 > vllm.log 2>&1 &
echo $! > vllm.pid         # save the PID of the background process
kill $(cat vllm.pid)       # stop the server later
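The PID file can go stale after a crash, so a slightly more defensive stop script helps. This is a minimal sketch that assumes the vllm.pid written above; the name stop_vllm.sh is just a suggestion:
cat > stop_vllm.sh << 'EOF'
#!/bin/bash
# Stop the nohup-launched vLLM server using the PID file written at startup
if [ -f vllm.pid ] && kill -0 "$(cat vllm.pid)" 2>/dev/null; then
    kill "$(cat vllm.pid)"                     # ask the server to shut down (SIGTERM)
    sleep 10
    kill -9 "$(cat vllm.pid)" 2>/dev/null      # force-kill if it is still alive
    rm -f vllm.pid
    echo "vLLM stopped"
else
    echo "vLLM is not running (missing or stale vllm.pid)"
fi
EOF
chmod +x stop_vllm.sh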
Option 2: systemd Service (recommended for production)
sudo tee /etc/systemd/system/vllm.service > /dev/null << EOF
[Unit]
Description=vLLM OpenAI API Server
After=network.target
[Service]
Type=simple
User=$(whoami)
WorkingDirectory=$(pwd)
Environment="CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
Environment="PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128"
Environment="TOKENIZERS_PARALLELISM=false"
# Point this at your virtualenv's python if vLLM is not installed system-wide
ExecStart=/usr/bin/python3 -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-Next-80B-A3B-Instruct \
--served-model-name Qwen3-Next-80B-A3B-Instruct \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 8 \
--gpu-memory-utilization 0.88 \
--max-model-len 32768 \
--max-num-seqs 256 \
--max-num-batched-tokens 32768 \
--trust-remote-code \
--disable-log-requests \
--dtype bfloat16 \
--swap-space 32 \
--enable-chunked-prefill \
--disable-custom-all-reduce \
--block-size 32
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl enable vllm          # start automatically on boot
sudo systemctl start vllm
sudo systemctl status vllm
sudo systemctl stop vllm
sudo journalctl -u vllm -f          # follow the service logs
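Whenever you change flags in the unit file, systemd has to reload the definition before the change takes effect:
# After editing /etc/systemd/system/vllm.service:
sudo systemctl daemon-reload
sudo systemctl restart vllm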
📊 Service Verification and Monitoring
Check Service Status
netstat -tuln | grep 8000               # port 8000 is listening (or: ss -tuln | grep 8000)
ps aux | grep vllm                      # the server process is running
nvidia-smi                              # GPU memory is allocated across the cards
curl http://localhost:8000/health       # health check endpoint
curl http://localhost:8000/v1/models    # list the served models
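For an end-to-end smoke test, send a single chat completion through the OpenAI-compatible endpoint. A minimal sketch, assuming the served model name configured above:
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Qwen3-Next-80B-A3B-Instruct",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64
      }'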
Monitoring Script
cat > monitor_vllm.sh << 'EOF'
#!/bin/bash
while true; do
echo "=== $(date) ==="
echo "GPU状态:"
nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total --format=csv
echo "API状态:"
curl -s http://localhost:8000/health && echo " ✅" || echo " ❌"
echo "进程状态:"
ps aux | grep vllm | grep -v grep | wc -l
echo "------------------------"
sleep 30
done
EOF
chmod +x monitor_vllm.sh
./monitor_vllm.sh
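Run it in a spare terminal, or background it the same way as the server so it survives logout (sketch; the log path is just a suggestion):
nohup ./monitor_vllm.sh > vllm_monitor.log 2>&1 &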
🎯 Recommended Workflow
- First run: use the Screen approach (easiest to debug and manage)
- Development and testing: nohup or Screen
- Production deployment: the systemd service
- Advanced users: tmux (more powerful than screen); see the sketch below
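For the tmux route, the workflow mirrors the Screen one. A minimal sketch; only the core flags are repeated here, and the full flag set from start_vllm.sh applies unchanged:
sudo apt-get install tmux
tmux new-session -d -s vllm \
  "python3 -m vllm.entrypoints.openai.api_server \
     --model Qwen/Qwen3-Next-80B-A3B-Instruct \
     --served-model-name Qwen3-Next-80B-A3B-Instruct \
     --host 0.0.0.0 --port 8000 \
     --tensor-parallel-size 8 --gpu-memory-utilization 0.88 \
     --max-model-len 32768 --trust-remote-code --dtype bfloat16 \
     2>&1 | tee ~/vllm.log"
tmux ls                        # list sessions
tmux attach -t vllm            # attach (Ctrl+B then D to detach)
tmux kill-session -t vllm      # stop the server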