Install the Docker environment
apt install docker.io
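Before moving on, it is worth confirming the daemon is actually running; a quick sanity check (assuming a systemd-based host):
systemctl enable --now docker
docker --version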
Install the docker-compose environment
sudo ln -s /root/workspace/docker-compose-linux-x86_64 /usr/local/bin/docker-compose
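The symlink above assumes the docker-compose binary was already downloaded to /root/workspace. If it was not, a sketch for fetching it (the v2.27.0 release number here is an assumption; substitute the release you need):
curl -L https://github.com/docker/compose/releases/download/v2.27.0/docker-compose-linux-x86_64 \
  -o /root/workspace/docker-compose-linux-x86_64
chmod +x /root/workspace/docker-compose-linux-x86_64
docker-compose --version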
Install the NVIDIA Container Toolkit so containers can access the GPUs
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
| sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#' \
| sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
Download the model: Qwen3-Next-80B-A3B-Instruct
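One way to fetch the weights is the huggingface_hub CLI; a sketch, assuming it is installed and roughly 160 GB of free disk is available:
pip install -U "huggingface_hub[cli]"
huggingface-cli download Qwen/Qwen3-Next-80B-A3B-Instruct \
  --local-dir /root/workspace/models/qwen3
After the download, /root/workspace/models should contain: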
└── qwen3
├── LICENSE
├── README.md
├── config.json
├── generation_config.json
├── merges.txt
├── model-00001-of-00041.safetensors
├── model-00002-of-00041.safetensors
├── model-00003-of-00041.safetensors
├── model-00004-of-00041.safetensors
├── model-00005-of-00041.safetensors
├── model-00006-of-00041.safetensors
├── model-00007-of-00041.safetensors
├── model-00008-of-00041.safetensors
├── model-00009-of-00041.safetensors
├── model-00010-of-00041.safetensors
├── model-00011-of-00041.safetensors
├── model-00012-of-00041.safetensors
├── model-00013-of-00041.safetensors
├── model-00014-of-00041.safetensors
├── model-00015-of-00041.safetensors
├── model-00016-of-00041.safetensors
├── model-00017-of-00041.safetensors
├── model-00018-of-00041.safetensors
├── model-00019-of-00041.safetensors
├── model-00020-of-00041.safetensors
├── model-00021-of-00041.safetensors
├── model-00022-of-00041.safetensors
├── model-00023-of-00041.safetensors
├── model-00024-of-00041.safetensors
├── model-00025-of-00041.safetensors
├── model-00026-of-00041.safetensors
├── model-00027-of-00041.safetensors
├── model-00028-of-00041.safetensors
├── model-00029-of-00041.safetensors
├── model-00030-of-00041.safetensors
├── model-00031-of-00041.safetensors
├── model-00032-of-00041.safetensors
├── model-00033-of-00041.safetensors
├── model-00034-of-00041.safetensors
├── model-00035-of-00041.safetensors
├── model-00036-of-00041.safetensors
├── model-00037-of-00041.safetensors
├── model-00038-of-00041.safetensors
├── model-00039-of-00041.safetensors
├── model-00040-of-00041.safetensors
├── model-00041-of-00041.safetensors
├── model.safetensors.index.json
├── tokenizer.json
├── tokenizer_config.json
└── vocab.json
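A quick way to confirm all shards arrived:
ls /root/workspace/models/qwen3/model-*.safetensors | wc -l   # expect 41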
Copy the model-serving project to the server
cd Qwen80B_LLM_Server
docker-compose up -d --build
docker ps
root@iZ6webx89mskfrwmhgi2btZ:~/workspace/Qwen80B_LLM_Server#
CONTAINER ID   IMAGE                    COMMAND                  CREATED          STATUS                    PORTS                                                                      NAMES
b08a84029791   nginx:alpine             "/docker-entrypoint.…"   17 minutes ago   Up 17 minutes             0.0.0.0:80->80/tcp, :::80->80/tcp, 0.0.0.0:443->443/tcp, :::443->443/tcp   vllm-nginx
108eea5174d9   vllm-qwen3-vllm-server   "python3 -m vllm.ent…"   17 minutes ago   Up 17 minutes (healthy)   0.0.0.0:8000->8000/tcp, :::8000->8000/tcp                                  qwen3-vllm-server
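With both containers up, the OpenAI-compatible endpoint can be probed directly; a sketch, substituting the real key from .env:
curl -f http://localhost:8000/health
curl http://localhost:8000/v1/models -H "Authorization: Bearer ${VLLM_API_KEY}"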
Main code of Qwen80B_LLM_Server
Dockerfile
# syntax=docker/dockerfile:1
FROM vllm/vllm-openai:latest

WORKDIR /workspace

RUN apt-get update && apt-get install -y \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# Quote the version specifiers so the shell does not treat ">=" as a redirect
RUN pip install --no-cache-dir --upgrade \
    "transformers>=4.45.0" \
    "tokenizers>=0.19.0" \
    "accelerate>=0.34.0" \
    "torch>=2.1.0" \
    torchvision \
    torchaudio

# Qwen3-Next support may require a newer transformers than the PyPI release
RUN pip install --no-cache-dir --force-reinstall \
    git+https://github.com/huggingface/transformers.git

RUN pip install --no-cache-dir --upgrade --force-reinstall vllm

RUN pip install --no-cache-dir \
    huggingface-hub \
    safetensors \
    numpy \
    scipy

# Fail the build early if the core imports are broken
RUN python3 -c "import transformers; print(f'Transformers version: {transformers.__version__}')" && \
    python3 -c "import vllm; print(f'vLLM version: {vllm.__version__}')" && \
    python3 -c "from transformers import AutoConfig; print('Transformers import successful')"

ENV PYTHONPATH=/workspace
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

RUN mkdir -p /workspace/logs

# Heredoc COPY requires BuildKit (enabled by the syntax directive above)
COPY <<EOF /workspace/healthcheck.sh
#!/bin/bash
curl -f http://localhost:8000/health || exit 1
EOF
RUN chmod +x /workspace/healthcheck.sh

EXPOSE 8000

# Placeholder only; docker-compose.yml supplies the real serve flags via `command:`
CMD ["python3", "-m", "vllm.entrypoints.openai.api_server", "--help"]
.env file (loaded by docker-compose; keep real API keys out of version control)
VLLM_API_KEY=XXXXXXX
MODEL_PATH=/root/workspace/models
HOST_PORT=8000
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
COMPOSE_PROJECT_NAME=vllm-qwen3
docker-compose.yml
services:
  vllm-server:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: qwen3-vllm-server
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      - VLLM_API_KEY=${VLLM_API_KEY}
      - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
      - PYTHONPATH=/workspace
      - HF_HUB_ENABLE_HF_TRANSFER=1
      - TRANSFORMERS_CACHE=/root/.cache/transformers
    ports:
      - "${HOST_PORT:-8000}:8000"
    volumes:
      - ${MODEL_PATH}:/workspace/models:ro
      - vllm_cache:/root/.cache
      - ./logs:/workspace/logs
    command: >
      --model /workspace/models/qwen3
      --served-model-name Qwen3-Next-80B-A3B-Instruct
      --host 0.0.0.0
      --port 8000
      --tensor-parallel-size 8
      --gpu-memory-utilization 0.88
      --max-model-len 32768
      --max-num-seqs 256
      --max-num-batched-tokens 32768
      --trust-remote-code
      --disable-log-requests
      --dtype bfloat16
      --swap-space 32
      --enable-chunked-prefill
      --disable-custom-all-reduce
      --block-size 32
      --api-key ${VLLM_API_KEY}
    healthcheck:
      test: ["CMD", "/workspace/healthcheck.sh"]
      interval: 30s
      timeout: 15s
      retries: 5
      start_period: 900s
    networks:
      - vllm-network

volumes:
  vllm_cache:
    driver: local

networks:
  vllm-network:
    driver: bridge
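Before the first docker-compose up it helps to validate the merged configuration, and because loading 41 shards across 8 GPUs can take many minutes (hence the 900s start_period), startup is best followed in the logs:
docker-compose config --quiet        # exits non-zero on YAML or .env errors
docker-compose logs -f vllm-server   # watch model loading progress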
Test whether the service starts correctly
from openai import OpenAI
import time
import traceback

def run80B(prompt: str):
    print("run80B: calling the vLLM server")
    client = OpenAI(
        api_key="shgbitai@2025",
        base_url="http://8.216.37.224:8000/v1"
    )
    reasoning_content = ""
    answer_content = ""
    is_answering = False
    try:
        astart_time = time.perf_counter()
        completion = client.chat.completions.create(
            model="Qwen3-Next-80B-A3B-Instruct",
            messages=[
                {"role": "user", "content": prompt}
            ],
            stream=True,
        )
    except Exception:
        traceback.print_exc()
        answer_content = "API call failed; check the service status"
        return answer_content
    for chunk in completion:
        if not chunk.choices:
            # A trailing chunk without choices carries the usage stats
            if hasattr(chunk, 'usage'):
                print("\nUsage:")
                print(chunk.usage)
        else:
            delta = chunk.choices[0].delta
            # Reasoning variants stream thoughts separately from the answer
            if hasattr(delta, 'reasoning_content') and delta.reasoning_content is not None:
                reasoning_content += delta.reasoning_content
            else:
                if delta.content and not is_answering:
                    is_answering = True
                if delta.content:
                    answer_content += delta.content
    aend_time = time.perf_counter()
    api_time = aend_time - astart_time
    print(f"LLM call finished in {api_time:.2f}s")
    print(f"Output: {answer_content}")
    return answer_content

if __name__ == '__main__':
    result = run80B("What model are you?")
    print("Final result:", result)