Command to start the vLLM server
#!/bin/bash
# Launch the OpenAI-compatible vLLM server on all 8 local GPUs.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Keep NVLink/PCIe peer-to-peer enabled; disable InfiniBand (single-node setup).
export NCCL_P2P_DISABLE=0
export NCCL_IB_DISABLE=1
# Note: Qwen3-Next needs a recent vLLM release, which no longer accepts the old
# --worker-use-ray / --engine-use-ray / --use-v2-block-manager flags; single-node
# tensor parallelism runs on the default multiprocessing backend.
python -m vllm.entrypoints.openai.api_server \
    --model Qwen/Qwen3-Next-80B-A3B-Instruct \
    --served-model-name Qwen3-Next-80B-A3B-Instruct \
    --host 0.0.0.0 \
    --port 8000 \
    --tensor-parallel-size 8 \
    --pipeline-parallel-size 1 \
    --gpu-memory-utilization 0.90 \
    --max-model-len 24576 \
    --max-num-seqs 128 \
    --max-num-batched-tokens 16384 \
    --trust-remote-code \
    --disable-log-requests \
    --dtype bfloat16 \
    --enable-prefix-caching \
    --disable-custom-all-reduce
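Loading an 80B-parameter checkpoint across 8 GPUs takes a while, so the API is not reachable immediately after launch. Below is a minimal readiness check (a sketch; it assumes the defaults above, i.e. the server listening on http://localhost:8000) that polls the OpenAI-compatible /v1/models endpoint until it answers.

#!/usr/bin/env python3
"""Poll the vLLM server until /v1/models responds (sketch, assumes localhost:8000)."""
import time
import urllib.request

def wait_for_server(url: str = "http://localhost:8000/v1/models", timeout_s: int = 1800) -> bool:
    """Return True once the endpoint answers with HTTP 200, False on timeout."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass  # server not up yet (connection refused / timeout)
        time.sleep(10)
    return False

if __name__ == "__main__":
    print("Server ready" if wait_for_server() else "Timed out waiting for the server")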
Python client code
#!/usr/bin/env python3
"""
直接调用vLLM服务器的Python程序
使用OpenAI客户端库调用本地vLLM服务器
"""
from openai import OpenAI
import asyncio
import json
from typing import List, Dict, Any, Optional
VLLM_BASE_URL = "http://localhost:8000/v1"
MODEL_NAME = "Qwen3-Next-80B-A3B-Instruct"
client = OpenAI(
base_url=VLLM_BASE_URL,
api_key="fake-key"
)
def simple_chat(prompt: str, system_message: str = "You are a helpful AI assistant.") -> str:
    """Single-turn chat helper."""
try:
response = client.chat.completions.create(
model=MODEL_NAME,
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": prompt}
],
max_tokens=2048,
temperature=0.7,
top_p=0.9
)
return response.choices[0].message.content
except Exception as e:
return f"错误: {e}"
def chat_with_history(messages: List[Dict[str, str]], **kwargs) -> str:
"""带历史记录的聊天函数"""
try:
response = client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=kwargs.get('max_tokens', 2048),
temperature=kwargs.get('temperature', 0.7),
top_p=kwargs.get('top_p', 0.9),
stream=False
)
return response.choices[0].message.content
except Exception as e:
return f"错误: {e}"
def stream_chat(prompt: str, system_message: str = "You are a helpful AI assistant."):
    """Streaming chat helper."""
try:
messages = [
{"role": "system", "content": system_message},
{"role": "user", "content": prompt}
]
stream = client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=2048,
temperature=0.7,
stream=True
)
print("AI回复: ", end="", flush=True)
full_response = ""
for chunk in stream:
if chunk.choices[0].delta.content:
content = chunk.choices[0].delta.content
print(content, end="", flush=True)
full_response += content
print("\n")
return full_response
except Exception as e:
print(f"流式聊天错误: {e}")
return ""
def text_completion(prompt: str, **kwargs) -> str:
"""文本补全函数"""
try:
response = client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=kwargs.get('max_tokens', 1024),
temperature=kwargs.get('temperature', 0.7),
top_p=kwargs.get('top_p', 0.9)
)
return response.choices[0].text
except Exception as e:
return f"错误: {e}"
def get_models():
"""获取可用模型列表"""
try:
models = client.models.list()
return [model.id for model in models.data]
except Exception as e:
print(f"获取模型列表错误: {e}")
return []
def interactive_chat():
"""交互式聊天"""
print("=== vLLM 交互式聊天 ===")
print("输入 'exit' 或 'quit' 退出")
print("输入 'clear' 清空历史记录")
print("输入 'stream' 切换流式模式")
print("-" * 40)
history = []
stream_mode = False
while True:
try:
user_input = input("\n你: ").strip()
if user_input.lower() in ['exit', 'quit']:
print("再见!")
break
elif user_input.lower() == 'clear':
history = []
print("历史记录已清空")
continue
elif user_input.lower() == 'stream':
stream_mode = not stream_mode
print(f"流式模式: {'开启' if stream_mode else '关闭'}")
continue
elif not user_input:
continue
history.append({"role": "user", "content": user_input})
            if stream_mode:
                # stream_chat_with_history already prints the reply as it streams
                response = stream_chat_with_history(history)
            else:
                response = chat_with_history(history)
                print(f"\nAI: {response}")
            history.append({"role": "assistant", "content": response})
if len(history) > 20:
history = history[-20:]
except KeyboardInterrupt:
print("\n\n再见!")
break
except Exception as e:
print(f"\n错误: {e}")
def stream_chat_with_history(messages: List[Dict[str, str]]) -> str:
"""带历史记录的流式聊天"""
try:
stream = client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=2048,
temperature=0.7,
stream=True
)
print("\nAI: ", end="", flush=True)
full_response = ""
for chunk in stream:
if chunk.choices[0].delta.content:
content = chunk.choices[0].delta.content
print(content, end="", flush=True)
full_response += content
print()
return full_response
except Exception as e:
print(f"流式聊天错误: {e}")
return ""
def main():
"""主函数 - 演示各种调用方式"""
print("=== vLLM Python客户端测试 ===")
print("1. 检查服务器连接...")
models = get_models()
if models:
print(f"✓ 连接成功!可用模型: {models}")
else:
print("✗ 连接失败,请检查vLLM服务器是否运行")
return
print("\n2. 简单聊天测试...")
response = simple_chat("请简单介绍一下Python编程语言")
print(f"回复: {response}")
print("\n3. 流式聊天测试...")
stream_chat("请写一个Python快速排序算法")
print("\n4. 文本补全测试...")
completion = text_completion("def fibonacci(n):")
print(f"补全结果: {completion}")
print("\n5. 带参数的聊天测试...")
messages = [
{"role": "system", "content": "你是一个专业的Python教师。"},
{"role": "user", "content": "请解释Python中的装饰器概念"}
]
response = chat_with_history(messages, temperature=0.3, max_tokens=1000)
print(f"回复: {response}")
print("\n6. 启动交互式聊天...")
interactive_chat()
if __name__ == "__main__":
main()
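The helpers above send one request at a time. Since vLLM batches concurrent requests on the server side, issuing several prompts in parallel is usually the easiest way to raise throughput. Below is a minimal sketch using the async OpenAI client against the same endpoint (same base URL, model name, and dummy API key as above; the three prompts are only illustrative).

#!/usr/bin/env python3
"""Send several prompts concurrently - a sketch reusing the server settings above."""
import asyncio
from openai import AsyncOpenAI

client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
MODEL_NAME = "Qwen3-Next-80B-A3B-Instruct"

async def ask(prompt: str) -> str:
    """Issue one chat completion request."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=512,
        temperature=0.7,
    )
    return response.choices[0].message.content

async def main() -> None:
    prompts = [
        "Summarize what vLLM does in one sentence.",
        "List three use cases for tensor parallelism.",
        "Explain prefix caching briefly.",
    ]
    # The requests go out concurrently; vLLM batches them on the server side.
    replies = await asyncio.gather(*(ask(p) for p in prompts))
    for prompt, reply in zip(prompts, replies):
        print(f"Q: {prompt}\nA: {reply}\n")

if __name__ == "__main__":
    asyncio.run(main())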