Note: vLLM only runs on Linux.
By default, vLLM downloads models from HuggingFace. If you want to use models from ModelScope in the examples below, set the environment variable:
export VLLM_USE_MODELSCOPE=True
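If you start vLLM from Python rather than from a shell, the same switch can be set with os.environ; a minimal sketch, assuming the ModelScope model ID ZhipuAI/glm-4-9b-chat used below:
import os
os.environ["VLLM_USE_MODELSCOPE"] = "True"  # set before importing vllm so the ModelScope download path is used

from vllm import LLM
# The model ID is the ModelScope ID used elsewhere in this note (an example, not a requirement)
llm = LLM(model="ZhipuAI/glm-4-9b-chat", trust_remote_code=True)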
Download the model with ModelScope
pip install modelscope
modelscope download --model ZhipuAI/glm-4-9b-chat --local_dir /root/autodl-tmp/models/glm-4-9b-chat
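The same download can also be scripted with ModelScope's Python API; a sketch using snapshot_download with cache_dir (note that, unlike --local_dir, cache_dir places the files in a model-named subdirectory):
from modelscope import snapshot_download
# Downloads into a subdirectory of cache_dir and returns the resulting local path
model_dir = snapshot_download('ZhipuAI/glm-4-9b-chat', cache_dir='/root/autodl-tmp/models')
print(model_dir)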
Install vLLM
pip install vllm
Start the server
vllm serve /root/autodl-tmp/models/glm-4-9b-chat
or
python -m vllm.entrypoints.openai.api_server --model /root/autodl-tmp/models/glm-4-9b-chat --tensor-parallel-size 2 --max-model-len 8192 --trust-remote-code --gpu-memory-utilization 0.9
Test
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "/root/autodl-tmp/models/glm-4-9b-chat",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'
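You can also ask the server which model names it serves, which is handy because requests must use exactly the name or path the server was started with; a minimal sketch with the OpenAI client:
from openai import OpenAI

# The server above was started without --api-key, so any placeholder key works
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")
for m in client.models.list():
    print(m.id)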
Sequential test with the OpenAI Python client:
from openai import OpenAI
import time

def demo_infer():
    # Talk to the local vLLM OpenAI-compatible server
    client = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="token-abc123",
    )
    completion = client.chat.completions.create(
        model="/root/autodl-tmp/models/glm-4-9b-chat",
        messages=[
            {"role": "user", "content": "鲁迅为啥暴打周树人?"}
        ]
    )
    print(completion.choices[0].message)

if __name__ == "__main__":
    time_1 = time.time()
    x = 20  # number of sequential requests
    for _ in range(x):
        demo_infer()
    time_2 = time.time()
    time_3 = time_2 - time_1
    print(f"{x} questions took {time_3}s in total")
Concurrent test with a thread pool:
from openai import OpenAI
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def demo_infer():
    client = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="token-abc123",
    )
    completion = client.chat.completions.create(
        model="/root/autodl-tmp/models/glm-4-9b-chat",
        messages=[
            {"role": "user", "content": "鲁迅为啥暴打周树人?"}
        ]
    )
    print(completion.choices[0].message)

def main():
    time_1 = time.time()
    x = 20  # number of requests to run concurrently
    with ThreadPoolExecutor(max_workers=x) as executor:
        # Submit all requests to the thread pool
        futures = [executor.submit(demo_infer) for _ in range(x)]
        # Wait for every request to finish
        for future in as_completed(futures):
            future.result()  # not strictly needed; waits for completion and surfaces errors
    time_2 = time.time()
    time_3 = time_2 - time_1
    print(f"{x} questions took {time_3}s in total")

if __name__ == "__main__":
    main()
Completions API example:
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
completion = client.completions.create(model="/root/autodl-tmp/models/glm-4-9b-chat",
                                       prompt="San Francisco is a")
print("Completion result:", completion)
System and user prompts
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Add a system-role message and a user prompt
system_message = {"role": "system", "content": "你是小智,由杨子贤开发.是你的智能助手."}
user_prompt = {"role": "user", "content": "你是谁"}
completion = client.chat.completions.create(
model="output/qwen2_5-0_5b-instruct/v0-20241219-112452/checkpoint-92-merged",
messages=[system_message, user_prompt] # 将系统消息和用户提示作为列表传递
)
print("Completion result:", completion)
Quantization
Supported hardware for quantization kernels
The table below shows the compatibility of the quantization implementations in vLLM with different hardware platforms:
Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Inferentia | Google TPU |
---|---|---|---|---|---|---|---|---|---|---|
AWQ | ✗ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✅︎ | ✗ | ✗ |
GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |
Marlin (GPTQ/AWQ/FP8) | ✗ | ✗ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |
INT8 (W8A8) | ✗ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✅︎ | ✗ | ✗ |
FP8 (W8A8) | ✗ | ✗ | ✗ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ |
AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |
bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |
DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |
GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✗ | ✗ | ✗ | ✗ | ✗ |
AutoAWQ
Install AutoAWQ
pip install autoawq
After installing AutoAWQ, you can quantize a model. Here is an example of how to quantize mistralai/Mistral-7B-Instruct-v0.2:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
quant_path = 'mistral-instruct-v0.2-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
# Load model
model = AutoAWQForCausalLM.from_pretrained(
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Quantize
model.quantize(tokenizer, quant_config=quant_config)
# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
To run an AWQ model with vLLM, you can use TheBloke/Llama-2-7b-Chat-AWQ with the following command:
python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
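The same AWQ checkpoint can also be loaded offline through vLLM's Python API; a minimal sketch, where quantization="awq" selects the AWQ kernels:
from vllm import LLM
# Load the pre-quantized AWQ checkpoint directly from the HuggingFace Hub
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="awq")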
BitsAndBytes
vLLM now supports BitsAndBytes for more efficient model inference. BitsAndBytes quantization reduces memory usage and can improve performance without a noticeable loss in accuracy. Unlike other quantization methods, BitsAndBytes does not require calibrating the quantized model with input data.
pip install "bitsandbytes>=0.42.0"
Load a pre-quantized checkpoint
from vllm import LLM
import torch
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
quantization="bitsandbytes", load_format="bitsandbytes")
In-flight quantization: load as 4-bit quantization
from vllm import LLM
import torch
model_id = "huggyllama/llama-7b"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
quantization="bitsandbytes", load_format="bitsandbytes")
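Once the LLM object is constructed, whether from a pre-quantized checkpoint or via in-flight quantization, generation works the same as for an unquantized model; a minimal sketch with an arbitrary prompt and sampling settings:
from vllm import SamplingParams

sampling = SamplingParams(temperature=0.8, max_tokens=64)
outputs = llm.generate(["San Francisco is a"], sampling)
# Each RequestOutput holds one or more generated completions
print(outputs[0].outputs[0].text)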