Environment: Ubuntu 20.04
CUDA 12.4
1. Download the model weights from ModelScope
modelscope download --model Qwen/Qwen2.5-7B-Instruct --local_dir ./Qwen2.5-7B-Instruct
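If you prefer to stay in Python instead of using the CLI, the modelscope library also provides snapshot_download (a minimal sketch; by default it downloads into the ModelScope cache rather than the --local_dir used above):
from modelscope import snapshot_download

# Download the weights and print the local path they were stored in
model_dir = snapshot_download("Qwen/Qwen2.5-7B-Instruct")
print(model_dir)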
2. Optionally, clone the official demo from GitHub
Repository: https://github.com/QwenLM/Qwen2.5
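For example, to clone it into the current directory:
git clone https://github.com/QwenLM/Qwen2.5.git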
3. Install the environment dependencies
conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.4 -c pytorch -c nvidia
4. After installing PyTorch, verify that the GPU build is active
python -c "import torch; print('PyTorch is using GPU version.' if torch.cuda.is_available() else 'PyTorch is using CPU version.')"
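For a slightly more detailed check (PyTorch version, the CUDA version it was built against, and the detected GPU), something like the following also works:
import torch

# Print the PyTorch version, the CUDA version it was compiled with, and the first GPU's name
print(torch.__version__)
print(torch.version.cuda)
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))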
5. Adjust a few settings in the launch script (e.g. the model path), then run the official web_demo.py and open the page it serves in a browser.
6. You can also test the model directly with a script:
from modelscope import AutoModelForCausalLM, AutoTokenizer

model_name = "/home/sky/model_data/Qwen/Qwen2.5-7B-Instruct"

# Load the model (dtype and device placement chosen automatically) and the tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "请问你是谁呢"
messages = [
    {"role": "system", "content": "你是一个美女AI助手,由CHE创造."},
    {"role": "user", "content": prompt}
]
# Build the chat-formatted input text from the messages
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate up to 512 new tokens
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Strip the prompt tokens so only the newly generated part is decoded
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
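If you want to see tokens as they are produced instead of waiting for the full completion, transformers provides TextStreamer; a minimal sketch reusing the model, tokenizer, and model_inputs defined above:
from transformers import TextStreamer

# Stream decoded tokens to stdout as they are generated; skip echoing the prompt
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
model.generate(**model_inputs, max_new_tokens=512, streamer=streamer)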
7. You can also serve the model with vLLM.
First install it with pip install vllm; the version I used is 0.6.3.post1.
I have also uploaded a dependency (requirements) file.
Launching the vLLM server from the command line:
python -m vllm.entrypoints.openai.api_server --model /home/sky/model_data/Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 6523 --dtype bfloat16
Test it with curl:
curl http://localhost:6523/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "/home/sky/model_data/Qwen/Qwen2.5-7B-Instruct",
  "messages": [
    {"role": "user", "content": "Tell me something about large language models."}
  ],
  "temperature": 0.7,
  "top_p": 0.8,
  "repetition_penalty": 1.05,
  "max_tokens": 512
}'
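Since the vLLM endpoint is OpenAI-compatible, you can also call it with the openai Python client (a minimal sketch; the api_key value is arbitrary because the server started above does not check it):
from openai import OpenAI

# Point the client at the local vLLM server started above
client = OpenAI(base_url="http://localhost:6523/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="/home/sky/model_data/Qwen/Qwen2.5-7B-Instruct",
    messages=[
        {"role": "user", "content": "Tell me something about large language models."}
    ],
    temperature=0.7,
    top_p=0.8,
    max_tokens=512,
)
print(response.choices[0].message.content)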
Launching vLLM from Python code:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

tokenizer = AutoTokenizer.from_pretrained("/home/sky/model_data/Qwen/Qwen2.5-7B-Instruct")
sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=512)
llm = LLM(model="/home/sky/model_data/Qwen/Qwen2.5-7B-Instruct")

prompt = "请告诉我你是谁,然后再介绍一下上海."
messages = [
    {"role": "system", "content": "你是小智,是一个大模型AI助手."},
    {"role": "user", "content": prompt}
]
# Build the chat-formatted prompt text
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Generate outputs
outputs = llm.generate([text], sampling_params)

# Print the outputs
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
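If the model does not fit on a single GPU or you want to cap memory usage, the LLM constructor also accepts options such as tensor_parallel_size, gpu_memory_utilization, and max_model_len (a sketch; the values below are illustrative, not recommendations):
# Example: shard across 2 GPUs, use up to 90% of each GPU's memory,
# and limit the context window to 8192 tokens (values are illustrative)
llm = LLM(
    model="/home/sky/model_data/Qwen/Qwen2.5-7B-Instruct",
    tensor_parallel_size=2,
    gpu_memory_utilization=0.9,
    max_model_len=8192,
)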
8. Inference with Transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = torch.device("cuda")

# Specify the model path
model_name = "/home/sky/model_data/Qwen/Qwen2.5-7B-Instruct"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Prepare the input text
input_text = "你好,世界!"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Run inference (note: max_length counts the prompt tokens as well)
with torch.no_grad():
    outputs = model.generate(inputs["input_ids"], max_length=50)

# Decode the output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
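Note that from_pretrained without a torch_dtype argument loads the weights in float32, which for a 7B model needs roughly 28 GB of memory; loading in bfloat16 with automatic device placement, as in step 6, roughly halves that:
# Load in bfloat16 and let transformers place the weights on the available GPU(s)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)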
I have attached a dependency (requirements) file that can be downloaded directly.