LLaMA inference code
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
# Set up the device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load the Llama 2 model and tokenizer
model_name = "meta-llama/Llama-2-7b-hf"  # choose the model as needed
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)  # load directly in fp16 to halve memory use
# Prepare the input
input_text = "Please write a story about nature."  # your input text
inputs = tokenizer(input_text, return_tensors="pt", max_length=500, truncation=True)
# Move the inputs to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}
# Run inference
with torch.no_grad():  # disable gradient tracking to save memory
    outputs = model.generate(**inputs, max_new_tokens=500)  # caps newly generated tokens; max_length would also count the prompt
# Decode the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Print the result
print(generated_text)
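By default, generate() uses greedy decoding, which often produces repetitive text for open-ended prompts like this one. Below is a minimal sketch of sampling-based decoding that reuses the model, tokenizer, and inputs from above; the temperature and top_p values are illustrative, not tuned.

# Sampling-based decoding: usually more varied output for open-ended prompts
with torch.no_grad():
    sampled = model.generate(
        **inputs,
        max_new_tokens=500,
        do_sample=True,     # sample from the token distribution instead of greedy argmax
        temperature=0.8,    # softens/sharpens the distribution; illustrative value
        top_p=0.95,         # nucleus sampling: keep the smallest token set with 95% probability mass
        pad_token_id=tokenizer.eos_token_id,  # Llama defines no pad token; reuse EOS to silence the warning
    )
print(tokenizer.decode(sampled[0], skip_special_tokens=True))

Rerunning this cell yields a different story each time, whereas the greedy version above is deterministic for a fixed prompt.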