1. Training on Colab's cloud servers. Sessions can drop, so it is not very stable (the upside is that environment setup is simple and rarely throws errors).
1) Download the model
!git lfs install
!git clone https://huggingface.co/Qwen/Qwen2.5-3B-Instruct
2) Install the libraries the code depends on
! pip install vllm
3) Load the Qwen2.5-3B-Instruct model and set up LoRA (low-rank adaptation) fine-tuning, to keep GPU memory usage down and speed up inference.
from unsloth import FastLanguageModel  # the PatchFastRL patch appears to have been merged upstream and is no longer needed
from unsloth import is_bfloat16_supported
import torch
max_seq_length = 512
lora_rank = 32
# Initialize the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    max_lora_rank = lora_rank,
    load_in_4bit = True,           # enable 4-bit quantization
    load_in_8bit = False,          # disable 8-bit quantization
    full_finetuning = False,       # no full-parameter fine-tuning
    fast_inference = True,         # enable vLLM fast inference
    gpu_memory_utilization = 0.8,  # cap GPU memory utilization at 80%
)
4) Apply LoRA (Low-Rank Adaptation) to Qwen2.5-3B-Instruct for parameter-efficient fine-tuning (PEFT).
# Wrap the model with LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA fine-tuning settings
    r = 16,  # rank of the low-rank matrices; a larger r lets the model capture more detail (potentially higher accuracy) but also raises the risk of overfitting
    lora_alpha = lora_rank,  # scaling factor for the low-rank update, used together with r and usually recommended to be >= r; it balances the magnitude of the weight updates so training converges more stably
    lora_dropout = 0,  # dropout is a regularization technique against overfitting; 0 means no dropout inside the LoRA layers
    random_state = 3407,  # fixing the random seed makes initialization identical across runs, so results are reproducible
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],  # modules the LoRA adapters are applied to
)
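Optionally, you can check how small the trainable parameter set is. This assumes the wrapped model exposes the standard PEFT helper, which Unsloth's LoRA wrapper normally does:
model.print_trainable_parameters()  # prints trainable vs. total parameter counts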
5) Load the OpenAI gsm8k dataset and convert it into a training-ready format: standardize the questions and answers so they can be used for supervised fine-tuning or reinforcement-learning training of Qwen2.5-3B-Instruct.
!pip install datasets
import re
from datasets import load_dataset, Dataset
# Give the "student" model a standard: reply in the format below so it knows exactly what output layout is expected and does not write freely. Reasoning goes inside <reasoning>, then the answer follows.
SYSTEM_PROMPT ="""
按照以下格式进行回复用户
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""
# The answer-sheet template the student model fills in, with each part in its designated slot. The "teacher" then grades the format, extracts the answer section, and compares it against the dataset answer.
XML_COT_FORMAT="""
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""
# Extract the student model's answer from its answer sheet; both the input and the return value are plain strings.
def extract_xml_answer(text: str) -> str:
    answer = text.split('<answer>')[-1]
    answer = answer.split('</answer>')[0]
    return answer.strip()  # strip() drops surrounding whitespace/newlines so the later comparison with the reference answer is exact
# In the openai/gsm8k dataset the final answer comes after "####", so it needs a little post-processing.
def extract_hash_answer(text: str) -> str:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()
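For reference, a shortened example in the style of a raw gsm8k record (values illustrative) shows what the helper keeps:
raw = "Natalia sold 48/2 = 24 clips in May. 48 + 24 = 72 clips in total.\n#### 72"
print(extract_hash_answer(raw))  # -> "72"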
def get_gsm8k_questions(split = "train") -> Dataset:
    data = load_dataset('openai/gsm8k', 'main')[split]  # type: ignore
    data = data.shuffle(seed=42).select(range(1000))  # shuffle, then keep 1000 examples to shorten training
    data = data.map(lambda x: {  # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    })  # type: ignore
    return data  # type: ignore
dataset = get_gsm8k_questions()
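A quick, optional sanity check is to print one mapped record and confirm the chat-style prompt plus the extracted answer:
print(dataset[0]['prompt'])  # [{'role': 'system', ...}, {'role': 'user', ...}]
print(dataset[0]['answer'])  # the number that followed "####" in the raw record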
6) Train Qwen2.5-3B-Instruct with GRPO (Group Relative Policy Optimization) to improve its mathematical reasoning and its ability to answer in the required format. Concretely, this step loads the dataset, designs the reward functions, and runs GRPO reinforcement-learning training.
# Design the reward functions
# Correctness reward: prompts are the questions the teacher set, completions are the student's answers, answer holds the reference answers.
# **kwargs absorbs any extra keyword arguments. The return value is a list[float], one reward per completion.
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    # Step 1: extract the generated text
    # Turns the model's output structure (e.g. [{'role':'assistant','content':...}]) into a plain-text list
    # e.g. [{'content':"<answer>42</answer>"}] becomes ["<answer>42</answer>"]
    responses = [completion[0]['content'] for completion in completions]
    # Step 2: grab the current question
    # If prompts[0] is [{'role': 'system', 'content': 'System message'}, {'role': 'user', 'content': 'User question'}],
    # then prompts[0][-1] is {'role': 'user', 'content': 'User question'}.
    q = prompts[0][-1]['content']
    # Step 3: parse the structured part of each reply
    # e.g. pull the 42 out of "<answer>42</answer>"
    # extract_xml_answer is the helper defined above that normalizes the answer format
    extracted_responses = [extract_xml_answer(r) for r in responses]
    # Step 4: debug print (optional, but handy for monitoring), e.g.:
    # ------------------------------
    # Question: what is 2 + 2
    # Reference answer: 4
    # Student answer: <answer>4</answer>
    # Parsed student answer: 4
    print('-'*30, f"Question:\n{q}", f"\nReference answer:\n{answer[0]}", f"\nStudent answer:\n{responses[0]}", f"\nParsed student answer:\n{extracted_responses[0]}")
    # Step 5: score
    # Core logic: an exact match with the reference answer earns 2 points, otherwise 0
    # e.g. extracted 4 vs reference 4 -> 2.0; extracted 5 vs reference 4 -> 0.0
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]
# Reward for answering with a plain integer
def int_reward_func(completions, **kwargs) -> list[float]:
    # Step 1: extract the generated text, as in correctness_reward_func
    responses = [completion[0]['content'] for completion in completions]
    # Step 2: parse the structured part of each reply
    # extracted_responses is a list such as ["24", "thirty", "some text", ...] holding every student answer
    extracted_responses = [extract_xml_answer(r) for r in responses]
    # Only pure-digit answers (.isdigit()), like "24", earn 0.5; "thirty" or free text earn 0: [0.5, 0.0, 0.0]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]
# Strict format reward
def strict_format_reward_func(completions, **kwargs) -> list[float]:
    # note: "." does not match newlines by default, so the content between the tags must stay on a single line for these patterns to match
    pattern = r"<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    # Compare each response against the required pattern.
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]
# Relaxed format reward
def soft_format_reward_func(completions, **kwargs) -> list[float]:
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]
# Reward based on how many times, and where, the expected tags appear
def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1]) * 0.001  # small penalty for any text trailing the closing answer tag
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1) * 0.001
    return count
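A quick worked example (illustrative string): a completion that follows the template exactly collects all four 0.125 bonuses and no trailing-text penalty.
demo = "<reasoning>\n2+2=4\n</reasoning>\n<answer>\n4\n</answer>\n"
print(count_xml(demo))  # 0.5; any extra characters after </answer> would cost 0.001 each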
# Score each completion by its tag usage
def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]
from trl import GRPOConfig, GRPOTrainer
# Initialize wandb (optional)
import wandb
wandb.init(
    project="wt11",
    name="wt11",
    # id="76t",        # copy the ID of an existing run from the WandB web UI
    # resume="allow"   # allow resuming that run
)
max_prompt_length = 256
training_args = GRPOConfig(
    # Speed and memory management
    use_vllm=True,  # use the vLLM inference engine for generation (much higher throughput)
    per_device_train_batch_size=4,  # per-device batch size (tune to your VRAM; lower it if you hit OOM)
    gradient_accumulation_steps=4,  # gradient accumulation (simulates a larger batch; 4 improves stability)
    # Optimizer
    learning_rate=3e-4,  # initial learning rate (1e-5 to 1e-4 is a common range; larger models usually use less)
    optim="paged_adamw_8bit",  # 8-bit paged AdamW (saves VRAM)
    adam_beta1=0.9,  # first-moment decay (usually left at 0.9)
    adam_beta2=0.98,  # second-moment decay (default 0.999; large models sometimes lower it, e.g. 0.95)
    weight_decay=0.05,  # weight decay against overfitting (0.01-0.1 is typical)
    max_grad_norm=0.2,  # gradient clipping threshold (guards against exploding gradients)
    # Learning-rate schedule
    lr_scheduler_type="cosine",  # cosine decay (smooth convergence)
    warmup_ratio=0.08,  # linear learning-rate warmup over the first 8% of steps
    # Precision and memory
    bf16=is_bfloat16_supported(),  # prefer bfloat16 where the GPU supports it (Ampere/A100 and newer)
    fp16=not is_bfloat16_supported(),  # fall back to fp16 otherwise (watch mixed-precision stability)
    # Training control
    num_train_epochs=3,  # number of epochs (use either this or max_steps)
    max_prompt_length=max_prompt_length,  # maximum prompt length (longer prompts are truncated)
    max_completion_length=max_seq_length - max_prompt_length,  # maximum generated length (affects memory use)
    num_generations=16,  # completions sampled per prompt for GRPO (the group size); reduce to save VRAM
    # Logging and saving
    logging_steps=1,  # log every step (fine for debugging; increase for long runs)
    save_steps=20,  # save a checkpoint every 20 steps (adjust to your storage)
    report_to="wandb",  # report to Weights & Biases (log in beforehand)
    output_dir="outputs",  # output directory for checkpoints and logs
)
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()
# trainer.train(resume_from_checkpoint=True)  # resume training from the latest checkpoint
model.save_lora("grpo_saved_lora")
model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
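Before (or instead of) merging, you can sanity-check the saved LoRA with Unsloth's vLLM-backed generation. This is a sketch following the pattern from the Unsloth GRPO notebooks; it assumes fast_inference=True was set above, and model.fast_generate / model.load_lora are Unsloth helpers:
from vllm import SamplingParams
text = tokenizer.apply_chat_template([
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "A farm has 12 cows and twice as many sheep. How many animals are there?"},
], tokenize=False, add_generation_prompt=True)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=512)
output = model.fast_generate(
    text,
    sampling_params=sampling_params,
    lora_request=model.load_lora("grpo_saved_lora"),  # apply the LoRA saved above
)[0].outputs[0].text
print(output)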
2. Training on the ModelScope community's free GPU
1) Install a conda environment
wget https://repo.anaconda.com/archive/Anaconda3-2024.02-1-Linux-x86_64.sh
chmod +x Anaconda3-2024.02-1-Linux-x86_64.sh
./Anaconda3-2024.02-1-Linux-x86_64.sh
nano ~/.bashrc
export PATH="/root/anaconda3/bin:$PATH"
source ~/.bashrc
conda create -n unsloth1 python=3.10
conda init
conda activate unsloth1
pip install torch==2.4.0 torchvision==0.19.0
pip install transformers==4.45.0
pip install unsloth
pip install vllm
pip install wandb
pip install modelscope
2) Download the model
pip install modelscope
# download into the current working directory
modelscope download --model Qwen/Qwen2.5-3B-Instruct --output ./
3) In this Alibaba Cloud environment, downloading the openai dataset errors out, so download it on a local machine first, compress it, and put the resulting "main" folder in the same directory as the .py file.
from datasets import load_dataset
dataset = load_dataset('openai/gsm8k', 'main')
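On the machine that does have access, the call above stores the data in the Hugging Face datasets cache; the main/0.0.0/... directory referenced later in the training script comes from that cache. A small sketch (the exact cache path varies by machine and datasets version) for locating what to compress:
from datasets import load_dataset
ds = load_dataset('openai/gsm8k', 'main')
print(ds['train'].cache_files)  # e.g. [{'filename': '.../openai___gsm8k/main/0.0.0/<hash>/gsm8k-train.arrow'}]
# Alternatively, ds.save_to_disk("gsm8k_local") writes a self-contained copy you can zip and upload instead.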
4) The training .py file
from unsloth import FastLanguageModel  # the PatchFastRL patch appears to have been merged upstream and is no longer needed
from unsloth import is_bfloat16_supported
import torch
max_seq_length = 512
lora_rank = 32
# Initialize the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen2.5-3B-Instruct/Qwen/Qwen2.5-3B-Instruct",  # the teacher model, i.e. the SFT (supervised) base
    max_seq_length = max_seq_length,
    max_lora_rank = lora_rank,
    load_in_4bit = True,           # enable 4-bit quantization
    load_in_8bit = False,          # disable 8-bit quantization
    full_finetuning = False,       # no full-parameter fine-tuning
    fast_inference = True,         # enable vLLM fast inference
    gpu_memory_utilization = 0.7,  # cap GPU memory utilization at 70%
)
# Wrap the model with LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA fine-tuning settings
    r = 16,  # rank of the low-rank matrices; a larger r captures more detail but raises the risk of overfitting
    lora_alpha = lora_rank,  # scaling factor for the low-rank update, usually recommended to be >= r; balances the update magnitude for stabler convergence
    lora_dropout = 0,  # no dropout inside the LoRA layers
    random_state = 3407,  # fixed seed for reproducible initialization
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],  # modules the LoRA adapters are applied to
)
import re
from datasets import load_dataset, Dataset
# Give the "student" model a standard: reply in the format below so it knows exactly what output layout is expected and does not write freely. Reasoning goes inside <reasoning>, then the answer follows.
SYSTEM_PROMPT ="""
按照以下格式进行回复用户
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""
# The answer-sheet template the student model fills in, with each part in its designated slot. The "teacher" then grades the format, extracts the answer section, and compares it against the dataset answer.
XML_COT_FORMAT="""
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""
def extract_hash_answer(text: str) -> str:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()
from datasets import Dataset
def extract_xml_answer(text: str) -> str:
    if not text or "<answer>" not in text or "</answer>" not in text:
        return ""
    answer = text.split('<answer>')[-1].split('</answer>')[0]
    return answer.strip()
def get_gsm8k_questions(dataset_path: str) -> Dataset:
    # Load the data from a local .arrow file
    data = Dataset.from_file(dataset_path)
    # Shuffle and keep the first 1000 examples (or all of them if there are fewer than 1000)
    data = data.shuffle(seed=42).select(range(min(1000, len(data))))
    # Build the prompt and answer fields from question and answer
    data = data.map(lambda x: {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    })
    return data
# Run
dataset_path = "main/0.0.0/e53f048856ff4f594e959d75785d2c2d37b678ee/gsm8k-train.arrow"
dataset = get_gsm8k_questions(dataset_path)
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    # Step 1: extract the generated text
    # Turns the model's output structure (e.g. [{'role':'assistant','content':...}]) into a plain-text list
    # e.g. [{'content':"<answer>42</answer>"}] becomes ["<answer>42</answer>"]
    responses = [completion[0]['content'] for completion in completions]
    # Step 2: grab the current question
    # If prompts[0] is [{'role': 'system', 'content': 'System message'}, {'role': 'user', 'content': 'User question'}],
    # then prompts[0][-1] is {'role': 'user', 'content': 'User question'}.
    q = prompts[0][-1]['content']
    # Step 3: parse the structured part of each reply
    # e.g. pull the 42 out of "<answer>42</answer>"
    # extract_xml_answer is the helper defined above that normalizes the answer format
    extracted_responses = [extract_xml_answer(r) for r in responses]
    # Step 4: debug print (optional, but handy for monitoring), e.g.:
    # ------------------------------
    # Question: what is 2 + 2
    # Reference answer: 4
    # Student answer: <answer>4</answer>
    # Parsed student answer: 4
    print('-'*30, f"Question:\n{q}", f"\nReference answer:\n{answer[0]}", f"\nStudent answer:\n{responses[0]}", f"\nParsed student answer:\n{extracted_responses[0]}")
    # Step 5: score
    # Core logic: an exact match with the reference answer earns 2 points, otherwise 0
    # e.g. extracted 4 vs reference 4 -> 2.0; extracted 5 vs reference 4 -> 0.0
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]
# Reward for answering with a plain integer
def int_reward_func(completions, **kwargs) -> list[float]:
    # Step 1: extract the generated text, as in correctness_reward_func
    responses = [completion[0]['content'] for completion in completions]
    # Step 2: parse the structured part of each reply
    # extracted_responses is a list such as ["24", "thirty", "some text", ...] holding every student answer
    extracted_responses = [extract_xml_answer(r) for r in responses]
    # Only pure-digit answers (.isdigit()), like "24", earn 0.5; "thirty" or free text earn 0: [0.5, 0.0, 0.0]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]
# Strict format reward
def strict_format_reward_func(completions, **kwargs) -> list[float]:
    # note: "." does not match newlines by default, so the content between the tags must stay on a single line for these patterns to match
    pattern = r"<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    # Compare each response against the required pattern.
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]
# Relaxed format reward
def soft_format_reward_func(completions, **kwargs) -> list[float]:
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]
# Reward based on how many times, and where, the expected tags appear
def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1]) * 0.001  # small penalty for any text trailing the closing answer tag
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1) * 0.001
    return count
# Score each completion by its tag usage
def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]
from trl import GRPOConfig, GRPOTrainer
# Initialize wandb (optional)
import wandb
wandb.init(
    project="wt11",
    name="wt11",
    # id="76t",        # copy the ID of an existing run from the WandB web UI
    # resume="allow"   # allow resuming that run
)
max_prompt_length = 256
training_args = GRPOConfig(
    # Speed and memory management
    use_vllm=True,  # use the vLLM inference engine for generation (much higher throughput)
    per_device_train_batch_size=4,  # per-device batch size (tune to your VRAM; lower it if you hit OOM)
    gradient_accumulation_steps=4,  # gradient accumulation (simulates a larger batch; 4 improves stability)
    # Optimizer
    learning_rate=3e-4,  # initial learning rate (1e-5 to 1e-4 is a common range; larger models usually use less)
    optim="paged_adamw_8bit",  # 8-bit paged AdamW (saves VRAM)
    adam_beta1=0.9,  # first-moment decay (usually left at 0.9)
    adam_beta2=0.98,  # second-moment decay (default 0.999; large models sometimes lower it, e.g. 0.95)
    weight_decay=0.05,  # weight decay against overfitting (0.01-0.1 is typical)
    max_grad_norm=0.2,  # gradient clipping threshold (guards against exploding gradients)
    # Learning-rate schedule
    lr_scheduler_type="cosine",  # cosine decay (smooth convergence)
    warmup_ratio=0.08,  # linear learning-rate warmup over the first 8% of steps
    # Precision and memory
    bf16=is_bfloat16_supported(),  # prefer bfloat16 where the GPU supports it (Ampere/A100 and newer)
    fp16=not is_bfloat16_supported(),  # fall back to fp16 otherwise (watch mixed-precision stability)
    # Training control
    num_train_epochs=3,  # number of epochs (use either this or max_steps)
    max_prompt_length=max_prompt_length,  # maximum prompt length (longer prompts are truncated)
    max_completion_length=max_seq_length - max_prompt_length,  # maximum generated length (affects memory use)
    num_generations=16,  # completions sampled per prompt for GRPO (the group size); reduce to save VRAM
    # Logging and saving
    logging_steps=1,  # log every step (fine for debugging; increase for long runs)
    save_steps=20,  # save a checkpoint every 20 steps (adjust to your storage)
    report_to="wandb",  # report to Weights & Biases (log in beforehand)
    output_dir="outputs",  # output directory for checkpoints and logs
)
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()
# trainer.train(resume_from_checkpoint=True)  # resume training from the latest checkpoint
model.save_lora("grpo_saved_lora")
model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
One more note: in the Alibaba Cloud environment you can register the conda environment as a Jupyter kernel, so the script can be run cell by cell.
# first activate the conda environment you want to add
pip install ipykernel
python -m ipykernel install --user --name unsloth1 --display-name "Python (unsloth1)"
jupyter notebook
jupyter kernelspec list
If training is interrupted, resume from the latest checkpoint (the free Alibaba Cloud GPU disconnects every 8 hours) by adding the parameter below:
trainer.train(resume_from_checkpoint=True)
Training finally finished: on 24 GB of VRAM it took just under 12 hours for 1000 samples.
3. Merge the original base model with the fine-tuned LoRA weights
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# Load the base model
# (note: merging into a 4-bit-quantized base requires dequantization and can lose a little precision; loading in fp16/bf16 for the merge is the safer option)
model = AutoModelForCausalLM.from_pretrained("Qwen2.5-3B-Instruct/Qwen/Qwen2.5-3B-Instruct", load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained("Qwen2.5-3B-Instruct/Qwen/Qwen2.5-3B-Instruct")
# Attach the LoRA adapter with PeftModel
model = PeftModel.from_pretrained(model, "grpo_saved_lora")
# Merge the LoRA weights into the base model and unload the adapter
merged_model = model.merge_and_unload()
# Output directory
save_directory = "./merged_model"
# Save the merged model and tokenizer
merged_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
4. Run inference with the merged model
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Directory where the merged model was saved
save_directory = "./merged_model"
# Load the merged model and tokenizer
model = AutoModelForCausalLM.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)
# Move the model to the GPU if one is available, otherwise use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch the model to evaluation mode
model.eval()
# Prepare the input text
input_text = "请输入您的输入文本"  # placeholder: replace with your own prompt
# Tokenize the input and move it to the same device
inputs = tokenizer(input_text, return_tensors="pt").to(device)
# Generate
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50)
# Decode the generated output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:", generated_text)
5. Evaluate on the test set
1) The version below runs through vLLM. It errors out on my setup, but you can give it a try.
# test_math_model.py
import pandas as pd
import numpy as np
import re
from vllm import LLM, SamplingParams
from datasets import load_dataset
from transformers import AutoTokenizer
from datasets import Dataset
# 1. Configuration
MODEL_PATH = "./merged_model"  # replace with your own model path
SYSTEM_PROMPT = """
按照以下格式进行回复用户
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""
def extract_hash_answer(text: str) -> str:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()
# 2. Data loading
def load_gsm8k_data():
    dataset = load_dataset("openai/gsm8k", "main", split="test")
    return dataset.map(lambda x: {
        'question': x['question'],
        'answer': extract_gsm8k_answer(x['answer'])
    })
def get_gsm8k_questions(dataset_path: str) -> Dataset:
    # Load the data from a local .arrow file
    data = Dataset.from_file(dataset_path)
    # Shuffle and keep the first 1000 examples (or all of them if there are fewer than 1000)
    data = data.shuffle(seed=42).select(range(min(1000, len(data))))
    # Build the prompt and answer fields from question and answer
    data = data.map(lambda x: {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    })
    return data
def extract_gsm8k_answer(text: str) -> str:
    """Extract the final answer from a GSM8K reference answer."""
    return text.split("#### ")[-1].strip().replace(",", "")
# 3. Model wrapper
class MathModelTester:
    def __init__(self):
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                MODEL_PATH,
                trust_remote_code=True
            )
            self.llm = LLM(
                model=MODEL_PATH,
                tensor_parallel_size=1,
                trust_remote_code=True,
                enforce_eager=True,
                dtype="auto"
            )
            self.sampling_params = SamplingParams(
                temperature=0.1,
                max_tokens=512,
                stop_token_ids=[self.tokenizer.eos_token_id]
            )
            print("\n=== Model loaded ===")
        except Exception as e:
            print(f"Model loading failed: {str(e)}")
            raise
    def create_prompt(self, question):
        return f"""<|im_start|>system
{SYSTEM_PROMPT}
<|im_end|>
<|im_start|>user
{question}
<|im_end|>
<|im_start|>assistant
"""
    def predict(self, questions):
        try:
            prompts = [self.create_prompt(q) for q in questions]
            outputs = self.llm.generate(prompts, self.sampling_params)
            return [self._parse_output(o.outputs[0].text) for o in outputs]
        except Exception as e:
            print(f"Prediction failed: {str(e)}")
            raise
    def _parse_output(self, text):
        """Extract the answer from the model output."""
        # Lenient parsing, to tolerate formatting mistakes
        answer_section = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
        if not answer_section:
            return ""
        # Clean up the answer content
        answer = answer_section.group(1).strip()
        answer = re.sub(r"[^0-9.]", "", answer)  # drop everything except digits and "."
        return answer.split()[-1] if answer else ""  # keep the remaining number
# 4. Evaluation
def evaluate(predictions, references):
    correct = 0
    for pred, ref in zip(predictions, references):
        # Normalize: parse as float (handles scientific notation) and strip trailing zeros after the decimal point
        try:
            pred_normalized = "{:.2f}".format(float(pred)).rstrip('0').rstrip('.')
            ref_normalized = "{:.2f}".format(float(ref)).rstrip('0').rstrip('.')
        except (ValueError, TypeError):
            pred_normalized = pred
            ref_normalized = ref
        if pred_normalized == ref_normalized:
            correct += 1
    accuracy = correct / len(predictions)
    return {"accuracy": accuracy, "correct": correct, "total": len(predictions)}
# 5. Main test loop
if __name__ == "__main__":
    # Load the data
    print("=== Loading test data ===")
    # dataset = load_gsm8k_data()
    dataset_path = "main/0.0.0/e53f048856ff4f594e959d75785d2c2d37b678ee/gsm8k-test.arrow"
    dataset = get_gsm8k_questions(dataset_path)
    print(f"Loaded samples: {len(dataset)}")
    # Initialize the model
    tester = MathModelTester()
    # Predict in batches (avoids running out of memory)
    batch_size = 32
    all_preds = []
    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i+batch_size]
        preds = tester.predict(batch['question'])
        all_preds.extend(preds)
        print(f"Processed {min(i+batch_size, len(dataset))}/{len(dataset)} samples")
    # Evaluate
    results = evaluate(all_preds, dataset['answer'])
    # Print the results
    print("\n=== Test results ===")
    print(f"Correct: {results['correct']}/{results['total']}")
    print(f"Accuracy: {results['accuracy']:.4f}")
    # Save the error cases
    error_cases = []
    for i, (pred, ref) in enumerate(zip(all_preds, dataset['answer'])):
        if pred != ref:
            error_cases.append({
                "question": dataset[i]['question'],
                "prediction": pred,
                "reference": ref
            })
    pd.DataFrame(error_cases).to_csv("error_analysis.csv", index=False)
    print("Error analysis saved to error_analysis.csv")
2) Load the model with transformers and torch instead
import pandas as pd
import numpy as np
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from datasets import load_dataset, Dataset  # load_dataset is needed by load_gsm8k_data below
# 1. Configuration
MODEL_PATH = "./merged_model"  # replace with your own model path
SYSTEM_PROMPT = """
按照以下格式进行回复用户
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""
def extract_hash_answer(text: str) -> str:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()
# 2. Data loading
def load_gsm8k_data():
    dataset = load_dataset("openai/gsm8k", "main", split="test")
    return dataset.map(lambda x: {
        'question': x['question'],
        'answer': extract_gsm8k_answer(x['answer'])
    })
def get_gsm8k_questions(dataset_path: str) -> Dataset:
    # Load the data from a local .arrow file
    data = Dataset.from_file(dataset_path)
    # Shuffle and keep the first 1000 examples (or all of them if there are fewer than 1000)
    data = data.shuffle(seed=42).select(range(min(1000, len(data))))
    # Build the prompt and answer fields from question and answer
    data = data.map(lambda x: {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    })
    return data
def extract_gsm8k_answer(text: str) -> str:
    """Extract the final answer from a GSM8K reference answer."""
    return text.split("#### ")[-1].strip().replace(",", "")
# 3. Model wrapper
class MathModelTester:
    def __init__(self):
        try:
            # Load the model with transformers
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
            self.model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
            # Move the model to the GPU if one is available, otherwise use the CPU
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self.model.to(self.device)
            # Switch to evaluation mode
            self.model.eval()
            print("\n=== Model loaded ===")
        except Exception as e:
            print(f"Model loading failed: {str(e)}")
            raise
    def create_prompt(self, question):
        return f"""<|im_start|>system
{SYSTEM_PROMPT}
<|im_end|>
<|im_start|>user
{question}
<|im_end|>
<|im_start|>assistant
"""
    def predict(self, questions):
        try:
            prompts = [self.create_prompt(q) for q in questions]
            inputs = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(self.device)
            # Generate
            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_new_tokens=512)
            # Decode the generated outputs
            decoded_outputs = [self.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            return [self._parse_output(o) for o in decoded_outputs]
        except Exception as e:
            print(f"Prediction failed: {str(e)}")
            raise
    def _parse_output(self, text):
        """Extract the answer from the model output."""
        # Lenient parsing, to tolerate formatting mistakes
        answer_section = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
        if not answer_section:
            return ""
        # Clean up the answer content
        answer = answer_section.group(1).strip()
        answer = re.sub(r"[^0-9.]", "", answer)  # drop everything except digits and "."
        return answer.split()[-1] if answer else ""  # keep the remaining number
# 4. Evaluation
def evaluate(predictions, references):
    correct = 0
    for pred, ref in zip(predictions, references):
        # Normalize: parse as float (handles scientific notation) and strip trailing zeros after the decimal point
        try:
            pred_normalized = "{:.2f}".format(float(pred)).rstrip('0').rstrip('.')
            ref_normalized = "{:.2f}".format(float(ref)).rstrip('0').rstrip('.')
        except (ValueError, TypeError):
            pred_normalized = pred
            ref_normalized = ref
        if pred_normalized == ref_normalized:
            correct += 1
    accuracy = correct / len(predictions)
    return {"accuracy": accuracy, "correct": correct, "total": len(predictions)}
# 5. Main test loop
if __name__ == "__main__":
    # Load the data
    print("=== Loading test data ===")
    # dataset = load_gsm8k_data()
    dataset_path = "main/0.0.0/e53f048856ff4f594e959d75785d2c2d37b678ee/gsm8k-test.arrow"
    dataset = get_gsm8k_questions(dataset_path)
    print(f"Loaded samples: {len(dataset)}")
    # Initialize the model
    tester = MathModelTester()
    # Predict in batches (avoids running out of memory)
    batch_size = 32
    all_preds = []
    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i+batch_size]
        preds = tester.predict(batch['question'])
        all_preds.extend(preds)
        print(f"Processed {min(i+batch_size, len(dataset))}/{len(dataset)} samples")
    # Evaluate
    results = evaluate(all_preds, dataset['answer'])
    # Print the results
    print("\n=== Test results ===")
    print(f"Correct: {results['correct']}/{results['total']}")
    print(f"Accuracy: {results['accuracy']:.4f}")
    # Save the error cases
    error_cases = []
    for i, (pred, ref) in enumerate(zip(all_preds, dataset['answer'])):
        if pred != ref:
            error_cases.append({
                "question": dataset[i]['question'],
                "prediction": pred,
                "reference": ref
            })
    pd.DataFrame(error_cases).to_csv("error_analysis.csv", index=False)
    print("Error analysis saved to error_analysis.csv")
6. Run the merged model with vLLM (still fails for me)
from vllm import LLM
def main():
    # Load the model from a local path
    llm = LLM(model="merged_model")  # replace with your actual model name or path
    prompts = ["你好,今天天气怎么样?", "请介绍一下Python编程语言。"]  # sample prompts
    outputs = llm.generate(prompts)
    for output in outputs:
        print(f"Prompt: {output.prompt}")
        print(f"Generated text: {output.outputs[0].text}")
if __name__ == '__main__':
    main()
7. Run the original (pre-fine-tuning) model with vLLM
from vllm import LLM
def main():
    # Load the model from a local path
    llm = LLM(model="Qwen2.5-3B-Instruct/Qwen/Qwen2.5-3B-Instruct")  # replace with your actual model name or path
    prompts = ["你好,今天天气怎么样?", "请介绍一下Python编程语言。"]  # sample prompts
    outputs = llm.generate(prompts)
    for output in outputs:
        print(f"Prompt: {output.prompt}")
        print(f"Generated text: {output.outputs[0].text}")
if __name__ == '__main__':
    main()