Project background: This project shows how to instruction-fine-tune Qwen2.5 for text classification. Fine-tuning improves an LLM's ability to follow human instructions by training it on a task-specific dataset. Here, the Qwen2.5-3B-Instruct model is fine-tuned on an "incident center" dataset, with SwanLab used for monitoring and visualization. The environment requires Python 3.8+ and an NVIDIA GPU. The steps cover installing the required libraries, preparing the data, loading the model, configuring the training visualization tool, and running the full training code. After training, a few examples are run to verify model performance.
Environment requirements:
swanlab
modelscope
transformers
datasets
peft
accelerate
pandas
scikit-learn (used by the training script for train_test_split)
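The project does not pin versions, so a plain pip install of the latest releases is assumed to work:
pip install swanlab modelscope transformers datasets peft accelerate pandas scikit-learn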
Implementation steps:
1. Data preparation
Remove dirty rows and any row whose first-level category is "其他" (Other), then save the result as a CSV file. Each processed row contains a text field, a one_category field listing the candidate classes, and a one_output label, as used by the training script later; a cleaning sketch follows.
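The cleaning code is not part of the original project; this is a minimal sketch, assuming the raw export already has the text / one_category / one_output columns used later, that "dirty" rows are those with missing fields, and that the raw file name is a placeholder:
import pandas as pd

df = pd.read_csv("raw_incident_data.csv")  # hypothetical raw export
# Drop dirty rows with missing fields
df = df.dropna(subset=["text", "one_category", "one_output"])
# Drop rows whose first-level class is "其他" (Other)
df = df[df["one_output"] != "其他"]
df.to_csv("/data/ai/wjh_team/incident/qw_train_data.csv", index=False)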
2. Load the model
from modelscope import snapshot_download, AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Load the model weights with Transformers
model_name = "/data/ai/wjh_team/Qwen2.5-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
3. Configure the training visualization tool
We use SwanLab to monitor the entire training run and to evaluate the final model. Register an account at https://swanlab.cn and copy your API Key from the user settings page; it is needed at training time, as in the login sketch below.
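A minimal login sketch, assuming the standard SwanLab login flow (the key string is a placeholder):
# Option 1: one-off login from the shell, which prompts for the key:
#   swanlab login
# Option 2: log in from Python with the API Key copied from the settings page:
import swanlab
swanlab.login(api_key="YOUR_API_KEY")  # placeholder; paste your real key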
4. Model training
The full training script is shown below:
import json
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from modelscope import snapshot_download, AutoTokenizer
from swanlab.integration.huggingface import SwanLabCallback
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import os
import swanlab


def dataset_csv_transfer(origin_path, new_train_path, new_test_path):
    """
    Convert the CSV dataset into the instruction format required for
    fine-tuning, and split it into a training set and a test set.
    """
    messages = []
    # Read the CSV file
    df = pd.read_csv(origin_path)
    # Split into training and test sets
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    # Build the training set
    for _, row in train_df.iterrows():
        context = row["text"]
        category = row["one_category"]
        label = row["one_output"]
        message = {
            "instruction": "你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项,请输出文本内容的正确类型",
            "input": f"文本:{context},类型选项:{category}",
            "output": label,
        }
        messages.append(message)
    # Save the rebuilt training set
    with open(new_train_path, "w", encoding="utf-8") as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")
    # Reset the message list for the test set
    messages = []
    # Build the test set
    for _, row in test_df.iterrows():
        context = row["text"]
        category = row["one_category"]
        label = row["one_output"]
        message = {
            "instruction": "你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项,请输出文本内容的正确类型",
            "input": f"文本:{context},类型选项:{category}",
            "output": label,
        }
        messages.append(message)
    # Save the rebuilt test set
    with open(new_test_path, "w", encoding="utf-8") as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")


def process_func(example):
    """
    Tokenize one sample into the ChatML format Qwen2.5 expects, and build
    the label mask so that only the answer tokens contribute to the loss.
    """
    MAX_LENGTH = 384
    instruction = tokenizer(
        f"<|im_start|>system\n你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项,请输出文本内容的正确类型<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    # Mask the prompt tokens with -100 so the loss only covers the answer
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # truncate over-long samples
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


def predict(messages, model, tokenizer):
    device = "cuda"
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512,
    )
    # Strip the prompt tokens, keeping only the newly generated answer
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(response)
    return response


if __name__ == '__main__':
    # Load the model
    model_name = "/data/ai/wjh_team/Qwen2.5-3B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.enable_input_require_grads()  # required when gradient checkpointing is enabled

    # Load and process the dataset
    origin_csv_path = "/data/ai/wjh_team/incident/qw_train_data.csv"
    # Save as JSONL for later use
    train_jsonl_new_path = "one_train.jsonl"
    test_jsonl_new_path = "one_test.jsonl"
    if not os.path.exists(train_jsonl_new_path) or not os.path.exists(test_jsonl_new_path):
        dataset_csv_transfer(origin_csv_path, train_jsonl_new_path, test_jsonl_new_path)

    # Build the training set
    train_df = pd.read_json(train_jsonl_new_path, lines=True)
    train_ds = Dataset.from_pandas(train_df)
    train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)

    config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        inference_mode=False,  # training mode
        r=8,  # LoRA rank
        lora_alpha=32,  # LoRA alpha; see the LoRA paper for its role
        lora_dropout=0.1,  # dropout rate
    )
    model = get_peft_model(model, config)

    args = TrainingArguments(
        output_dir="/data/ai/wjh_team/Qwen2.5_output",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        logging_steps=10,
        num_train_epochs=2,
        save_steps=100,
        learning_rate=1e-4,
        save_on_each_node=True,
        gradient_checkpointing=True,
        report_to="none",
    )

    swanlab_callback = SwanLabCallback(
        project="Qwen2.5-fintune",
        experiment_name="Qwen2.5_output",
        description="Fine-tune Qwen2.5-3B-Instruct on the incident-center text classification dataset (second-level classification).",
        config={
            "model": model_name,
            "dataset": origin_csv_path,
        },
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
        callbacks=[swanlab_callback],
    )
    trainer.train()

    # Sanity-check the model on the first 10 test samples
    test_df = pd.read_json(test_jsonl_new_path, lines=True)[:10]
    test_text_list = []
    for index, row in test_df.iterrows():
        instruction = row['instruction']
        input_value = row['input']
        messages = [
            {"role": "system", "content": f"{instruction}"},
            {"role": "user", "content": f"{input_value}"},
        ]
        response = predict(messages, model, tokenizer)
        messages.append({"role": "assistant", "content": f"{response}"})
        result_text = f"{messages[0]}\n\n{messages[1]}\n\n{messages[2]}"
        test_text_list.append(swanlab.Text(result_text, caption=response))

    swanlab.log({"Prediction": test_text_list})
    swanlab.finish()
Once the training log and the SwanLab run link appear, training has started.
The run takes roughly 16 GB of GPU memory; a lower-memory variant follows.
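If 16 GB is not available, one common adjustment is to shrink the per-device micro-batch while raising gradient accumulation so the effective batch size (4 × 4 = 16 above) is unchanged, trading speed for memory. A hedged variant of the TrainingArguments above:
args = TrainingArguments(
    output_dir="/data/ai/wjh_team/Qwen2.5_output",
    per_device_train_batch_size=1,   # smaller micro-batch lowers activation memory
    gradient_accumulation_steps=16,  # 1 x 16 keeps the effective batch size at 16
    gradient_checkpointing=True,     # already enabled above; trades compute for memory
    logging_steps=10,
    num_train_epochs=2,
    save_steps=100,
    learning_rate=1e-4,
    report_to="none",
)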
5. Training results
View the final training results on SwanLab:
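To reuse the fine-tuned model outside the training script, the LoRA adapter saved under output_dir can be attached to the base model with peft. A minimal sketch; the checkpoint-600 directory name is hypothetical (use whichever checkpoint save_steps actually produced):
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

model_name = "/data/ai/wjh_team/Qwen2.5-3B-Instruct"
base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Attach the LoRA adapter saved by the Trainer (checkpoint number is hypothetical)
model = PeftModel.from_pretrained(base, "/data/ai/wjh_team/Qwen2.5_output/checkpoint-600")
model = model.merge_and_unload()  # optional: fold the adapter into the base weights

messages = [
    {"role": "system", "content": "你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项,请输出文本内容的正确类型"},
    {"role": "user", "content": "文本:...,类型选项:..."},  # fill in a real sample
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer([text], return_tensors="pt").to(model.device)
out = model.generate(inputs.input_ids, max_new_tokens=64)
print(tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))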