# Train a large model: fine-tune Qwen2.5-0.5B for multi-class text classification.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict
# 1. Load the Qwen2.5-0.5B pretrained model and tokenizer.
model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Mapping from text labels in the CSV to integer class ids.
label_mapping = {"positive": 0, "negative": 1, "neutral": 2}

# Number of classes for the classification head. Derived from label_mapping
# so the head size and the label encoding can never drift apart — the
# original hard-coded 5 while only 3 labels were actually mapped, which
# would silently waste head capacity (or crash if a 4th/5th id never occurs
# in metrics expecting all classes).
num_labels = len(label_mapping)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Qwen2.5 tokenizers ship without a pad token, but padding="max_length"
# (used in preprocessing) and sequence-classification batching require one.
# Reuse EOS as PAD — the standard workaround for decoder-only LMs.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
# 2. Load and preprocess the data
def preprocess_function(examples):
    """Tokenize the text and encode string label(s) as integer class ids.

    Generalized to work both per-example (the ``datasets.map`` default,
    where ``examples["label"]`` is a single string) and batched
    (``datasets.map(..., batched=True)``, where it is a list of strings).
    The original only handled the per-example case and would raise
    ``TypeError: unhashable type: 'list'`` in batched mode.

    Args:
        examples: Mapping with a "text" field (str or list[str]) and a
            "label" field (str or list[str]) — the raw CSV columns.

    Returns:
        The tokenizer output (input_ids, attention_mask, ...) with an
        added integer "label" field.
    """
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    labels = examples["label"]
    if isinstance(labels, list):
        # Batched call: encode every label in the batch.
        tokenized["label"] = [label_mapping[label] for label in labels]
    else:
        # Per-example call: single string label.
        tokenized["label"] = label_mapping[labels]
    return tokenized
# Example dataset files (replace "train.csv" / "test.csv" with your paths).
# load_dataset("csv", ...) places everything under a "train" split, so each
# file's "train" split is extracted and re-keyed into the DatasetDict.
_train_split = load_dataset("csv", data_files="train.csv")["train"]
_test_split = load_dataset("csv", data_files="test.csv")["train"]
raw_datasets = DatasetDict({"train": _train_split, "test": _test_split})
# Preprocess the data
tokenized_datasets = raw_datasets.map(preprocess