Revised code incorporating the suggestions from the previous conversation:
import subprocess
import sys
# Install or upgrade the required libraries
try:
    import accelerate
    # Compare versions with packaging, not plain string comparison:
    # as strings, "0.9.0" > "0.26.0", which would give the wrong result here.
    from packaging import version
    if version.parse(accelerate.__version__) < version.parse("0.26.0"):
        raise ImportError
except ImportError:
    print("Installing the accelerate library...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "accelerate>=0.26.0"])
    # Re-import after installation; reload in case an older version was already
    # imported (a full interpreter restart is the only fully reliable option)
    import importlib
    import accelerate
    accelerate = importlib.reload(accelerate)
import pandas as pd
import torch
import numpy as np
import os
import logging
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import Dataset, DataLoader
# Set up logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
handlers=[
logging.FileHandler("model_training.log"),
logging.StreamHandler()
]
)
# Use the local model path directly
MODEL_PATH = "./models"
logging.info(f"Using local model path: {MODEL_PATH}")
logging.info(f"Model directory contents: {os.listdir(MODEL_PATH)}")
# Configuration constants (used by the dataset and TrainingArguments below)
BATCH_SIZE = 8
MAX_LENGTH = 256  # somewhat shorter sequences to save memory
NUM_EPOCHS = 5
FEATURE_COLUMNS = [
"是否水文",
"性别",
"先锋/中庸/保守",
"感性/理性",
"悦己/顾家/事业型",
"优雅/粗犷",
"高速/城乡/越野",
"智能科技/功能实用"
]
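# Note: these strings must match the Excel column headers exactly. The first two
# features are binary; the remaining six are 11-point scales (see num_labels_dict below).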
# 1. Data preprocessing
class CarReviewDataset(Dataset):
    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels
        # Load the tokenizer from the local path
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
        logging.info("Tokenizer loaded successfully")
def __len__(self):
return len(self.texts)
    def __getitem__(self, idx):
        row = self.texts[idx]  # texts is a list of dicts, one per data row
        # Concatenate the key fields into a single structured text
structured_text = (
f"标题:{row['标题']};"
f"最满意:{row['最满意']};"
f"最不满意:{row['最不满意']};"
f"性价比评价:{row['性价比']};"
f"配置功能评价:{row['配置功能']};"
f"空间评价:{row['空间']};"
f"外观评价:{row['外观']};"
f"内饰评价:{row['内饰']};"
f"驾驶感受评价:{row['驾驶感受']};"
f"油耗评价:{row['油耗']}"
)
encoding = self.tokenizer(
structured_text,
max_length=MAX_LENGTH,
truncation=True,
padding="max_length",
return_tensors="pt"
)
item = {
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten()
}
        if self.labels is not None:
            for feature in FEATURE_COLUMNS:
                # Use the label if it exists for this row
                if feature in self.labels and idx < len(self.labels[feature]):
                    item[feature] = torch.tensor(self.labels[feature][idx], dtype=torch.long)
                else:
                    # Fall back to a default for missing labels:
                    # 0 for the binary tasks, 5 (the neutral midpoint) for the 11-point scales
                    default_value = 0 if feature in ("是否水文", "性别") else 5
                    item[feature] = torch.tensor(default_value, dtype=torch.long)
        return item
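# A single dataset item is a flat dict of tensors, roughly:
#   {'input_ids': LongTensor[MAX_LENGTH], 'attention_mask': LongTensor[MAX_LENGTH],
#    '是否水文': tensor(0), ..., '智能科技/功能实用': tensor(5)}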
# 2. Multi-task model definition
class MultiTaskModel(torch.nn.Module):
def __init__(self, num_labels_dict):
super().__init__()
        # Load the base model from the local path
        logging.info("Loading base model...")
        self.base_model = AutoModel.from_pretrained(MODEL_PATH, local_files_only=True)
        logging.info("Base model loaded successfully")
        # One classification head per feature; read the hidden size from the model
        # config instead of hardcoding 768 (the value for bert-base-chinese)
        hidden_size = self.base_model.config.hidden_size
        self.heads = torch.nn.ModuleDict()
        for feature, n_labels in num_labels_dict.items():
            self.heads[feature] = torch.nn.Linear(hidden_size, n_labels)
        logging.info("Classification heads built")
def forward(self, input_ids, attention_mask):
outputs = self.base_model(
input_ids=input_ids,
attention_mask=attention_mask,
return_dict=True
)
last_hidden_state = outputs.last_hidden_state
        cls_embeddings = last_hidden_state[:, 0, :]  # the [CLS] vector
logits = {}
for feature, head in self.heads.items():
logits[feature] = head(cls_embeddings)
return logits
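# forward() returns a dict mapping each feature name to raw logits of shape
# [batch_size, n_labels]: [B, 2] for the binary tasks, [B, 11] for the scales.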
# Custom Trainer that computes the multi-task loss
class MultiTaskTrainer(Trainer):
    # **kwargs absorbs extra arguments such as num_items_in_batch,
    # which recent transformers versions pass to compute_loss
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Pull the per-task labels out of the inputs
        labels = {}
        for feature in FEATURE_COLUMNS:
            if feature in inputs:
                labels[feature] = inputs.pop(feature)
        # Forward pass
        outputs = model(**inputs)
        # Sum the cross-entropy losses over all tasks
        criterion = torch.nn.CrossEntropyLoss()
        total_loss = 0
        for feature in FEATURE_COLUMNS:
            if feature in labels:
                total_loss += criterion(outputs[feature], labels[feature])
        return (total_loss, outputs) if return_outputs else total_loss
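# All task losses are summed with equal weight. If one task dominates, a weighted sum
# is a common variant (TASK_WEIGHTS here is hypothetical, not defined in this script):
#   total_loss += TASK_WEIGHTS.get(feature, 1.0) * loss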
# 3. Label-encoding functions
def encode_pioneer_conservative(label):
    """先锋/中庸/保守 (pioneer/moderate/conservative), encoded on a 0-10 scale."""
    if pd.isna(label):
        return 5
    scale_map = {
        "纯粹先锋": 0, "非常先锋": 1, "一般先锋": 2,
        "先锋偏中庸": 3, "中庸偏先锋": 4, "中庸": 5,
        "中庸偏保守": 6, "保守偏中庸": 7, "一般保守": 8,
        "非常保守": 9, "纯粹保守": 10
    }
    return scale_map.get(label, 5)  # default to neutral
def encode_emotional_rational(label):
    """感性/理性 (emotional/rational), encoded on a 0-10 scale."""
    if pd.isna(label):
        return 5
    scale_map = {
        "纯粹感性": 0, "非常感性": 1, "一般感性": 2,
        "比较感性": 3, "偏感性": 4, "中性": 5,
        "偏理性": 6, "比较理性": 7, "一般理性": 8,
        "非常理性": 9, "纯粹理性": 10
    }
    return scale_map.get(label, 5)  # default to neutral
def encode_self_family_career(label):
    """悦己/顾家/事业型 (self-indulgent/family-oriented/career-driven), 0-10 scale."""
    if pd.isna(label):
        return 5
    scale_map = {
        "纯粹悦己": 0, "非常悦己": 1, "一般悦己": 2,
        "悦己偏顾家": 3, "顾家偏悦己": 4, "顾家": 5,
        "顾家偏事业型": 6, "事业型偏顾家": 7, "一般事业型": 8,
        "非常事业型": 9, "纯粹事业型": 10
    }
    return scale_map.get(label, 5)  # default to neutral
def encode_elegant_rugged(label):
    """优雅/粗犷 (elegant/rugged), encoded on a 0-10 scale."""
    if pd.isna(label):
        return 5
    scale_map = {
        "纯粹优雅": 0, "非常优雅": 1, "一般优雅": 2,
        "比较优雅": 3, "偏优雅": 4, "中性": 5,
        "偏粗犷": 6, "比较粗犷": 7, "一般粗犷": 8,
        "非常粗犷": 9, "纯粹粗犷": 10
    }
    return scale_map.get(label, 5)  # default to neutral
def encode_road_usage(label):
    """高速/城乡/越野 (highway/urban-rural/off-road), encoded on a 0-10 scale."""
    if pd.isna(label):
        return 5
    scale_map = {
        "纯粹高速": 0, "高速占比很多": 1, "高速占比大": 2,
        "高速城乡各一半": 3, "城乡占比多": 4,
        # "存粹" looks like a typo for "纯粹" in the source labels; accept both spellings
        "纯粹城乡": 5, "存粹城乡": 5,
        "少数时候越野": 6, "偶尔越野": 7, "较多时候越野": 8,
        "经常越野": 9, "纯粹越野": 10
    }
    return scale_map.get(label, 5)  # default to neutral
def encode_tech_practical(label):
    """智能科技/功能实用 (smart-tech/practical), encoded on a 0-10 scale."""
    if pd.isna(label):
        return 5
    scale_map = {
        "纯粹智能科技": 0, "非常智能科技": 1, "一般智能科技": 2,
        "比较智能科技": 3, "偏智能科技": 4, "中性": 5,
        "偏功能实用": 6, "比较功能实用": 7, "一般功能实用": 8,
        "非常功能实用": 9, "纯粹功能实用": 10
    }
    return scale_map.get(label, 5)  # default to neutral
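# Example mappings:
#   encode_pioneer_conservative("中庸")   -> 5
#   encode_emotional_rational("非常理性") -> 9
#   encode_road_usage(float("nan"))       -> 5  (NaN and unknown labels fall back to neutral 5)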
# 4. Data preparation
try:
logging.info("读取训练数据...")
labeled_df = pd.read_excel("训练数据.xlsx", sheet_name="Sheet1")
# 处理NaN值 - 安全处理
for col in labeled_df.columns:
if labeled_df[col].dtype == 'float64':
labeled_df[col] = labeled_df[col].fillna(0.0)
else:
labeled_df[col] = labeled_df[col].fillna('')
    # Build the text records
texts = []
for _, row in labeled_df.iterrows():
text_dict = {
"标题": str(row["标题"]),
"最满意": str(row["最满意"]),
"最不满意": str(row["最不满意"]),
"性价比": str(row["性价比"]),
"配置功能": str(row["配置功能"]),
"空间": str(row["空间"]),
"外观": str(row["外观"]),
"内饰": str(row["内饰"]),
"驾驶感受": str(row["驾驶感受"]),
"油耗": str(row["油耗"])
}
texts.append(text_dict)
logging.info(f"训练数据样本数: {len(texts)}")
    # Build the label arrays
    labels = {
        "是否水文": labeled_df["是否水文"].map(lambda x: 0 if x == "是" else 1).values,
        "性别": labeled_df["性别"].map(lambda x: 0 if x == "男" else 1).values,
        "先锋/中庸/保守": labeled_df["先锋/中庸/保守"].apply(encode_pioneer_conservative).values,
        "感性/理性": labeled_df["感性/理性"].apply(encode_emotional_rational).values,
        "悦己/顾家/事业型": labeled_df["悦己/顾家/事业型"].apply(encode_self_family_career).values,
        "优雅/粗犷": labeled_df["优雅/粗犷"].apply(encode_elegant_rugged).values,
        "高速/城乡/越野": labeled_df["高速/城乡/越野"].apply(encode_road_usage).values,
        "智能科技/功能实用": labeled_df["智能科技/功能实用"].apply(encode_tech_practical).values,
    }
    # Train/validation split
indices = np.arange(len(texts))
train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)
train_texts = [texts[i] for i in train_indices]
val_texts = [texts[i] for i in val_indices]
train_labels = {feature: labels[feature][train_indices] for feature in FEATURE_COLUMNS}
val_labels = {feature: labels[feature][val_indices] for feature in FEATURE_COLUMNS}
logging.info(f"训练集大小: {len(train_texts)}, 验证集大小: {len(val_texts)}")
except Exception as e:
logging.error(f"数据加载失败: {str(e)}")
raise
# 5. Training configuration
num_labels_dict = {
"是否水文": 2,
"性别": 2,
"先锋/中庸/保守": 11,
"感性/理性": 11,
"悦己/顾家/事业型": 11,
"优雅/粗犷": 11,
"高速/城乡/越野": 11,
"智能科技/功能实用": 11,
}
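# 2 classes for the binary tasks; 11 classes matching the encoders' 0-10 scales above.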
# Create the model
logging.info("Creating the multi-task model...")
model = MultiTaskModel(num_labels_dict)
# Custom evaluation metrics. With label_names set in TrainingArguments below, the
# Trainer hands compute_metrics the labels as a tuple in FEATURE_COLUMNS order, and
# the model's logits dict is likewise flattened to a tuple in the same order (the
# heads are created in FEATURE_COLUMNS order), so the two can be matched by position.
def compute_metrics(eval_pred):
    logits_tuple, labels_tuple = eval_pred
    metrics = {}
    for i, feature in enumerate(FEATURE_COLUMNS):
        preds = np.argmax(logits_tuple[i], axis=1)
        true_labels = labels_tuple[i]
        metrics[f"{feature}_accuracy"] = accuracy_score(true_labels, preds)
        metrics[f"{feature}_f1"] = f1_score(true_labels, preds, average="weighted")
    return metrics
# Training arguments (eval_strategy is the current parameter name; transformers >= 4.41
# renamed the older evaluation_strategy)
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",            # evaluate every eval_steps steps
    eval_steps=500,
    save_strategy="epoch",            # save a checkpoint at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    report_to="tensorboard",
    label_names=FEATURE_COLUMNS,      # tell the Trainer which input keys are labels
)
# Create the Trainer
logging.info("Creating the Trainer...")
trainer = MultiTaskTrainer(
model=model,
args=training_args,
train_dataset=CarReviewDataset(train_texts, train_labels),
eval_dataset=CarReviewDataset(val_texts, val_labels),
compute_metrics=compute_metrics
)
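# No custom data_collator is needed: every item is already padded to MAX_LENGTH,
# so the default collator can simply stack the tensors.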
# 6. Train the model
logging.info("Starting training...")
trainer.train()
logging.info("Training finished!")
# Save the model; for this plain nn.Module, Trainer.save_model stores just the state dict
model_save_path = "./saved_model"
os.makedirs(model_save_path, exist_ok=True)
trainer.save_model(model_save_path)
logging.info(f"Model saved to: {model_save_path}")
# 7. Predict on the unlabeled data
try:
logging.info("读取验证数据...")
unlabeled_df = pd.read_excel("验证数据.xlsx", sheet_name="Sheet1")
# 处理NaN值
for col in unlabeled_df.columns:
if unlabeled_df[col].dtype == 'float64':
unlabeled_df[col] = unlabeled_df[col].fillna(0.0)
else:
unlabeled_df[col] = unlabeled_df[col].fillna('')
    # Build the text records
unlabeled_texts = []
for _, row in unlabeled_df.iterrows():
text_dict = {
"标题": str(row["标题"]),
"最满意": str(row["最满意"]),
"最不满意": str(row["最不满意"]),
"性价比": str(row["性价比"]),
"配置功能": str(row["配置功能"]),
"空间": str(row["空间"]),
"外观": str(row["外观"]),
"内饰": str(row["内饰"]),
"驾驶感受": str(row["驾驶感受"]),
"油耗": str(row["油耗"])
}
unlabeled_texts.append(text_dict)
logging.info(f"验证数据样本数: {len(unlabeled_texts)}")
    # Build the prediction dataset; a larger batch is fine here since no gradients are kept
    predict_dataset = CarReviewDataset(unlabeled_texts)
    predict_loader = DataLoader(predict_dataset, batch_size=16, shuffle=False)
    # Run inference with the trained model
model.eval()
predictions = {feature: [] for feature in FEATURE_COLUMNS}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
logging.info(f"使用设备: {device}")
with torch.no_grad():
for batch in predict_loader:
inputs = {
'input_ids': batch['input_ids'].to(device),
'attention_mask': batch['attention_mask'].to(device)
}
logits = model(**inputs)
for feature in FEATURE_COLUMNS:
preds = torch.argmax(logits[feature], dim=1)
predictions[feature].extend(preds.cpu().numpy())
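    # predictions now holds integer class ids (0/1 for the binary tasks, 0-10 for the
    # scales); mapping them back to the original text labels would need the inverse of
    # each scale_map above. The raw ids are written out below.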
    # 8. Save the results
for feature in FEATURE_COLUMNS:
unlabeled_df[f"预测_{feature}"] = predictions[feature]
unlabeled_df.to_excel("标注结果.xlsx", index=False)
logging.info("预测结果已保存到: 标注结果.xlsx")
except Exception as e:
logging.error(f"预测过程中出错: {str(e)}")
raise
logging.info("程序执行完毕!")