import os
import pandas as pd
import numpy as np
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    logging as hf_logging
)
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import warnings
# Suppress unnecessary warnings
warnings.filterwarnings("ignore")
hf_logging.set_verbosity_error()
# Global configuration
CONFIG = {
    "model_name": "bert-base-uncased",
    "cache_dir": "./hf_cache",          # model cache directory
    "max_length": 128,
    "batch_size": 32,
    "gradient_accumulation_steps": 2,   # gradient accumulation
    "num_epochs": 3,
    "proxy": None,                      # e.g. "http://127.0.0.1:1080"
    "local_files_only": False           # whether to force use of locally cached files
}
class RobustCommentDataset(Dataset):
    def __init__(self, texts, labels, weights):
        self.texts = texts
        self.labels = labels
        self.weights = weights
        # Initialize the tokenizer (with retry logic)
        self.tokenizer = self._init_tokenizer()
        self.max_len = CONFIG["max_length"]

    def _init_tokenizer(self, retries=3):
        for i in range(retries):
            try:
                return BertTokenizer.from_pretrained(
                    CONFIG["model_name"],
                    cache_dir=CONFIG["cache_dir"],
                    local_files_only=CONFIG["local_files_only"]
                )
            except Exception as e:
                if i == retries - 1:
                    raise RuntimeError(f"Failed to load tokenizer after {retries} attempts: {str(e)}")
                print(f"Tokenizer loading failed, retrying ({i+1}/{retries})...")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
            "weights": torch.tensor(self.weights[idx], dtype=torch.float)
        }
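
# Each item is a dict of fixed-size tensors (with max_length=128):
#   {"input_ids": LongTensor of shape [128], "attention_mask": LongTensor [128],
#    "labels": scalar LongTensor, "weights": scalar FloatTensor}
# The extra "weights" entry is consumed by WeightedTrainer below, not by the model.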
def safe_preprocess_data(file_path):
    # Read the large file in chunks via an iterator
    chunk_iter = pd.read_csv(file_path, chunksize=50000)
    dfs = []
    for chunk in tqdm(chunk_iter, desc="Processing data"):
        # Clean the data: drop rows missing a rating or comment text
        chunk = chunk.dropna(subset=["RATING", "CONTENT"])
        # Map ratings to three classes: >=4 positive, ==3 neutral, <=2 negative
        chunk["label"] = chunk["RATING"].apply(
            lambda x: 2 if x >= 4 else 1 if x == 3 else 0
        )
        # Use vote counts as sample weights (+1 so zero-vote rows still count)
        chunk["weight"] = chunk["VOTES"].fillna(0) + 1
        dfs.append(chunk)
    full_df = pd.concat(dfs)
    # Stratified split keeps the class distribution consistent across train/test
    train_df, test_df = train_test_split(
        full_df,
        test_size=0.2,
        stratify=full_df["label"],
        random_state=42
    )
    return train_df, test_df
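
# Example: a row with RATING=5 and VOTES=4 becomes label 2 (positive) with
# weight 5.0; RATING=3 -> label 1 (neutral); RATING<=2 -> label 0 (negative).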
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        # Pop the custom "weights" key before the forward pass:
        # BertForSequenceClassification.forward() does not accept it
        weights = inputs.pop("weights")
        outputs = model(**inputs)
        logits = outputs.logits
        # Per-sample cross entropy, then a vote-weighted mean
        loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        weighted_loss = (loss * weights).mean()
        return (weighted_loss, outputs) if return_outputs else weighted_loss
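
# Example: with per-sample losses [0.2, 1.0] and weights [1.0, 3.0], the batch
# loss is (0.2 * 1.0 + 1.0 * 3.0) / 2 = 1.6, so heavily voted comments pull
# on the gradient harder than lightly voted ones.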
def load_model_with_retry():
    for i in range(3):
        try:
            return BertForSequenceClassification.from_pretrained(
                CONFIG["model_name"],
                num_labels=3,
                problem_type="single_label_classification",
                cache_dir=CONFIG["cache_dir"],
                local_files_only=CONFIG["local_files_only"]
            )
        except Exception as e:
            if i == 2:
                print("""
[Error] Model loading failed. Please try:
1. Download the model manually: git clone https://huggingface.co/bert-base-uncased
2. Set local_files_only=True
3. Check your network connection or proxy settings
""")
                raise
            print(f"Model loading failed, retrying ({i+1}/3)...")
def configure_training():
    return TrainingArguments(
        output_dir="./results",
        num_train_epochs=CONFIG["num_epochs"],
        per_device_train_batch_size=CONFIG["batch_size"],
        per_device_eval_batch_size=CONFIG["batch_size"],
        gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=50,                  # record the training loss every 50 steps
        fp16=torch.cuda.is_available(),    # mixed precision only when a GPU is present
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="none"
    )
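
# Note: with per-device batch_size 32 and gradient_accumulation_steps 2, the
# effective batch size is 32 * 2 = 64 per optimizer step: gradients from two
# forward/backward passes are accumulated before each parameter update.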
def main():
    # Configure the proxy if one is set
    if CONFIG["proxy"]:
        os.environ["HTTP_PROXY"] = CONFIG["proxy"]
        os.environ["HTTPS_PROXY"] = CONFIG["proxy"]

    # Data preprocessing
    train_df, test_df = safe_preprocess_data("D:\\BaiduNetdiskDownload\\电影数据集-CSV格式\\comments.csv")

    # Build the datasets
    train_dataset = RobustCommentDataset(
        train_df["CONTENT"].values,
        train_df["label"].values,
        train_df["weight"].values
    )
    test_dataset = RobustCommentDataset(
        test_df["CONTENT"].values,
        test_df["label"].values,
        test_df["weight"].values
    )

    # Load the model
    model = load_model_with_retry()

    # Training configuration
    training_args = configure_training()

    # Initialize the trainer; passing the tokenizer lets Trainer save it
    # alongside the model checkpoints
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=train_dataset.tokenizer,
        compute_metrics=lambda p: {
            "accuracy": (np.argmax(p.predictions, axis=1) == p.label_ids).mean()
        }
    )

    # Start training
    try:
        trainer.train()
    except KeyboardInterrupt:
        print("\nTraining interrupted, saving current progress...")
        trainer.save_model("./interrupted_model")

    # Save the final model and tokenizer
    trainer.save_model("./sentiment_model")
    train_dataset.tokenizer.save_pretrained("./sentiment_model")

if __name__ == "__main__":
    main()
I need to see how the loss and accuracy change over the course of training.
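
With logging_steps set in TrainingArguments and evaluation_strategy="epoch", the Trainer already records both curves: training-loss entries and each epoch's eval_accuracy accumulate in trainer.state.log_history. Below is a minimal sketch of pulling those curves out and plotting them; it assumes matplotlib is installed, and the helper name plot_training_history is illustrative rather than part of the script above. Calling plot_training_history(trainer) right after trainer.train() in main() writes both curves to disk.

import matplotlib.pyplot as plt

def plot_training_history(trainer, out_path="./training_curves.png"):
    # trainer.state.log_history is a list of dicts; training entries carry a
    # "loss" key, evaluation entries an "eval_accuracy" key
    history = trainer.state.log_history
    train_steps = [h["step"] for h in history if "loss" in h]
    train_loss = [h["loss"] for h in history if "loss" in h]
    eval_epochs = [h["epoch"] for h in history if "eval_accuracy" in h]
    eval_acc = [h["eval_accuracy"] for h in history if "eval_accuracy" in h]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(train_steps, train_loss)
    ax1.set_xlabel("step")
    ax1.set_ylabel("training loss")
    ax2.plot(eval_epochs, eval_acc, marker="o")
    ax2.set_xlabel("epoch")
    ax2.set_ylabel("eval accuracy")
    fig.tight_layout()
    fig.savefig(out_path)
    print(f"Saved training curves to {out_path}")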