import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# Data preprocessing (unchanged)
def load_and_preprocess_data():
    comments = pd.read_csv(r'D:\BaiduNetdiskDownload\电影数据集-CSV格式\comments.csv')
    if 'RATING' not in comments.columns:
        raise KeyError("RATING column not found")
    comments['CONTENT'] = comments['CONTENT'].fillna('').astype(str)
    comments['CLEAN_CONTENT'] = comments['CONTENT'].apply(
        lambda x: re.sub(r'[^\w\s]', '', x.lower())
    )
    # Map 1-5 star ratings to 0-4 class labels
    comments['LABEL'] = comments['RATING'] - 1
    valid_labels = comments['LABEL'].between(0, 4)
    comments = comments[valid_labels].copy()
    comments['LABEL'] = comments['LABEL'].astype(np.int32)
    # Weight each comment by log(1 + votes) so heavily upvoted reviews count more
    comments['WEIGHT'] = np.log1p(comments['VOTES']) + 1
    return comments[['CLEAN_CONTENT', 'LABEL', 'WEIGHT']]
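# Optional sanity check (illustrative; assumes the CSV above is available):
# data = load_and_preprocess_data()
# print(data['LABEL'].value_counts().sort_index())  # class balance over the 5 ratings
# print(data['WEIGHT'].describe())                  # spread of the vote-based weights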
# Custom dataset (unchanged)
class SentimentDataset(Dataset):
    def __init__(self, sequences, labels, weights):
        self.sequences = torch.LongTensor(sequences)
        self.labels = torch.LongTensor(labels)
        self.weights = torch.FloatTensor(weights)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx], self.weights[idx]
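# Illustrative usage with dummy arrays (hypothetical shapes, matching maxlen=200 below):
# dummy_seqs = np.zeros((4, 200), dtype=np.int64)
# dummy_labels = np.array([0, 1, 2, 3])
# dummy_weights = np.ones(4, dtype=np.float32)
# ds = SentimentDataset(dummy_seqs, dummy_labels, dummy_weights)
# print(len(ds), ds[0][0].shape)  # -> 4 torch.Size([200])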
# LSTM model (unchanged)
class BiLSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 5)
        )

    def forward(self, x):
        x = self.embedding(x)
        out, (hn, cn) = self.lstm(x)
        # Concatenate the final forward and backward hidden states
        hn = torch.cat((hn[-2], hn[-1]), dim=1)
        return self.fc(self.dropout(hn))
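# Quick shape check, a sketch assuming vocab_size=50000 and maxlen=200 as used below:
# m = BiLSTMSentiment(vocab_size=50000)
# dummy = torch.zeros(2, 200, dtype=torch.long)
# print(m(dummy).shape)  # expected torch.Size([2, 5]): one logit per rating class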
# Modified training function (adds early stopping and accuracy tracking)
def train_model(model, train_loader, val_loader, optimizer, epochs=10):
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    criterion = nn.CrossEntropyLoss(reduction='none')
    best_acc = 0.0
    early_stopping_counter = 0
    patience = 3  # early-stopping patience

    for epoch in range(epochs):
        # Training phase
        model.train()
        epoch_loss = 0.0
        epoch_correct = 0
        total_samples = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}', leave=False)
        for seq, labels, weights in progress_bar:
            seq, labels, weights = seq.to(device), labels.to(device), weights.to(device)
            optimizer.zero_grad()
            outputs = model(seq)
            # Per-sample loss scaled by the vote-based weights
            loss = (criterion(outputs, labels) * weights).mean()
            loss.backward()
            optimizer.step()
            # Compute training accuracy
            preds = outputs.argmax(dim=1)
            correct = (preds == labels).sum().item()
            epoch_correct += correct
            epoch_loss += loss.item() * seq.size(0)
            total_samples += seq.size(0)
            progress_bar.set_postfix({
                'loss': loss.item(),
                'acc': f"{correct/seq.size(0):.2f}"
            })

        # Validation phase
        val_acc, val_loss = evaluate(model, val_loader)
        train_loss = epoch_loss / total_samples
        train_acc = epoch_correct / total_samples

        # Record history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        # Print epoch results
        print(f"\nEpoch {epoch+1} Summary:")
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

        # Early-stopping logic: checkpoint whenever validation accuracy improves
        if val_acc > best_acc:
            best_acc = val_acc
            early_stopping_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
            print(f"🚀 New best model saved with accuracy: {best_acc:.4f}")
        else:
            early_stopping_counter += 1
            print(f"⏳ Early stopping counter: {early_stopping_counter}/{patience}")

        # Check the early-stopping condition
        if early_stopping_counter >= patience:
            print(f"⛔ Early stopping triggered after {epoch+1} epochs!")
            break

    # Visualize the training process
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history['train_loss'], label='Train')
    plt.plot(history['val_loss'], label='Validation')
    plt.title('Loss Curve')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(history['train_acc'], label='Train')
    plt.plot(history['val_acc'], label='Validation')
    plt.title('Accuracy Curve')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.tight_layout()
    plt.savefig('training_curves.png')
    plt.close()
    return model, history
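# Note: train_model returns the model in its last-epoch state, while early
# stopping saved the best checkpoint to 'best_model.pth'. To evaluate the best
# weights instead, reload them (a sketch, assuming the file was written):
# model.load_state_dict(torch.load('best_model.pth', map_location=device))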
# Evaluation function (unchanged)
def evaluate(model, loader):
    model.eval()
    total_loss, total_correct = 0, 0
    criterion = nn.CrossEntropyLoss()
    with torch.no_grad():
        for seq, labels, _ in loader:
            seq, labels = seq.to(device), labels.to(device)
            outputs = model(seq)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * seq.size(0)
            preds = outputs.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
    avg_loss = total_loss / len(loader.dataset)
    accuracy = total_correct / len(loader.dataset)
    return accuracy, avg_loss
if __name__ == "__main__":
    # Data preparation pipeline (unchanged)
    data = load_and_preprocess_data()

    # Text vectorization
    tokenizer = Tokenizer(num_words=50000, oov_token="<OOV>")
    tokenizer.fit_on_texts(data['CLEAN_CONTENT'])
    sequences = pad_sequences(
        tokenizer.texts_to_sequences(data['CLEAN_CONTENT']),
        maxlen=200, padding='post', truncating='post'
    )
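    # Note: Tokenizer splits on whitespace, so unsegmented Chinese text is
    # treated as one token per sentence; a word segmenter such as jieba is
    # usually applied first. Illustration (hypothetical ids, they depend on
    # the fitted corpus):
    # tokenizer.texts_to_sequences(['好 电影'])  ->  e.g. [[15, 8]]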
    # Dataset split: 70% train, 15% validation, 15% test
    X_train, X_temp, y_train, y_temp, w_train, w_temp = train_test_split(
        sequences, data['LABEL'].values, data['WEIGHT'].values,
        test_size=0.3, random_state=42
    )
    X_val, X_test, y_val, y_test, w_val, w_test = train_test_split(
        X_temp, y_temp, w_temp, test_size=0.5, random_state=42
    )

    # Data loaders
    train_loader = DataLoader(
        SentimentDataset(X_train, y_train, w_train),
        batch_size=512, shuffle=True
    )
    val_loader = DataLoader(
        SentimentDataset(X_val, y_val, w_val),
        batch_size=512
    )
    test_loader = DataLoader(
        SentimentDataset(X_test, y_test, w_test),
        batch_size=512
    )

    # Model initialization
    model = BiLSTMSentiment(vocab_size=50000).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training pipeline
    trained_model, history = train_model(
        model, train_loader, val_loader, optimizer, epochs=15
    )

    # Final test
    test_acc, test_loss = evaluate(trained_model, test_loader)
    print(f"\n🎯 Final Test Results:")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test Loss: {test_loss:.4f}")

    # Save the model together with the tokenizer config and training history
    torch.save({
        'model_state': trained_model.state_dict(),
        'tokenizer_config': tokenizer.to_json(),
        'history': history
    }, 'sentiment_model.pth')
    print("Model saved with training history")
Based on this code, please provide an optimized complete version: do not delete or modify the original code, only add new optimization code, and give the complete code.