import warnings
warnings.filterwarnings('ignore')  # suppress warnings globally

# torchtext must be told to silence its deprecation notice before any of its
# submodules are imported, otherwise the warning is emitted anyway.
import torchtext
torchtext.disable_torchtext_deprecation_warning()

import time
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe
from sklearn.metrics import confusion_matrix, classification_report

# Local dataset path (raw string so the backslash is not treated as an escape)
data_path = r"C:\sdxx"

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Use the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# 1. Data preprocessing
tokenizer = get_tokenizer('basic_english')
train_iter = IMDB(root=data_path, split='train')

# Inspect the dataset format: each record is a (label, text) pair
first_example = next(iter(train_iter))
print(f"Sample record: {first_example}")

# Build the vocabulary from the training split
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), min_freq=5, specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)  # used later when constructing the models
# Load pretrained GloVe vectors (300-d); tokens absent from GloVe are mapped
# to zero vectors by get_vecs_by_tokens
print("Loading pretrained GloVe vectors...")
vectors = GloVe(name='6B', dim=300)
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())
print(f"Pretrained embedding shape: {pretrained_embedding.shape}")
# Hyperparameters (tuned configuration)
MAX_SEQ_LEN = 256        # maximum sequence length
EMBEDDING_DIM = 300
HIDDEN_DIM = 192         # hidden size
OUTPUT_DIM = 1
BATCH_SIZE = 32          # smaller batches add gradient noise
EPOCHS = 15              # maximum number of epochs
LR = 5e-4                # initial learning rate
DROPOUT = 0.6
accumulation_steps = 4   # gradient accumulation steps
LSTM_LAYERS = 3
WEIGHT_DECAY = 5e-4
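# Note: with gradient accumulation, each optimizer update sees an effective
# batch of BATCH_SIZE * accumulation_steps = 32 * 4 = 128 examples.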
# Batch collation: dynamic-length truncation/padding plus label mapping
def collate_batch(batch):
    # Map the raw IMDB labels (1 = negative, 2 = positive) to 0/1
    labels = [1 if example[0] == 2 else 0 for example in batch]
    labels = torch.tensor(labels).to(device)
    # Tokenize the texts
    texts = [tokenizer(example[1]) for example in batch]
    # Truncate to a dynamic length based on the batch's average length
    avg_len = sum(len(t) for t in texts) // len(texts)
    dynamic_len = min(avg_len + 25, MAX_SEQ_LEN)
    texts = [vocab(text) for text in texts]
    texts = [torch.tensor(text[:dynamic_len]) for text in texts]
    texts = torch.nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=vocab["<pad>"]).to(device)
    return texts, labels
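# Worked example (illustrative): for tokenized lengths [40, 90, 300] in one batch,
# avg_len = (40 + 90 + 300) // 3 = 143 and dynamic_len = min(143 + 25, 256) = 168,
# so the 300-token review is truncated to 168 tokens and the other two are padded
# with <pad> up to the longest kept length, giving a [3, 168] tensor.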
# Reload the training and test splits
train_iter = IMDB(root=data_path, split='train')
test_iter = IMDB(root=data_path, split='test')

# Materialize the iterators (add data augmentation here if desired)
train_dataset = list(train_iter)
test_dataset = list(test_iter)

# Split off a validation set (80/20)
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
# Inspect the label distribution
def check_data_distribution():
    train_labels = [label for (label, _) in train_dataset]
    val_labels = [label for (label, _) in val_dataset]
    test_labels = [label for (label, _) in test_dataset]
    # Fraction of positive examples (raw label 2)
    train_pos_ratio = sum(1 for lbl in train_labels if lbl == 2) / len(train_labels)
    val_pos_ratio = sum(1 for lbl in val_labels if lbl == 2) / len(val_labels)
    test_pos_ratio = sum(1 for lbl in test_labels if lbl == 2) / len(test_labels)
    print(f"Train size: {len(train_dataset)}, positive ratio: {train_pos_ratio:.4f}")
    print(f"Val size: {len(val_dataset)}, positive ratio: {val_pos_ratio:.4f}")
    print(f"Test size: {len(test_dataset)}, positive ratio: {test_pos_ratio:.4f}")
    # Inspect one batch
    batch = next(iter(train_loader))
    texts, labels = batch
    print(f"Sample batch - texts: {texts.shape}, labels: {labels.shape}")
    print(f"Sample labels: {labels[:10].cpu().numpy()}")

check_data_distribution()
# 2. Model: bidirectional LSTM with additive attention
class AttentionLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.5, pretrained=None, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Initialize from pretrained vectors and fine-tune them
        if pretrained is not None:
            self.embedding.weight.data.copy_(pretrained)
            self.embedding.weight.requires_grad = True
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
            num_layers=num_layers,
            dropout=dropout  # dropout between stacked LSTM layers
        )
        # Additive attention over the LSTM outputs
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1),
            nn.Softmax(dim=1)  # normalize over the sequence dimension
        )
        self.dropout = nn.Dropout(dropout)
        # Two-layer classification head
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text: [batch_size, seq_len]
        embedded = self.embedding(text)          # [batch_size, seq_len, embedding_dim]
        lstm_out, _ = self.lstm(embedded)        # [batch_size, seq_len, hidden_dim*2]
        attn_weights = self.attention(lstm_out)  # [batch_size, seq_len, 1]
        # Attention-weighted sum over time gives a context vector
        context = torch.bmm(attn_weights.transpose(1, 2), lstm_out).squeeze(1)  # [batch_size, hidden_dim*2]
        # ReLU keeps the two linear layers from collapsing into a single affine map
        output = self.dropout(torch.relu(self.fc1(context)))
        output = self.fc2(output)                # [batch_size, output_dim]
        return output
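# Optional shape sanity check (illustrative, kept commented out so it does not
# run during training):
#   m = AttentionLSTM(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, None, LSTM_LAYERS)
#   x = torch.randint(0, vocab_size, (4, 50))   # fake batch: 4 sequences of 50 token ids
#   print(m(x).shape)                           # expected: torch.Size([4, 1])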
# 3. Training setup: optimizer, scheduler, early stopping
def train_model(model_id=0):
    model = AttentionLSTM(
        vocab_size, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, pretrained_embedding, LSTM_LAYERS
    ).to(device)
    # AdamW with weight decay
    optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    # Linear warmup followed by cosine decay; the scheduler steps once per
    # optimizer update, so count updates rather than mini-batches
    total_steps = (len(train_loader) // accumulation_steps) * EPOCHS
    warmup_steps = int(total_steps * 0.15)

    def warmup_cosine(current_step):
        if current_step < warmup_steps:
            return float(current_step) / float(max(1, warmup_steps))
        progress = float(current_step - warmup_steps) / float(max(1, total_steps - warmup_steps))
        return max(0.0, 0.5 * (1.0 + np.cos(np.pi * progress)))

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_cosine)
    # Early stopping on validation loss
    early_stopping = EarlyStopping(patience=7, delta=0.005, verbose=True, path=f'best_model_{model_id}.pt')

    train_losses = []
    val_losses = []
    val_accs = []
    print(f"\nTraining model {model_id + 1}...")
    start_time = time.time()
    for epoch in range(EPOCHS):
        # Training phase
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        optimizer.zero_grad()  # zero once here; zeroing every batch would defeat accumulation
        for i, (texts, labels) in enumerate(train_loader):
            outputs = model(texts).squeeze(1)
            loss = criterion(outputs, labels.float())
            loss = loss / accumulation_steps  # scale for gradient accumulation
            loss.backward()
            # Update once every accumulation_steps mini-batches
            if (i + 1) % accumulation_steps == 0:
                # Clip the accumulated gradients just before the update
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()  # advance the learning-rate schedule
            train_loss += loss.item() * accumulation_steps
            preds = torch.round(torch.sigmoid(outputs))
            train_total += labels.size(0)
            train_correct += (preds == labels).sum().item()
        train_acc = train_correct / train_total
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for texts, labels in val_loader:
                outputs = model(texts).squeeze(1)
                loss = criterion(outputs, labels.float())
                val_loss += loss.item()
                preds = torch.round(torch.sigmoid(outputs))
                val_total += labels.size(0)
                val_correct += (preds == labels).sum().item()
        # Average losses and accuracies
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        val_acc = val_correct / val_total
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        val_accs.append(val_acc)
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch: {epoch + 1}/{EPOCHS}, LR: {current_lr:.7f}, "
              f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}, "
              f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}")
        # Early-stopping check
        early_stopping(avg_val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break
    total_time = time.time() - start_time
    print(f"Model {model_id + 1} trained in {total_time:.2f}s")
    return {
        'model': model,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'val_accs': val_accs,
        'best_val_loss': early_stopping.val_loss_min,
        'best_val_acc': max(val_accs)
    }
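# Worked example of the schedule (illustrative): with total_steps = 1000 and
# warmup_steps = 150, the LR multiplier is 75/150 = 0.5 at step 75, reaches 1.0
# at step 150, is 0.5 * (1 + cos(pi * 0.5)) = 0.5 halfway through the decay
# (step 575), and approaches 0.0 at step 1000.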
# Early stopping on validation loss
class EarlyStopping:
    def __init__(self, patience=5, verbose=False, delta=0.0, path='best_model.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.inf  # np.Inf was removed in NumPy 2.0
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss
# Ensemble prediction: average the sigmoid probabilities across models
def ensemble_predict(models, texts):
    probs = []
    for model in models:
        model.eval()
        with torch.no_grad():
            output = model(texts).squeeze(1)
            probs.append(torch.sigmoid(output))
    avg_prob = torch.mean(torch.stack(probs), dim=0)
    return torch.round(avg_prob)
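# Design note: averaging the sigmoid probabilities (soft voting) preserves each
# model's confidence; rounding each model's output first (hard voting) would
# discard it and can flip close calls.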
# 4. Train several models for the ensemble
criterion = nn.BCEWithLogitsLoss()  # binary cross-entropy on logits, used inside train_model
num_models = 3
models_info = []
for i in range(num_models):
    model_info = train_model(i)
    models_info.append(model_info)

# Reload the best checkpoint of each model for test-set evaluation
ensemble_models = []
for i in range(num_models):
    model = AttentionLSTM(
        vocab_size, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, pretrained_embedding, LSTM_LAYERS
    ).to(device)
    model.load_state_dict(torch.load(f'best_model_{i}.pt', map_location=device))
    ensemble_models.append(model)
# Test-set evaluation
test_correct = 0
test_total = 0
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in test_loader:
        preds = ensemble_predict(ensemble_models, texts)
        test_total += labels.size(0)
        test_correct += (preds == labels).sum().item()
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
test_accuracy = test_correct / test_total
print(f"\nEnsemble test accuracy: {test_accuracy:.4f}")
# 5. Visualization
plt.figure(figsize=(18, 5))
# Loss curves
plt.subplot(1, 3, 1)
for i, info in enumerate(models_info):
    plt.plot(range(1, len(info['train_losses']) + 1), info['train_losses'], f'{i + 1}-', label=f'Train Loss {i + 1}')
    plt.plot(range(1, len(info['val_losses']) + 1), info['val_losses'], f'{i + 1}--', label=f'Val Loss {i + 1}')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
# Accuracy curves
plt.subplot(1, 3, 2)
for i, info in enumerate(models_info):
    plt.plot(range(1, len(info['val_accs']) + 1), info['val_accs'], f'{i + 1}-', label=f'Val Acc {i + 1}')
plt.axhline(y=test_accuracy, color='purple', linestyle='--', label=f'Test Accuracy: {test_accuracy:.4f}')
plt.title('Validation and Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
# Confusion matrix
plt.subplot(1, 3, 3)
cm = confusion_matrix(all_labels, all_preds)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
classes = ['Negative', 'Positive']
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
# Annotate each cell with its count
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.savefig('training_metrics.png')  # save the figure
plt.show()
# Classification report
report = classification_report(all_labels, all_preds, target_names=['Negative', 'Positive'])
print("\nClassification report:")
print(report)
# Single-sample prediction
def predict_sentiment(models, vocab, tokenizer, sentence, max_len=MAX_SEQ_LEN):
    tokens = tokenizer(sentence)
    indexed = [vocab[t] for t in tokens][:max_len]  # truncate to the model's max length
    tensor = torch.tensor(indexed).unsqueeze(0).to(device)  # [1, seq_len]; no padding needed for one sequence
    # Ensemble prediction: average the per-model probabilities
    probs = []
    for model in models:
        model.eval()
        with torch.no_grad():
            output = model(tensor).squeeze(1)
            probs.append(torch.sigmoid(output).item())
    avg_prob = sum(probs) / len(probs)
    return "Positive" if avg_prob > 0.5 else "Negative", avg_prob
# Sample reviews
test_sentences = [
    "This movie is amazing! I love it.",
    "Terrible film, waste of money and time.",
    "The plot was confusing but the visuals were stunning.",
    "I've seen better, but it wasn't terrible.",
    "Absolutely fantastic - one of the best I've seen!",
    "The acting was good, but the story had no depth."
]
print("\nSample predictions:")
for sentence in test_sentences:
    sentiment, prob = predict_sentiment(ensemble_models, vocab, tokenizer, sentence)
    print(f"Review: {sentence}")
    print(f"Sentiment: {sentiment}, confidence: {prob:.4f}")
    print("-" * 50)

# Save the models and the vocabulary
for i, model in enumerate(ensemble_models):
    torch.save(model.state_dict(), f'ensemble_model_{i}.pt')
torch.save(vocab, 'vocab.pth')
print("\nModels and vocabulary saved!")
# Interactive prediction on custom reviews
def predict_custom_review(models):
    print("\nEnter a review (type 'q' to quit):")
    while True:
        review = input("> ")
        if review.lower() == 'q':
            break
        sentiment, prob = predict_sentiment(models, vocab, tokenizer, review)
        print(f"Sentiment: {sentiment}, confidence: {prob:.4f}")
        print("-" * 50)

# Start the interactive loop
predict_custom_review(ensemble_models)
Running this code fails with the following error:

C:\anaconda\envs\py39\python.exe C:\Users\32398\PyCharmMiscProject\深度学习\期末.py
C:\anaconda\envs\py39\lib\site-packages\torchtext\datasets\__init__.py:4: UserWarning:
/!\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\
Torchtext is deprecated and the last released version will be 0.18 (this one). You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()`
  warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)
(the same torchtext warning is repeated for torchtext.data, torchtext.vocab, and torchtext.utils)
C:\anaconda\envs\py39\lib\site-packages\torchdata\datapipes\__init__.py:18: UserWarning:
WARNING!
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
  deprecation_warning()
ImportError: DLL load failed while importing _multiarray_umath: The specified module could not be found.
Traceback (most recent call last):
File "C:\Users\32398\PyCharmMiscProject\深度学习\期末.py", line 13, in <module>
import matplotlib.pyplot as plt
File "C:\anaconda\envs\py39\lib\site-packages\matplotlib\__init__.py", line 161, in <module>
from . import _api, _version, cbook, _docstring, rcsetup
File "C:\anaconda\envs\py39\lib\site-packages\matplotlib\rcsetup.py", line 27, in <module>
from matplotlib.colors import Colormap, is_color_like
File "C:\anaconda\envs\py39\lib\site-packages\matplotlib\colors.py", line 56, in <module>
from matplotlib import _api, _cm, cbook, scale
File "C:\anaconda\envs\py39\lib\site-packages\matplotlib\scale.py", line 22, in <module>
from matplotlib.ticker import (
File "C:\anaconda\envs\py39\lib\site-packages\matplotlib\ticker.py", line 138, in <module>
from matplotlib import transforms as mtransforms
File "C:\anaconda\envs\py39\lib\site-packages\matplotlib\transforms.py", line 49, in <module>
from matplotlib._path import (
ImportError: numpy.core.multiarray failed to import
Process finished with exit code 1
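For reference, the traceback never reaches the training code: the script dies at line 13 while importing matplotlib, which in turn fails to load NumPy's compiled core (`_multiarray_umath`). On Windows this "DLL load failed" error usually points to a broken or binary-incompatible NumPy build in the active environment rather than a bug in the script itself. A plausible fix (an assumption about this particular environment, not verified here) is to reinstall matching builds inside the `py39` environment, e.g. `pip install --force-reinstall numpy matplotlib`, or the equivalent `conda install` if the environment is conda-managed.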