基于PyTorch的LSTM情感分析实战教程
pytorch-sentiment-analysis 项目地址: https://gitcode.com/gh_mirrors/py/pytorch-sentiment-analysis
本教程将详细介绍如何使用PyTorch构建一个双向LSTM模型来完成IMDB电影评论的情感分析任务。我们将从数据预处理开始,逐步讲解模型构建、训练和评估的全过程。
环境准备与数据加载
首先需要导入必要的Python库:
import collections
import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
为了确保实验的可重复性,我们设置了随机种子:
# Seed every random number generator used by the tutorial so that repeated
# runs start from the same state.
seed = 1234
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True
数据预处理
加载IMDB数据集
IMDB数据集包含50,000条电影评论,每条评论被标记为正面或负面评价:
# Load the "train" and "test" splits of the "imdb" dataset through the
# `datasets` library.
train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])
文本分词与处理
使用基础英文分词器对文本进行处理:
# torchtext's "basic_english" tokenizer; it is passed to tokenize_example
# below to split each review's text into tokens.
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")
def tokenize_example(example, tokenizer, max_length):
    """Tokenize one example's text and truncate it to ``max_length`` tokens.

    Args:
        example: Mapping with a ``"text"`` entry holding the raw string.
        tokenizer: Callable that turns a string into a sequence of tokens.
        max_length: Maximum number of tokens to keep.

    Returns:
        Dict with the kept tokens under ``"tokens"`` and their count under
        ``"length"``.
    """
    truncated = tokenizer(example["text"])[:max_length]
    return {"tokens": truncated, "length": len(truncated)}
# Tokenize both splits with the same tokenizer and the same truncation length.
max_length = 256
tokenize_kwargs = {"tokenizer": tokenizer, "max_length": max_length}
train_data = train_data.map(tokenize_example, fn_kwargs=tokenize_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=tokenize_kwargs)
数据集划分
将训练集进一步划分为训练集和验证集:
# Hold out a fraction of the training split as validation data.
test_size = 0.25
train_valid_data = train_data.train_test_split(test_size=test_size)
train_data, valid_data = train_valid_data["train"], train_valid_data["test"]
构建词汇表
# Build the vocabulary from the training tokens only; tokens seen fewer than
# `min_freq` times are left out.
min_freq = 5
special_tokens = ["<unk>", "<pad>"]
vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)
unk_index, pad_index = vocab["<unk>"], vocab["<pad>"]
# Any token missing from the vocabulary is looked up as <unk>.
vocab.set_default_index(unk_index)
文本数值化
将分词后的文本转换为数字索引:
def numericalize_example(example, vocab):
    """Convert an example's tokens into vocabulary indices.

    Args:
        example: Mapping with a ``"tokens"`` entry.
        vocab: Object exposing ``lookup_indices(tokens)``.

    Returns:
        Dict holding the looked-up indices under ``"ids"``.
    """
    return {"ids": vocab.lookup_indices(example["tokens"])}
# Numericalize all three splits with the vocabulary built from the training data.
numericalize_kwargs = {"vocab": vocab}
train_data = train_data.map(numericalize_example, fn_kwargs=numericalize_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=numericalize_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=numericalize_kwargs)
数据加载器
创建数据加载器以便批量处理数据:
def get_collate_fn(pad_index):
    """Build a ``collate_fn`` that pads a batch of examples to equal length.

    Args:
        pad_index: Value written into positions past the end of the shorter
            sequences.

    Returns:
        A function mapping a list of examples (each providing ``"ids"``,
        ``"length"`` and ``"label"``) to a dict of batched tensors under the
        same three keys.
    """

    def collate_fn(batch):
        # ``pad_sequence`` and ``torch.stack`` only accept tensors.
        # ``torch.as_tensor`` leaves tensors untouched but also converts the
        # plain Python lists/ints produced by a dataset that has not been
        # switched to a tensor format, so both kinds of input work.
        batch_ids = nn.utils.rnn.pad_sequence(
            [torch.as_tensor(example["ids"]) for example in batch],
            padding_value=pad_index,
            batch_first=True,
        )
        batch_length = torch.stack(
            [torch.as_tensor(example["length"]) for example in batch]
        )
        batch_label = torch.stack(
            [torch.as_tensor(example["label"]) for example in batch]
        )
        return {"ids": batch_ids, "length": batch_length, "label": batch_label}

    return collate_fn
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a ``DataLoader`` that pads each batch.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of examples per batch.
        pad_index: Padding value handed to ``get_collate_fn``.
        shuffle: Whether the loader shuffles the examples.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )
# Only the training loader shuffles; validation and test keep dataset order.
batch_size = 512
train_data_loader = get_data_loader(
    dataset=train_data, batch_size=batch_size, pad_index=pad_index, shuffle=True
)
valid_data_loader = get_data_loader(
    dataset=valid_data, batch_size=batch_size, pad_index=pad_index
)
test_data_loader = get_data_loader(
    dataset=test_data, batch_size=batch_size, pad_index=pad_index
)
模型构建
双向LSTM模型
我们构建一个双向LSTM模型来处理变长文本序列:
class LSTM(nn.Module):
    """LSTM text classifier over padded batches of token ids.

    Token ids are embedded, packed according to their true lengths, run
    through an (optionally bidirectional, optionally stacked) LSTM, and the
    final hidden state of the last layer is projected to ``output_dim``
    scores.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout_rate,
        pad_index,
    ):
        """Create the embedding, LSTM, output projection and dropout layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Size of the LSTM hidden state per direction.
            output_dim: Number of output scores per example.
            n_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM runs in both directions.
            dropout_rate: Dropout probability for the embeddings, between
                LSTM layers, and before the output projection.
            pad_index: Embedding index reserved for padding.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            n_layers,
            bidirectional=bidirectional,
            # nn.LSTM applies dropout only between stacked layers; with a
            # single layer a non-zero value has no effect and only triggers
            # a UserWarning, so it is disabled in that case.
            dropout=dropout_rate if n_layers > 1 else 0.0,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        """Compute class scores for a padded batch.

        Args:
            ids: Integer tensor of token ids, batch dimension first.
            length: Number of real (non-padding) tokens in each sequence.

        Returns:
            Tensor with one row of ``output_dim`` scores per example.
        """
        embedded = self.dropout(self.embedding(ids))
        # pack_padded_sequence requires the lengths to live on the CPU even
        # when the embeddings are on an accelerator.
        if isinstance(length, torch.Tensor):
            length = length.cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, length, batch_first=True, enforce_sorted=False
        )
        # Only the final hidden state is needed; the per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.lstm(packed_embedded)
        if self.lstm.bidirectional:
            # The last two entries of the hidden state belong to the top
            # layer's two directions; concatenate them feature-wise.
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
        else:
            hidden = self.dropout(hidden[-1])
        return self.fc(hidden)
模型参数初始化
# Hyper-parameters; the vocabulary size and the number of classes are read
# from the data instead of being hard-coded.
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 300
output_dim = len(train_data.unique("label"))
n_layers = 2
bidirectional = True
dropout_rate = 0.5
model = LSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout_rate=dropout_rate,
    pad_index=pad_index,
)
使用预训练词向量
# Load GloVe vectors using torchtext's default settings.
vectors = torchtext.vocab.GloVe()
# Look up one vector per vocabulary token, in the order given by get_itos().
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())
# Replace the embedding layer's weights with the pretrained vectors.
# NOTE(review): assumes the GloVe vector size equals embedding_dim — confirm.
model.embedding.weight.data = pretrained_embedding
模型训练与评估
训练设置
# Optimizer, loss and device; CUDA is used when available, otherwise the CPU.
lr = 5e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = optim.Adam(model.parameters(), lr=lr)
model = model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
训练与评估函数
def train(dataloader, model, criterion, optimizer, device):
    """Run one training pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches with ``"ids"``, ``"length"`` and
            ``"label"`` entries.
        model: Module called as ``model(ids, length)``.
        criterion: Loss called as ``criterion(prediction, label)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the ids and labels are moved to.

    Returns:
        Tuple of (mean batch loss, mean batch accuracy).
    """
    model.train()
    losses, accuracies = [], []
    for batch in tqdm.tqdm(dataloader, desc="training..."):
        ids, label = batch["ids"].to(device), batch["label"].to(device)
        # The lengths are handed to the model without a device transfer.
        prediction = model(ids, batch["length"])
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        # Clear the previous gradients before back-propagating this batch.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        accuracies.append(accuracy.item())
    return np.mean(losses), np.mean(accuracies)
def evaluate(dataloader, model, criterion, device):
    """Measure loss and accuracy over ``dataloader`` without updating weights.

    Args:
        dataloader: Iterable of batches with ``"ids"``, ``"length"`` and
            ``"label"`` entries.
        model: Module called as ``model(ids, length)``.
        criterion: Loss called as ``criterion(prediction, label)``.
        device: Device the ids and labels are moved to.

    Returns:
        Tuple of (mean batch loss, mean batch accuracy).
    """
    model.eval()
    losses, accuracies = [], []
    # Gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc="evaluating..."):
            ids, label = batch["ids"].to(device), batch["label"].to(device)
            prediction = model(ids, batch["length"])
            losses.append(criterion(prediction, label).item())
            accuracies.append(get_accuracy(prediction, label).item())
    return np.mean(losses), np.mean(accuracies)
def get_accuracy(prediction, label):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        prediction: Two-dimensional tensor of per-class scores, one row per
            example.
        label: Tensor of true class indices, one per example.

    Returns:
        Tensor holding the accuracy over the batch.
    """
    n_examples, _ = prediction.shape
    n_correct = (prediction.argmax(dim=-1) == label).sum()
    return n_correct / n_examples
训练过程
经过几轮训练后,模型在验证集上达到了约78%的准确率:
epoch: 0
train_loss: 0.619, train_acc: 0.645
valid_loss: 0.525, valid_acc: 0.738
epoch: 1
train_loss: 0.524, train_acc: 0.751
valid_loss: 0.735, valid_acc: 0.743
epoch: 2
train_loss: 0.501, train_acc: 0.760
valid_loss: 0.507, valid_acc: 0.783
总结
本教程展示了如何使用PyTorch构建一个双向LSTM模型进行情感分析任务。关键点包括:
- 使用pack_padded_sequence处理变长序列,提高计算效率
- 采用双向LSTM捕获文本的上下文信息
- 使用预训练词向量提升模型性能
- 实现了完整的训练和评估流程
通过这个实例,读者可以掌握基于RNN的文本分类任务的基本流程和方法。
pytorch-sentiment-analysis 项目地址: https://gitcode.com/gh_mirrors/py/pytorch-sentiment-analysis
创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考