1. Imports:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
print(sys.version_info)
for module in mpl, np, pd, sklearn, torch:
print(module.__name__, module.__version__)
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
2. Prepare the data:
from tensorflow import keras
#Use the IMDB dataset that ships with Keras: classify movie reviews as positive or negative
imdb = keras.datasets.imdb
#The two parameters below control how the data is loaded
vocab_size = 10000 #vocabulary size: keep only the 10000 most frequent words in the training data; rarer words are dropped
index_from = 3 #indices 0, 1, 2, 3 are reserved for special purposes
#Only the 10000 most frequent words are kept for processing; everything else is mapped to a special token.
# Ids below index_from are special tokens, as the code further down shows.
# Note that the word index returned by Keras still starts at 1, so it has to be shifted.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
num_words = vocab_size, index_from = index_from)
`num_words=vocab_size` keeps only the `vocab_size` most frequent words in the dataset (here `vocab_size=10000`); lower-frequency words are replaced by the out-of-vocabulary token.
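As a quick sanity check (a minimal sketch, assuming the cell above has already run so that `train_data` and `train_labels` are in scope), the encoded reviews should only contain ids below `vocab_size`, with ids 0-3 reserved for special tokens:
# Hypothetical sanity check on the loaded ids (not part of the original notebook)
print(train_data[0][:10])  # first ten ids of the first review; the leading 1 is the start-of-sequence marker
print(max(max(seq) for seq in train_data))  # expected to be below vocab_size, since rarer words were mapped to the OOV id 2
print(train_labels[:5])  # labels are 0 (negative) or 1 (positive)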
3. Load the word index:
#Load the word index and check its size; it works like a dictionary from words to integer ids
word_index = imdb.get_word_index()
print(len(word_index))
print(type(word_index))
#The full word index has more than 80,000 entries, but we only loaded the 10,000 most frequent words!
`get_word_index()` is a helper commonly used with the IMDB dataset; it returns the dictionary that maps each word to its integer index.
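For illustration (a small sketch; the specific words are just examples, and their indices depend on the mapping Keras provides), you can peek at a few entries of this dictionary:
# Peek at a handful of (word, index) pairs from the raw Keras word index
for word in ["the", "movie", "excellent"]:
    print(word, word_index.get(word))
# these raw indices start at 1 and do NOT yet include the index_from offset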
4. Build word2idx and idx2word:
word2idx = {word: idx + 3 for word, idx in word_index.items()}
word2idx.update({
"[PAD]": 0, # 填充 token
"[BOS]": 1, # begin of sentence
"[UNK]": 2, # 未知 token
"[EOS]": 3, # end of sentence
})
idx2word = {idx: word for word, idx in word2idx.items()}
The goal here is to build a bidirectional vocabulary mapping (word ↔ index) that includes the special tokens:
- `word_index` comes from IMDB's `get_word_index()`, and its original indices range from 1 to 88587. The `+3` shift reserves the first four indices (0-3) for special tokens, so real words are not overwritten.
- The `update` call then explicitly assigns indices 0-3 to the custom special tokens.
- Reversing the key-value pairs of `word2idx` gives `idx2word`, a fast index → word lookup.
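A quick round trip (a minimal sketch using the `word2idx` and `idx2word` built above) confirms the two dictionaries are consistent:
# Round-trip check: word -> index -> word
for word in ["the", "[PAD]", "[BOS]", "[UNK]", "[EOS]"]:
    idx = word2idx[word]
    print(word, idx, idx2word[idx])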
5. Choose max_length
This step looks at the distribution of review lengths in the IMDB training data and picks a suitable truncation/padding length (MAX_LENGTH).
# choose max_length
length_collect = {}
for text in train_data:
    length = len(text)
    length_collect[length] = length_collect.get(length, 0) + 1
MAX_LENGTH = 500
plt.bar(length_collect.keys(), length_collect.values())
plt.axvline(MAX_LENGTH, label="max length", c="gray", ls=":")
plt.legend()
plt.show()
- For each review, compute its sequence length (number of tokens).
- Record how often each length occurs in the dictionary `length_collect`; for example, if 120 reviews have length 500, then `length_collect[500] = 120`.
- Once all sequences share one length, the data can be stored as a fixed-shape tensor (e.g. `(25000, 500)`), which makes batching efficient.
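To make the choice of 500 more concrete, one can check what fraction of reviews fits within MAX_LENGTH (a small sketch, assuming `train_data` and `MAX_LENGTH` from the cells above):
import numpy as np

# fraction of training reviews that are not truncated at MAX_LENGTH
lengths = np.array([len(text) for text in train_data])
print(f"median length: {np.median(lengths):.0f}")
print(f"covered by MAX_LENGTH={MAX_LENGTH}: {(lengths <= MAX_LENGTH).mean():.2%}")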
6. Build the Tokenizer
This implements a small custom tokenizer that converts raw token lists into index sequences (encoding) and converts index sequences back into text (decoding).
class Tokenizer:
def __init__(self, word2idx, idx2word, max_length=500, pad_idx=0, bos_idx=1, eos_idx=3, unk_idx=2):
self.word2idx = word2idx
self.idx2word = idx2word
self.max_length = max_length
self.pad_idx = pad_idx
self.bos_idx = bos_idx
self.eos_idx = eos_idx
self.unk_idx = unk_idx
def encode(self, text_list, padding_first=False):
"""如果padding_first == True,则padding加载前面,否则加载后面"""
max_length = min(self.max_length, 2 + max([len(text) for text in text_list]))
indices_list = []
for text in text_list:
            indices = [self.bos_idx] + [self.word2idx.get(word, self.unk_idx) for word in text[:max_length-2]] + [self.eos_idx]  # take the first max_length-2 tokens, then wrap them with [BOS] and [EOS]
if padding_first:
indices = [self.pad_idx] * (max_length - len(indices)) + indices
else:
indices = indices + [self.pad_idx] * (max_length - len(indices))
indices_list.append(indices)
return torch.tensor(indices_list)
def decode(self, indices_list, remove_bos=True, remove_eos=True, remove_pad=True, split=False):
text_list = []
for indices in indices_list:
text = []
for index in indices:
word = self.idx2word.get(index, "[UNK]")
if remove_bos and word == "[BOS]":
continue
if remove_eos and word == "[EOS]":
break
if remove_pad and word == "[PAD]":
break
text.append(word)
text_list.append(" ".join(text) if not split else text)
return text_list
tokenizer = Tokenizer(word2idx=word2idx, idx2word=idx2word)
raw_text = ["hello world".split(), "tokenize text datas with batch".split(), "this is a test".split()]
indices = tokenizer.encode(raw_text, padding_first=True)
decode_text = tokenizer.decode(indices.tolist(), remove_bos=False, remove_eos=False, remove_pad=False)
print("raw text")
for raw in raw_text:
print(raw)
print("indices")
for index in indices:
print(index)
print("decode text")
for decode in decode_text:
print(decode)
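For comparison (a minimal sketch reusing the `tokenizer` and `raw_text` defined above), encoding the same batch with `padding_first=False` puts the `[PAD]` tokens at the end instead of the front:
# Same batch, padded at the end of each sequence instead of the front
indices_back = tokenizer.encode(raw_text, padding_first=False)
print(indices_back)
# each row now starts with [BOS]=1 and ends with a run of [PAD]=0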
7. Dataset and DataLoader:
Build a PyTorch data pipeline for the text-classification task.
from torch.utils.data import Dataset, DataLoader
class IMDBDataset(Dataset):
def __init__(self, data, labels, remain_length=True):
if remain_length:
self.data = tokenizer.decode(data, remove_bos=False, remove_eos=False, remove_pad=False)
else:
            # shrink the data: the default decode drops the special tokens
self.data = tokenizer.decode(data)
self.labels = labels
def __getitem__(self, index):
text = self.data[index]
label = self.labels[index]
return text, label
def __len__(self):
return len(self.data)
def collate_fct(batch):
text_list = [item[0].split() for item in batch]
label_list = [item[1] for item in batch]
    # pad at the front of each sequence (padding first)
text_list = tokenizer.encode(text_list, padding_first=True).to(dtype=torch.int)
return text_list, torch.tensor(label_list).reshape(-1, 1).to(dtype=torch.float)
# a plain RNN is used, so keep the sequences short
train_ds = IMDBDataset(train_data, train_labels, remain_length=False)
test_ds = IMDBDataset(test_data, test_labels, remain_length=False)
The custom collate function is the core of this PyTorch data pipeline: it turns each batch of variable-length texts into a padded index tensor on the fly. Padding is placed at the front of each sequence, which suits the RNN's sequential processing: the last time step then holds real review content rather than padding, so reading the final output wastes no computation. In each batch item, `item[0]` is the text and `item[1]` is the label; see the sketch after the DataLoader setup below.
8. Use PyTorch's DataLoader to build the training and test pipelines
batch_size = 128
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fct)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fct)
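As a sanity check (a minimal sketch, assuming `train_dl` from the cell above), grab one batch and confirm its shape and that the padding really sits at the front:
# Inspect one batch produced by collate_fct
batch_texts, batch_labels = next(iter(train_dl))
print(batch_texts.shape, batch_texts.dtype)    # e.g. torch.Size([128, seq_len]) torch.int32
print(batch_labels.shape, batch_labels.dtype)  # torch.Size([128, 1]) torch.float32
# shorter reviews in the batch are front-padded, so their rows start with a run of 0 ([PAD])
print(batch_texts[0, :10])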
9. Define the model
class RNN(nn.Module):
    def __init__(self, embedding_dim=16, hidden_dim=64, vocab_size=vocab_size, num_layers=1, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # bidirectional=True makes the RNN read the sequence in both directions
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        self.layer = nn.Linear(hidden_dim * (2 if bidirectional else 1), hidden_dim)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # x: [bs, seq_length]
        x = self.embedding(x)
        # [bs, seq_length, embedding_dim] -> seq_output: [bs, seq_length, hidden_dim * num_directions]
        seq_output, final_hidden = self.rnn(x)
        # print(f'seq_output.shape={seq_output.shape}')      # [bs, seq_length, hidden_dim * num_directions]
        # print(f'final_hidden.shape={final_hidden.shape}')  # [num_layers * num_directions, bs, hidden_dim]
        # Take the output of the last time step (this is why padding_first=True is used: the last step holds real data).
        x = seq_output[:, -1, :]
        # print(f'x.shape={x.shape}')
        # For a one-layer unidirectional RNN this equals the final hidden state (squeeze the size-1 layer axis and compare):
        # assert torch.all(final_hidden.squeeze(0) == x)
        x = self.layer(x)
        x = self.fc(x)
        return x
print("{:=^80}".format(" 一层单向 RNN "))
for key, value in RNN().named_parameters():
print(f"{key:^40}paramerters num: {np.prod(value.shape)}")
Model structure: embedding layer → RNN layer → linear layer → output layer.
- Constructor arguments:
  - `embedding_dim=16`: word-embedding dimension
  - `hidden_dim=64`: RNN hidden-state dimension
  - `vocab_size`: vocabulary size (passed in from outside)
  - `num_layers=1`: number of stacked RNN layers
  - `bidirectional=False`: whether the RNN is bidirectional
- Embedding layer: maps each discrete word index to a continuous vector x ∈ R^embedding_dim.
- RNN layer: processes the sequence; with a bidirectional RNN the output dimension doubles.
- Linear layer: maps hidden_dim × 2 → hidden_dim (needed in the bidirectional case).
- Output layer: maps to a single scalar (suitable for binary classification / regression).
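The shape flow can be checked directly (a minimal sketch with a made-up dummy batch; the sizes follow the defaults above):
# Dummy batch of 4 "reviews", each 20 random token ids long
dummy = torch.randint(0, vocab_size, (4, 20))
demo_model = RNN(bidirectional=True)
with torch.no_grad():
    out = demo_model(dummy)
print(out.shape)  # expected: torch.Size([4, 1]), one logit per review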
10. Evaluation function
With gradient computation disabled, iterate over the dataset, compute predictions, and report the mean loss and the classification accuracy.
from sklearn.metrics import accuracy_score
@torch.no_grad()
def evaluating(model, dataloader, loss_fct):
loss_list = []
pred_list = []
label_list = []
for datas, labels in dataloader:
datas = datas.to(device)
labels = labels.to(device)
        # forward pass
logits = model(datas)
        loss = loss_fct(logits, labels)  # validation loss
loss_list.append(loss.item())
        # binary classification: a positive logit means the positive class
preds = logits > 0
pred_list.extend(preds.cpu().numpy().tolist())
label_list.extend(labels.cpu().numpy().tolist())
acc = accuracy_score(label_list, pred_list)
return np.mean(loss_list), acc
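The decision rule `logits > 0` is the same as thresholding the sigmoid probability at 0.5, which a tiny standalone check illustrates:
# For any logit z, sigmoid(z) > 0.5 exactly when z > 0
z = torch.tensor([-2.0, -0.1, 0.0, 0.3, 5.0])
print(torch.sigmoid(z))                      # probabilities
print((torch.sigmoid(z) > 0.5) == (z > 0))   # element-wise True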
11. TensorBoard visualization (optional)
from torch.utils.tensorboard import SummaryWriter
class TensorBoardCallback:
def __init__(self, log_dir, flush_secs=10):
"""
Args:
log_dir (str): dir to write log.
            flush_secs (int, optional): write to disk every flush_secs seconds. Defaults to 10.
"""
self.writer = SummaryWriter(log_dir=log_dir, flush_secs=flush_secs)
def draw_model(self, model, input_shape):
self.writer.add_graph(model, input_to_model=torch.randn(input_shape))
def add_loss_scalars(self, step, loss, val_loss):
self.writer.add_scalars(
main_tag="training/loss",
tag_scalar_dict={"loss": loss, "val_loss": val_loss},
global_step=step,
)
def add_acc_scalars(self, step, acc, val_acc):
self.writer.add_scalars(
main_tag="training/accuracy",
tag_scalar_dict={"accuracy": acc, "val_accuracy": val_acc},
global_step=step,
)
def add_lr_scalars(self, step, learning_rate):
self.writer.add_scalars(
main_tag="training/learning_rate",
tag_scalar_dict={"learning_rate": learning_rate},
global_step=step,
)
def __call__(self, step, **kwargs):
# add loss
loss = kwargs.pop("loss", None)
val_loss = kwargs.pop("val_loss", None)
if loss is not None and val_loss is not None:
self.add_loss_scalars(step, loss, val_loss)
# add acc
acc = kwargs.pop("acc", None)
val_acc = kwargs.pop("val_acc", None)
if acc is not None and val_acc is not None:
self.add_acc_scalars(step, acc, val_acc)
# add lr
learning_rate = kwargs.pop("lr", None)
if learning_rate is not None:
self.add_lr_scalars(step, learning_rate)
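Usage is straightforward (a minimal sketch; `runs/demo` is a hypothetical log directory): create the callback once, then call it with whichever metrics you want to log.
# Hypothetical usage of the TensorBoard callback
tb_demo = TensorBoardCallback("runs/demo")
tb_demo(step=0, loss=0.70, val_loss=0.69, acc=0.55, val_acc=0.54, lr=1e-3)
# then inspect the curves with:  tensorboard --logdir runs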
12. Save the best model
class SaveCheckpointsCallback:
def __init__(self, save_dir, save_step=5000, save_best_only=True):
"""
Save checkpoints each save_epoch epoch.
We save checkpoint by epoch in this implementation.
Usually, training scripts with pytorch evaluating model and save checkpoint by step.
Args:
save_dir (str): dir to save checkpoint
save_epoch (int, optional): the frequency to save checkpoint. Defaults to 1.
save_best_only (bool, optional): If True, only save the best model or save each model at every epoch.
"""
self.save_dir = save_dir
self.save_step = save_step
self.save_best_only = save_best_only
self.best_metrics = -1
# mkdir
if not os.path.exists(self.save_dir):
os.mkdir(self.save_dir)
def __call__(self, step, state_dict, metric=None):
if step % self.save_step > 0:
return
if self.save_best_only:
assert metric is not None
if metric >= self.best_metrics:
# save checkpoints
torch.save(state_dict, os.path.join(self.save_dir, "best.ckpt"))
# update best metrics
self.best_metrics = metric
else:
torch.save(state_dict, os.path.join(self.save_dir, f"{step}.ckpt"))
Saves the model at the configured step interval; with save_best_only=True only the checkpoint with the best metric so far is kept.
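For example (a small sketch with made-up values; `demo_checkpoints` is a hypothetical directory), the callback only writes when the step is a multiple of save_step and, in best-only mode, when the metric improves:
# Hypothetical usage of the checkpoint callback
demo_ckpt = SaveCheckpointsCallback("demo_checkpoints", save_step=100, save_best_only=True)
demo_ckpt(step=100, state_dict={"w": torch.tensor(1.0)}, metric=0.80)  # saved: step hits the interval and 0.80 beats the previous best
demo_ckpt(step=150, state_dict={"w": torch.tensor(1.0)}, metric=0.90)  # skipped: 150 is not a multiple of 100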
13. Early stopping
class EarlyStopCallback:
def __init__(self, patience=5, min_delta=0.01):
"""
Args:
            patience (int, optional): number of evaluations (epochs in this notebook) with no improvement after which training will be stopped. Defaults to 5.
            min_delta (float, optional): minimum change in the monitored quantity to qualify as an improvement; an absolute
                change of less than min_delta counts as no improvement. Defaults to 0.01.
"""
self.patience = patience
self.min_delta = min_delta
self.best_metric = -1
self.counter = 0
def __call__(self, metric):
if metric >= self.best_metric + self.min_delta:
# update best metric
self.best_metric = metric
# reset counter
self.counter = 0
else:
self.counter += 1
@property
def early_stop(self):
return self.counter >= self.patience
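The counter logic can be seen with a toy metric sequence (a standalone sketch using the class above):
# Toy example: validation accuracy stalls after the second evaluation
early_stop_demo = EarlyStopCallback(patience=3, min_delta=0.01)
for val_acc in [0.70, 0.75, 0.755, 0.756, 0.754]:
    early_stop_demo(val_acc)
    print(f"val_acc={val_acc:.3f}  counter={early_stop_demo.counter}  early_stop={early_stop_demo.early_stop}")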
14. Training
##############################
# Training function
##############################
def training(
    model,                        # the model to train
    train_loader,                 # training DataLoader
    val_loader,                   # validation DataLoader
    epoch,                        # total number of epochs
    loss_fct,                     # loss function (e.g. binary cross-entropy)
    optimizer,                    # optimizer (e.g. Adam)
    tensorboard_callback=None,    # TensorBoard logging callback
    save_ckpt_callback=None,      # checkpoint-saving callback
    early_stop_callback=None,     # early-stopping callback
    eval_step=500,                # run validation every eval_step training steps
):
    # training record (training/validation loss and accuracy per step)
    record_dict = {"train": [], "val": []}

    global_step = 0  # global step counter
    model.train()    # switch the model to training mode

    # progress bar (total steps = epochs * batches per epoch)
    with tqdm(total=epoch * len(train_loader)) as pbar:
        # loop over epochs
        for epoch_id in range(epoch):
            # loop over batches
            for datas, labels in train_loader:
                # move the batch to the target device (GPU/CPU)
                datas = datas.to(device)
                labels = labels.to(device)
                # reset gradients (avoid accumulation across steps)
                optimizer.zero_grad()
                # forward pass
                logits = model(datas)
                # compute the loss
                loss = loss_fct(logits, labels)
                # backward pass (compute gradients)
                loss.backward()
                # update the parameters (one optimizer step)
                optimizer.step()
                # binary predictions
                preds = logits > 0
                # training accuracy on this batch
                acc = accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
                loss = loss.cpu().item()
                # record training metrics
                record_dict["train"].append({"loss": loss, "acc": acc, "step": global_step})

                ##############################
                # Evaluation (every eval_step steps)
                ##############################
                if global_step % eval_step == 0:
                    model.eval()  # switch to evaluation mode
                    # run validation
                    val_loss, val_acc = evaluating(model, val_loader, loss_fct)
                    # record validation metrics
                    record_dict["val"].append({"loss": val_loss, "acc": val_acc, "step": global_step})
                    model.train()  # back to training mode

                    # callbacks
                    # 1. write scalars to TensorBoard
                    if tensorboard_callback is not None:
                        tensorboard_callback(
                            global_step,
                            loss=loss, val_loss=val_loss,
                            acc=acc, val_acc=val_acc,
                            lr=optimizer.param_groups[0]["lr"],  # current learning rate
                        )
                    # 2. save model weights
                    if save_ckpt_callback is not None:
                        save_ckpt_callback(global_step, model.state_dict(), metric=val_acc)
                    # 3. early-stopping check
                    if early_stop_callback is not None:
                        early_stop_callback(val_acc)
                        if early_stop_callback.early_stop:
                            print(f"Early stop at epoch {epoch_id} / global_step {global_step}")
                            return record_dict  # stop training early

                # advance the global step counter
                global_step += 1
                # update the progress bar
                pbar.update(1)
                pbar.set_postfix({"epoch": epoch_id})

    return record_dict  # full training record
##############################
# Training configuration
##############################
epoch = 20  # total number of epochs

# model: bidirectional RNN
model = RNN(bidirectional=True)

# loss: binary cross-entropy with logits (expects the raw, pre-sigmoid output)
loss_fct = F.binary_cross_entropy_with_logits
# optimizer: Adam with learning rate 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

##############################
# Callbacks
##############################
# 1. TensorBoard callback (log dir: runs/imdb-rnn)
if not os.path.exists("runs"):
    os.mkdir("runs")
tensorboard_callback = TensorBoardCallback("runs/imdb-rnn")

# 2. checkpoint callback (save dir: checkpoints/imdb-rnn)
#    save once per epoch (save_step=len(train_dl)) and keep only the best model
if not os.path.exists("checkpoints"):
    os.makedirs("checkpoints")
save_ckpt_callback = SaveCheckpointsCallback(
    "checkpoints/imdb-rnn",
    save_step=len(train_dl),
    save_best_only=True,
)

# 3. early stopping (stop after 10 evaluations without improvement)
early_stop_callback = EarlyStopCallback(patience=10)

##############################
# Run training
##############################
model = model.to(device)  # move the model to the target device (GPU/CPU)
record = training(
    model,
    train_dl,
    test_dl,
    epoch,
    loss_fct,
    optimizer,
    tensorboard_callback=None,  # TensorBoard logging disabled here
    save_ckpt_callback=save_ckpt_callback,
    early_stop_callback=early_stop_callback,
    eval_step=len(train_dl),  # validate once per epoch
)
15. Plotting
#note that the loss is not necessarily bounded between 0 and 1
def plot_learning_curves(record_dict, sample_step=500):
# build DataFrame
train_df = pd.DataFrame(record_dict["train"]).set_index("step").iloc[::sample_step]
val_df = pd.DataFrame(record_dict["val"]).set_index("step")
# plot
fig_num = len(train_df.columns)
fig, axs = plt.subplots(1, fig_num, figsize=(5 * fig_num, 5))
for idx, item in enumerate(train_df.columns):
axs[idx].plot(train_df.index, train_df[item], label=f"train_{item}")
axs[idx].plot(val_df.index, val_df[item], label=f"val_{item}")
axs[idx].grid()
axs[idx].legend()
# axs[idx].set_xticks(range(0, train_df.index[-1], 5000))
# axs[idx].set_xticklabels(map(lambda x: f"{int(x/1000)}k", range(0, train_df.index[-1], 5000)))
axs[idx].set_xlabel("step")
plt.show()
plot_learning_curves(record, sample_step=10)  # the x-axis is the training step
16. Evaluation
# dataload for evaluating
# load checkpoints
model.load_state_dict(torch.load("checkpoints/imdb-rnn/best.ckpt", weights_only=True, map_location="cpu"))
model.eval()
loss, acc = evaluating(model, test_dl, loss_fct)
print(f"loss: {loss:.4f}\naccuracy: {acc:.4f}")