Implementing a Seq2Seq Machine Translation Model with skorch
skorch project repository: https://gitcode.com/gh_mirrors/sko/skorch
This article walks through building a complete Seq2Seq machine translation model with the skorch framework. It is trained on an English-French parallel corpus and, following the PyTorch Seq2Seq tutorial the code is based on (which loads the pairs with reverse=True), translates French sentences into English. skorch is used to make model training and management more concise than the plain PyTorch version.
Environment Setup and Data Loading
First, make sure the following environment is available:
- Python 3.x
- PyTorch
- skorch
- NumPy
- scikit-learn (used for the accuracy metric)
- Matplotlib (for visualization)
For the data, you need to:
- create a data directory
- download the English-French parallel corpus
- unzip it so that the file ends up at data/eng-fra.txt (one way to script these steps is sketched right after this list)
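If you prefer to script the preparation, the following sketch downloads and unpacks the archive used by the PyTorch translation tutorial. The URL and archive layout are assumptions taken from that tutorial, so adjust them if they have changed:

import os
import urllib.request
import zipfile

os.makedirs('data', exist_ok=True)
# archive from the PyTorch seq2seq tutorial (assumed); it should already contain data/eng-fra.txt
urllib.request.urlretrieve('https://download.pytorch.org/tutorial/data.zip', 'data.zip')
with zipfile.ZipFile('data.zip') as zf:
    zf.extractall('.')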
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import sklearn.metrics
import skorch
from skorch.utils import params_for
from torch.autograd import Variable
The Data Processing Module
We use a dedicated data-processing module to prepare the training data:
import data

# reverse=True flips each pair so that French is the input language and English the output
input_lang, output_lang, pairs = data.prepareData('eng', 'fra', True)
Once the data has been processed, we can inspect a randomly chosen sentence pair:
print(random.choice(pairs))
# example output: ['je suis ravie .', 'i m thrilled .']
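The returned Lang objects also carry the vocabulary sizes, which we will pass to the encoder and decoder further below:

print(input_lang.n_words, output_lang.n_words)
# these values feed module__encoder__input_size and module__decoder__output_size in the Trainer setup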
Model Architecture Design
Encoder
The encoder reads the input sequence one token at a time and compresses it into a fixed-dimensional context vector:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        for i in range(self.n_layers):
            output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)
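As a quick sanity check (not part of the original walkthrough), a single encoder step can be run like this; the token index 0 is just a placeholder:

encoder = EncoderRNN(input_size=input_lang.n_words, hidden_size=256)
hidden = encoder.initHidden()
token = torch.tensor([0])                  # placeholder word index
output, hidden = encoder(token, hidden)
print(output.shape, hidden.shape)          # torch.Size([1, 1, 256]) torch.Size([1, 1, 256])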
Decoder with Attention
The decoder uses an attention mechanism over the encoder outputs to generate the target-language sequence:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1,
                 max_length=data.MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_output, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=-1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        for i in range(self.n_layers):
            output = F.relu(output)
            output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=-1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)
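Continuing the sketch from the encoder section, one decoder step consumes the previous hidden state together with the (here zero-filled) matrix of encoder outputs and returns log-probabilities over the target vocabulary plus the attention weights:

decoder = AttnDecoderRNN(hidden_size=256, output_size=output_lang.n_words)
encoder_outputs = torch.zeros(data.MAX_LENGTH, 256)   # normally filled by the encoder loop
decoder_input = torch.tensor([[data.SOS_token]])
log_probs, hidden, attn_weights = decoder(decoder_input, hidden, output, encoder_outputs)
print(log_probs.shape, attn_weights.shape)            # (1, output_size) and (1, MAX_LENGTH)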
The Combined Seq2Seq Model
The encoder and decoder are combined into a single, complete Seq2Seq module:
class Seq2Seq(nn.Module):
    def __init__(
            self,
            encoder,
            decoder,
            teacher_forcing_ratio=0.5,
            hidden_size=256,
            max_length=data.MAX_LENGTH,
            **kwargs
    ):
        super().__init__()
        # params_for extracts the keyword arguments prefixed with 'encoder__' / 'decoder__'
        self.encoder = encoder(hidden_size=hidden_size, **params_for('encoder', kwargs))
        self.decoder = decoder(hidden_size=hidden_size, **params_for('decoder', kwargs))
        self.max_length = max_length
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def forward(self, x, y=None):
        # encoding phase: run the encoder token by token and collect its outputs
        encoder_hidden = self.encoder.initHidden().to(x.device)
        encoder_outputs = torch.zeros(self.max_length, self.encoder.hidden_size).to(x.device)
        for ei in range(x.size(1)):
            encoder_output, encoder_hidden = self.encoder(x[0, ei], encoder_hidden)
            encoder_outputs[ei] = encoder_output[0][0]

        # decoding phase
        use_teacher_forcing = y is not None and random.random() < self.teacher_forcing_ratio
        ys = []
        target_length = self.max_length if y is None else y.size(1)
        target_variable = y
        decoder_input = torch.LongTensor([[data.SOS_token]]).to(x)
        decoder_hidden = encoder_hidden

        if use_teacher_forcing:
            # teacher forcing: feed the ground-truth token as the next decoder input
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = self.decoder(
                    decoder_input, decoder_hidden, encoder_output, encoder_outputs)
                ys.append(decoder_output)
                decoder_input = target_variable[0, di]
        else:
            # no teacher forcing: feed the decoder its own most probable prediction
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = self.decoder(
                    decoder_input, decoder_hidden, encoder_output, encoder_outputs)
                topv, topi = decoder_output.data.topk(1)
                ni = topi[0][0]
                decoder_input = torch.LongTensor([[ni]]).to(x)
                ys.append(decoder_output)
                if ni == data.EOS_token:
                    break

        return torch.stack(ys, dim=1)
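The params_for helper used in the constructor above is what routes prefixed keyword arguments to the right sub-module: it keeps the entries whose keys start with the given prefix and strips that prefix off. A small, self-contained illustration (the values are arbitrary):

example = {'encoder__input_size': 10, 'decoder__output_size': 20, 'decoder__dropout_p': 0.1}
print(params_for('encoder', example))   # {'input_size': 10}
print(params_for('decoder', example))   # {'output_size': 20, 'dropout_p': 0.1}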
Implementing the Training Pipeline
Helper Functions
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def variableFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(data.EOS_token)
    result = Variable(torch.LongTensor(indexes).view(-1, 1))
    # use_cuda is a module-level flag set in the training configuration below
    return result.to('cuda' if use_cuda else 'cpu')


def variablesFromPair(pair):
    input_variable = variableFromSentence(input_lang, pair[0])
    target_variable = variableFromSentence(output_lang, pair[1])
    return (input_variable, target_variable)
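A quick usage example (use_cuda is also set in the training configuration below; it is defined here only so the snippet is self-contained):

use_cuda = torch.cuda.is_available()
src, tgt = variablesFromPair(random.choice(pairs))
print(src.shape, tgt.shape)   # (source_len, 1) and (target_len, 1); both end with the EOS token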
A Custom Trainer Class
We extend skorch's NeuralNet class to implement the training logic:
class Trainer(skorch.NeuralNet):
    def __init__(
            self,
            *args,
            optimizer_encoder=torch.optim.SGD,
            optimizer_decoder=torch.optim.SGD,
            **kwargs
    ):
        self.optimizer_encoder = optimizer_encoder
        self.optimizer_decoder = optimizer_decoder
        super().__init__(*args, **kwargs)

    def initialize_optimizer(self):
        # one optimizer per sub-module, each fed by its own 'optimizer_*__' parameters
        pgroups, kwargs = self.get_params_for_optimizer(
            'optimizer_encoder', self.module_.encoder.parameters())
        self.optimizer_encoder_ = self.optimizer_encoder(*pgroups, **kwargs)

        pgroups, kwargs = self.get_params_for_optimizer(
            'optimizer_decoder', self.module_.decoder.parameters())
        self.optimizer_decoder_ = self.optimizer_decoder(*pgroups, **kwargs)
        return self

    def train_step(self, Xi, yi):
        self.module_.train()
        self.optimizer_encoder_.zero_grad()
        self.optimizer_decoder_.zero_grad()

        y_pred = self.infer(Xi, yi)
        loss = self.get_loss(y_pred, yi, X=Xi, training=True)
        loss.backward()

        self.optimizer_encoder_.step()
        self.optimizer_decoder_.step()
        return {'loss': loss, 'y_pred': y_pred}

    def infer(self, Xi, yi=None):
        Xi = skorch.utils.to_tensor(Xi, device=self.device)
        yi = skorch.utils.to_tensor(yi, device=self.device) if yi is not None else None
        return self.module_(Xi, yi)

    def get_loss(self, y_pred, y_true, **kwargs):
        # the decoder may stop early at EOS, so truncate the target to the predicted length
        y_true = y_true[:, :y_pred.size(1)]
        y_pred_flat = y_pred.view(y_pred.size(0) * y_pred.size(1), -1)
        y_true_flat = y_true.view(y_true.size(0) * y_true.size(1))
        return super().get_loss(y_pred_flat, y_true_flat, **kwargs)

    def _predict(self, X, most_probable=True):
        y_probas = []
        for yp in self.forward_iter(X, training=False):
            if most_probable:
                pad = np.zeros((yp.size(0), data.MAX_LENGTH))
                pad[:, :yp.size(1)] = skorch.utils.to_numpy(yp.max(-1)[-1])
            else:
                pad = np.zeros((yp.size(0), data.MAX_LENGTH, yp.size(-1)))
                pad[:, :yp.size(1)] = skorch.utils.to_numpy(yp)
            y_probas.append(pad)
        y_proba = np.concatenate(y_probas, 0)
        return y_proba

    def predict_proba(self, X):
        return self._predict(X, most_probable=False)

    def predict(self, X):
        return self._predict(X, most_probable=True)

    def score(self, X, y):
        y_pred = self.predict(X)
        # start from a copy of the predictions so that padded positions do not count as errors
        y_true = y_pred.copy()
        for i, yi in enumerate(y):
            yi = skorch.utils.to_numpy(yi.squeeze())
            y_true[i, :len(yi)] = yi
        return sklearn.metrics.accuracy_score(y_true.flatten(), y_pred.flatten())
Model Training and Evaluation
Training Configuration
n_iters = 7500   # the original tutorial uses 75000 iterations; fewer here to keep training short
use_cuda = torch.cuda.is_available()   # use the GPU when one is available

training_pairs = [variablesFromPair(pairs[i]) for i in range(n_iters)]
Initializing the Trainer
trainer = Trainer(
    criterion=torch.nn.NLLLoss,
    # separate optimizers (and learning rates) for the encoder and the decoder
    optimizer_encoder=torch.optim.SGD,
    optimizer_encoder__lr=0.01,
    optimizer_decoder=torch.optim.SGD,
    optimizer_decoder__lr=0.01,
    module=Seq2Seq,
    module__hidden_size=256,
    module__encoder=EncoderRNN,
    module__encoder__input_size=input_lang.n_words,
    module__decoder=AttnDecoderRNN,
    module__decoder__output_size=output_lang.n_words,
    module__decoder__dropout_p=0.1,
    # no internal validation split
    train_split=None,
    # the model processes one sentence at a time, so train with batch size 1
    batch_size=1,
    iterator_train__shuffle=True,
    device='cuda' if use_cuda else 'cpu',
    max_epochs=10,
)
Starting the Training
The Trainer consumes the source and target sequences as separate X and y arguments, so we split the prepared pairs before calling fit:

X_train = [x for x, _ in training_pairs]
y_train = [y for _, y in training_pairs]
trainer.fit(X_train, y_train)
Model Evaluation and Prediction
After training has finished, we can use the model to translate sentences:
def evaluate(sentence):
    input_variable = variableFromSentence(input_lang, sentence)
    output_words = trainer.predict(input_variable.unsqueeze(0))[0]
    return ' '.join(output_lang.index2word[int(idx)] for idx in output_words
                    if int(idx) not in (data.SOS_token, data.EOS_token))

# example: translate a (French) source sentence taken from the corpus
source_sentence = random.choice(pairs)[0]
print(source_sentence, '->', evaluate(source_sentence))
Summary
This article has walked through building a complete Seq2Seq machine translation model with the skorch framework. Compared with the plain PyTorch implementation, skorch provides a more concise API and tidier management of the training process. The main advantages are:
- a much shorter hand-written training loop
- built-in saving and loading of model parameters (a minimal sketch follows this list)
- an API that is compatible with scikit-learn
- more convenient support for hyperparameter tuning
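For example, persisting the learned weights and restoring them later takes only a pair of calls (a minimal sketch; the file name is an arbitrary choice, not taken from the article):

trainer.save_params(f_params='seq2seq_params.pt')
# later: rebuild an identical Trainer, call trainer.initialize(), then reload the weights
# trainer.load_params(f_params='seq2seq_params.pt')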
This example shows how skorch streamlines the training workflow of deep learning models while keeping the flexibility of PyTorch. The combination is particularly well suited to research settings that call for rapid prototyping and experimentation.