Machine Translation
Machine Translation and Datasets
Machine translation cannot be done directly with a plain recurrent neural network. The main difficulties are that the output is a sequence of words rather than a single word, and that the length of the output sequence may differ from the length of the source sequence. For example, the 3-word I am Chinese has to be mapped to the 5-character 我是中国人.
import os
os.listdir('/home/kesci/input/')
OUT:
['fraeng6506', 'd2l9528', 'd2l6239']
import sys
sys.path.append('/home/kesci/input/d2l9528/')
import collections
import d2l
import zipfile
from d2l.data.base import Vocab
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch import optim
Data Preprocessing
Clean the dataset and turn it into minibatches that can be fed to the neural network.
with open('/home/kesci/input/fraeng6506/fra.txt', 'r') as f:
raw_text = f.read()
print(raw_text[0:1000])
OUT:
Go. Va ! CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)
Hi. Salut ! CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)
Hi. Salut. CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)
……
Who? Qui ? CC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #4366796 (gillux)
Wow! Ça alors ! CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #374631 (zmoo)
Fire! Au feu ! CC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #4627939 (sacredceltic)
Each line is one sample. Take the first line as an example:
Go. Va ! CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)
It consists of: Go + the punctuation mark "." + a Tab (it looks like a space but is actually a Tab), then the French word Va + a space + the punctuation mark "!". Everything after that is not needed.
- The first preprocessing step removes special characters (line 2 of the code below), e.g. the space between Va and !. It is not an ordinary space: characters are stored as encodings, and the usual space is \x20, inside the visible standard ASCII range 0x20~0x7e. \xa0 belongs to the extended character set of latin1 (ISO/IEC 8859-1) and represents the non-breaking space (nbsp), and \u202f is the narrow no-break space; both fall outside the GBK encoding range and must be removed, otherwise errors occur later. (See the small check after this list.)
- Uppercase --> lowercase (line 4). Otherwise Go and go would be treated as different words.
- Separate punctuation (line 5) by inserting a space before each punctuation mark, so that punctuation becomes its own token.
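To make the encoding issue concrete, here is a minimal check (not part of the original notebook) showing that these characters only look like spaces:
# '\x20' is the ordinary ASCII space; '\xa0' (non-breaking space) and
# '\u202f' (narrow no-break space) are different code points, so splitting
# on ' ' would not treat them as separators:
for ch in ['\x20', '\xa0', '\u202f']:
    print(hex(ord(ch)), ch == ' ')
# 0x20 True
# 0xa0 False
# 0x202f False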
def preprocess_raw(text):
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')
    out = ''
    for i, char in enumerate(text.lower()):
        if char in (',', '!', '.') and i > 0 and text[i-1] != ' ':
            out += ' '
        out += char
    return out

text = preprocess_raw(raw_text)
print(text[0:1000])
OUT:
go . va ! cc-by 2 .0 (france) attribution: tatoeba .org #2877272 (cm) & #1158250 (wittydev)
hi . salut ! cc-by 2 .0 (france) attribution: tatoeba .org #538123 (cm) & #509819 (aiji)
Tokenization
String --> list of words
num_examples = 50000
source, target = [], []  # two lists, for English and French respectively
for i, line in enumerate(text.split('\n')):  # '\n' separates samples (one per line)
    if i > num_examples:
        break
    parts = line.split('\t')  # '\t' separates the English part from the French part
    if len(parts) >= 2:
        source.append(parts[0].split(' '))
        target.append(parts[1].split(' '))
source[0:3], target[0:3]
([['go', '.'], ['hi', '.'], ['hi', '.']],
 [['va', '!'], ['salut', '!'], ['salut', '.']])
# plot the distribution of sentence lengths
d2l.set_figsize()
d2l.plt.hist([[len(l) for l in source], [len(l) for l in target]],label=['source', 'target'])
d2l.plt.legend(loc='upper right');
Building the Vocabulary
List of words --> list of word IDs. Build English and French vocabularies.
Line 2 of build_vocab below collects every word that appears in the dataset into one flat list, with no deduplication or any other processing.
The code then calls build_vocab directly to build the vocabulary for source.
def build_vocab(tokens):
    tokens = [token for line in tokens for token in line]
    return d2l.data.base.Vocab(tokens, min_freq=3, use_special_tokens=True)

src_vocab = build_vocab(source)
len(src_vocab)
3789
An excerpt of what Vocab provides (see the 'Building the Vocabulary' part of NLP notes 2.1 for a detailed walkthrough):
<pad>: special token used to pad sentences to a fixed length
<bos> / <eos>: tokens marking the beginning and end of a sentence, also added to sentences
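For intuition only, here is a minimal sketch of what such a Vocab-like class might look like, assuming the special tokens take the first ids (<pad>=0, <bos>=1, <eos>=2, <unk>=3, which is consistent with the outputs further below); the real implementation is d2l.data.base.Vocab and differs in its details:
import collections

class SimpleVocab:  # illustrative sketch, not the d2l implementation
    def __init__(self, tokens, min_freq=3):
        counter = collections.Counter(tokens)
        # special tokens first, then every word whose frequency reaches min_freq
        self.idx_to_token = ['<pad>', '<bos>', '<eos>', '<unk>']
        self.idx_to_token += [t for t, c in counter.items() if c >= min_freq]
        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}
        self.pad, self.bos, self.eos, self.unk = 0, 1, 2, 3

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):  # word(s) -> id(s); unknown words map to <unk>
        if isinstance(tokens, (list, tuple)):
            return [self.__getitem__(t) for t in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):   # id(s) -> word(s)
        return [self.idx_to_token[i] for i in indices]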
Loading the Dataset
- When feeding data in, every sentence within a batch must have the same length, so shorter sentences are padded with padding_token:

def pad(line, max_len, padding_token):
    if len(line) > max_len:
        return line[:max_len]
    return line + [padding_token] * (max_len - len(line))

pad(src_vocab[source[0]], 10, src_vocab.pad)

[38, 4, 0, 0, 0, 0, 0, 0, 0, 0] <-- only 2 valid tokens (38, 4); padded with padding_token (= 0) up to length 10.
- To turn a tokenized sentence such as go . into [38, 4, 0, 0, 0, 0, 0, 0, 0, 0], the __getitem__ method of the Vocab class is called:

def build_array(lines, vocab, max_len, is_source):  # in this example, source: English, target: French
    lines = [vocab[line] for line in lines]
    if not is_source:
        lines = [[vocab.bos] + line + [vocab.eos] for line in lines]  # add <bos>/<eos> to mark sentence start and end
    array = torch.tensor([pad(line, max_len, vocab.pad) for line in lines])
    valid_len = (array != vocab.pad).sum(1)  # count non-pad tokens along dimension 1
    return array, valid_len
- Data generator: feeds the network one batch of samples at a time, producing each batch on the fly instead of generating everything first and then looping over it.

def load_data_nmt(batch_size, max_len):  # This function is saved in d2l.
    src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    # data.TensorDataset checks that the tensors correspond one-to-one, i.e. that dimension 0 matches
    train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
    train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, train_iter
Testing the data generator:

src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size=2, max_len=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X =', X.type(torch.int32), '\nValid lengths for X =', X_valid_len,
          '\nY =', Y.type(torch.int32), '\nValid lengths for Y =', Y_valid_len)
    break
X = tensor([[ 5, 24, 3, 4, 0, 0, 0, 0],
[ 12, 1388, 7, 3, 4, 0, 0, 0]], dtype=torch.int32)
Valid lengths for X = tensor([4, 5])
Y = tensor([[ 1, 23, 46, 3, 3, 4, 2, 0],
[ 1, 15, 137, 27, 4736, 4, 2, 0]], dtype=torch.int32)
Valid lengths for Y = tensor([7, 7])
Encoder-Decoder
As mentioned earlier, one difficulty of machine translation is that translations are not of equal length, e.g. the 3-word I am Chinese -->> the 5-character 我是中国人. This motivates the Encoder-Decoder architecture, which is also widely used in dialogue systems and other generative tasks.
encoder: maps the input to a hidden state (the semantic encoding is exactly this hidden state)
decoder: maps the hidden state to the output
class Encoder(nn.Module):
    def __init__(self, **kwargs):
        super(Encoder, self).__init__(**kwargs)

    def forward(self, X, *args):
        raise NotImplementedError


class Decoder(nn.Module):
    def __init__(self, **kwargs):
        super(Decoder, self).__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        raise NotImplementedError

    def forward(self, X, state):
        raise NotImplementedError


class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, **kwargs):
        super(EncoderDecoder, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        enc_outputs = self.encoder(enc_X, *args)
        dec_state = self.decoder.init_state(enc_outputs, *args)
        return self.decoder(dec_X, dec_state)
Sequence to Sequence Model
Model
Training process:
As mentioned before, the initial hidden state H_{-1} is normally initialized to zero. For the Decoder network, however, its H_{-1} should be initialized with the Encoder's hidden state. <bos> is fed in as the first input; <bos> combined with that hidden state predicts the first French word bonjour. <bos> + bonjour + H_1 (the new hidden state) predicts le, and eventually <eos> is predicted, which ends the sentence.
The predicted sentence is then compared with the reference French sentence to compute the loss.
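To make the training input/label construction concrete, here is a small illustration with hypothetical token ids (1 = <bos>, 2 = <eos>, 0 = <pad>; the other ids are made up); it mirrors the Y_input / Y_label split used in train_ch7 further below:
import torch

# suppose the padded target sentence is: <bos> bonjour le monde <eos> <pad>
Y = torch.tensor([[1, 23, 46, 57, 2, 0]])  # hypothetical ids
Y_input = Y[:, :-1]   # [[1, 23, 46, 57, 2]]  decoder input: <bos> + words
Y_label = Y[:, 1:]    # [[23, 46, 57, 2, 0]]  labels: words + <eos>
# the valid length is reduced by 1 so that the padded positions are masked out of the loss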
Prediction process:
At prediction time we only have hello world . and do not know the corresponding French sentence, so the predicted bonjour is looped back and used as the input for predicting the next word (which yields le).
Architecture
Given an id list such as [1, 1, 3, 4], it cannot be fed into the network directly. Each id is first turned into a vector: e.g. if hello has id = 1, we do not feed the number 1 itself but its word vector (Embedding layer) -->> Recurrent layer (Encoder) -->> hidden state (semantic encoding) -->> Recurrent layer (Decoder) -->> Dense.
Encoder Code
class Seq2SeqEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)

    # An LSTM is used, so both the initial hidden state H_{-1} and the memory cell
    # are initialized, hence two zero tensors
    def begin_state(self, batch_size, device):
        return [torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device),
                torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device)]

    def forward(self, X, *args):
        X = self.embedding(X)  # X shape: (batch_size, seq_len, embed_size)
        X = X.transpose(0, 1)  # X shape: (seq_len, batch_size, embed_size)
        # The RNN needs the first axis to be time, hence the transpose
        # state = self.begin_state(X.shape[1], device=X.device)
        out, state = self.rnn(X)
        # The shape of out is (seq_len, batch_size, num_hiddens).
        # state contains the hidden state and the memory cell of the last time step;
        # each has shape (num_layers, batch_size, num_hiddens)
        return out, state
Example:
embed_size=8: each word is represented by an 8-dimensional vector
X = torch.zeros((4, 7)): constructs an input of 4 sentences with 7 words each

encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
output.shape, len(state), state[0].shape, state[1].shape

(torch.Size([7, 4, 16]), 2, torch.Size([2, 4, 16]), torch.Size([2, 4, 16]))
output has shape (seq_len=7, batch_size=4, num_hiddens=16); state holds the hidden state and the memory cell, each of shape (num_layers=2, batch_size=4, num_hiddens=16).
Decoder Code
self.embedding: plays the same role as the Encoder's embedding above, only here it is the French embedding;
self.dense: the dense layer maps H_t to the vocabulary (to generate bonjour); every recurrent unit outputs an H_t, and the fully connected (dense) layer picks the word with the highest score;
def init_state(): enc_outputs[1] here is the state returned by the Encoder. The Encoder's output C_t, H_t serves as the semantic encoding, and the same C_t, H_t also initializes the Decoder's first LSTM unit;
class Seq2SeqDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqDecoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, *args):
        return enc_outputs[1]

    def forward(self, X, state):
        X = self.embedding(X).transpose(0, 1)
        out, state = self.rnn(X, state)
        # Make the batch the first dimension to simplify loss computation.
        out = self.dense(out).transpose(0, 1)
        return out, state
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
state = decoder.init_state(encoder(X))
out, state = decoder(X, state)
out.shape, len(state), state[0].shape, state[1].shape
(torch.Size([4, 7, 10]), 2, torch.Size([2, 4, 16]), torch.Size([2, 4, 16]))
The last dimension of out.shape is 10 because vocab_size=10, the vocabulary size: each position gets 10 scores, and the word with the highest score is chosen as the output.
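For example, the predicted word id at every position can be read off from out by taking the argmax over that last dimension (a small illustration continuing the example above):
# out has shape (batch_size=4, seq_len=7, vocab_size=10): 10 scores per position
pred_ids = out.argmax(dim=-1)  # index of the highest score = predicted word id
pred_ids.shape                 # torch.Size([4, 7])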
Loss Function
SequenceMask: earlier, to make every sentence in a batch the same length, we padded them (e.g. a sentence of length 2 padded to 10). When computing the loss, the contribution of the padded positions has to be invalidated.
value=0: invalid positions are filled with value, which can be set to 0, -1, and so on.
.to(X_len.device): puts the torch.arange tensor on the same device as X_len (this matters when training on a GPU; on a CPU it makes no difference).
def SequenceMask(X, X_len, value=0):
    maxlen = X.size(1)
    mask = torch.arange(maxlen)[None, :].to(X_len.device) < X_len[:, None]
    X[~mask] = value
    return X
Example 1:
X = torch.tensor([[1,2,3], [4,5,6]])
SequenceMask(X, torch.tensor([1,2]))  # the first sample has 1 valid token, the second has 2
tensor([[1, 0, 0],
[4, 5, 0]])
Example 2:
X = torch.ones((2,3, 4))
SequenceMask(X, torch.tensor([1,2]),value=-1)
tensor([[[ 1., 1., 1., 1.],
[-1., -1., -1., -1.],
[-1., -1., -1., -1.]],
[[ 1., 1., 1., 1.],
[ 1., 1., 1., 1.],
[-1., -1., -1., -1.]]])
Cross-Entropy Loss Function
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    # pred shape: (batch_size, seq_len, vocab_size)
    # label shape: (batch_size, seq_len)
    # valid_length shape: (batch_size, )
    def forward(self, pred, label, valid_length):
        # the sample weights shape should be (batch_size, seq_len)
        weights = torch.ones_like(label)
        weights = SequenceMask(weights, valid_length).float()
        self.reduction = 'none'
        output = super(MaskedSoftmaxCELoss, self).forward(pred.transpose(1, 2), label)
        return (output * weights).mean(dim=1)
Example:
loss = MaskedSoftmaxCELoss()
loss(torch.ones((3, 4, 10)), torch.ones((3,4),dtype=torch.long), torch.tensor([4,3,0]))
tensor([2.3026, 1.7269, 0.0000])
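A quick sanity check on these numbers: the prediction is uniform over 10 classes, so the loss per valid token is ln 10 ≈ 2.3026; the second sample has only 3 valid tokens out of a sequence length of 4, and the third sample's valid length is 0, so its loss is masked to 0:
import math
math.log(10)           # ≈ 2.3026: loss per valid token for a uniform prediction over 10 classes
math.log(10) * 3 / 4   # ≈ 1.7269: 3 valid tokens averaged over seq_len = 4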
Training
def train_ch7(model, data_iter, lr, num_epochs, device):  # Saved in d2l
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    tic = time.time()
    for epoch in range(1, num_epochs + 1):
        l_sum, num_tokens_sum = 0.0, 0.0  # the reported loss is l_sum / num_tokens_sum
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_vlen, Y, Y_vlen = [x.to(device) for x in batch]  # *_vlen are the valid lengths
            # Y = <bos> + words + <eos>
            # Y_input = <bos> + words  (decoder input, no <eos>); Y_label = words + <eos>
            # hence the valid length becomes Y_vlen - 1
            Y_input, Y_label, Y_vlen = Y[:, :-1], Y[:, 1:], Y_vlen - 1
            Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)  # the decoder's prediction of Y
            l = loss(Y_hat, Y_label, Y_vlen).sum()
            l.backward()
            with torch.no_grad():  # gradient clipping
                d2l.grad_clipping_nn(model, 5, device)
            num_tokens = Y_vlen.sum().item()
            optimizer.step()
            l_sum += l.sum().item()
            num_tokens_sum += num_tokens
        if epoch % 50 == 0:
            print("epoch {0:4d},loss {1:.3f}, time {2:.1f} sec".format(
                epoch, (l_sum / num_tokens_sum), time.time() - tic))
            tic = time.time()
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
batch_size, num_examples, max_len = 64, 1e3, 10
lr, num_epochs, ctx = 0.005, 300, d2l.try_gpu()
src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(
    batch_size, max_len, num_examples)
encoder = Seq2SeqEncoder(
    len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(
    len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
train_ch7(model, train_iter, lr, num_epochs, ctx)
epoch 50,loss 0.093, time 38.2 sec
epoch 100,loss 0.046, time 37.9 sec
epoch 150,loss 0.032, time 36.8 sec
epoch 200,loss 0.027, time 37.5 sec
epoch 250,loss 0.026, time 37.8 sec
epoch 300,loss 0.025, time 37.3 sec
Testing
def translate_ch7(model, src_sentence, src_vocab, tgt_vocab, max_len, device):
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    src_len = len(src_tokens)
    if src_len < max_len:
        src_tokens += [src_vocab.pad] * (max_len - src_len)
    enc_X = torch.tensor(src_tokens, device=device)
    enc_valid_length = torch.tensor([src_len], device=device)
    # use unsqueeze to add the batch_size dimension.
    enc_outputs = model.encoder(enc_X.unsqueeze(dim=0), enc_valid_length)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)  # the encoder's Ht and Ct initialize the decoder state
    dec_X = torch.tensor([tgt_vocab.bos], device=device).unsqueeze(dim=0)  # <bos> is the decoder's first input
    predict_tokens = []
    for _ in range(max_len):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with the highest score is used as the next time step input.
        dec_X = Y.argmax(dim=2)
        py = dec_X.squeeze(dim=0).int().item()  # the predicted word id
        if py == tgt_vocab.eos:  # if the word is <eos>, the sentence is finished
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))
for sentence in ['Go .', 'Wow !', "I'm OK .", 'I won !']:
    print(sentence + ' => ' + translate_ch7(
        model, sentence, src_vocab, tgt_vocab, max_len, ctx))
Go . => va !
Wow ! => !
I'm OK . => ça va .
I won ! => j'ai gagné !
Beam Search
Simple greedy search:
At every time step, pick the word with the highest current probability as the next input. The problem is that this only finds a local optimum and does not take the influence of surrounding words into account.
Viterbi algorithm: choose the sentence with the highest overall score, i.e. try every word combination and then pick a sentence whose overall score is high, but the search space is far too large.
The improvement -->> beam search, which sits between the Viterbi algorithm and greedy search: a beam size parameter limits the number of candidate words kept at each step.
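Below is a minimal sketch of beam search over a trained decoder, assuming a decoder with the same interface as Seq2SeqDecoder above (it returns scores of shape (batch, seq_len, vocab_size)) and using the sum of log-probabilities as the sentence score. This is an illustration of the idea only, not the d2l implementation, and it ignores details such as device placement and length normalization:
import torch
import torch.nn.functional as F

def beam_search(decoder, dec_state, bos, eos, beam_size=3, max_len=10):
    # each beam entry: (token id list, accumulated log-probability, decoder state)
    beams = [([bos], 0.0, dec_state)]
    finished = []
    for _ in range(max_len):
        candidates = []
        for tokens, score, state in beams:
            dec_X = torch.tensor([[tokens[-1]]])       # shape (1, 1): last predicted token
            Y, new_state = decoder(dec_X, state)       # Y: (1, 1, vocab_size)
            log_probs = F.log_softmax(Y[0, 0], dim=-1)
            topv, topi = log_probs.topk(beam_size)     # expand only the beam_size best words
            for v, i in zip(topv.tolist(), topi.tolist()):
                candidates.append((tokens + [i], score + v, new_state))
        # keep the beam_size best partial sentences overall
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for tokens, score, state in candidates[:beam_size]:
            # sentences that produced <eos> are finished; the rest stay in the beam
            (finished if tokens[-1] == eos else beams).append((tokens, score, state))
        if not beams:
            break
    best = max(finished + beams, key=lambda c: c[1])
    return best[0][1:]  # drop the leading <bos>
With beam_size=1 this reduces to the greedy search used in translate_ch7 above.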