[Introduction to Artificial Intelligence] Implementing the Transformer Model
- A simple, complete implementation of the Transformer model, with the hyperparameters managed centrally through a configuration class.
1. Transformer Code Implementation
import numpy as np
import torch
import torch.nn as nn
import math
import torch.nn.functional as F
def get_attn_pad_mask(seq_q, seq_k):
"""
# 得到句子中padding的位置信息,以便于在计算自注意力和交互注意力的时候去掉padding符号的影响
# 在自注意力机制部分中,计算出Q*K的转置除以根号d_k之后,softmax之前,会得到一个张量
# 其形状为[batch_size,len_q,len_k],代表单词间(Q、K)的影响力分数
# 而get_attn_pad_mask的结果提供了一个与之尺度相同的张量,助力定位padding位置
# 计算softmax之前会将它们置为无穷大,以达到消除padding影响的作用
"""
batch_size, len_q = seq_q.size()
batch_size, len_k = seq_k.size()
"""
K、Q不一定一致,比如交叉注意力中,Q来自于解码端,K来自编码端
实际上,本方法只对K中的padding符号进行标识,并没有对Q进行处理
或许,Q、K的角色不同,导致了这样的结果
"""
    pad_attn_mask = seq_k.eq(0).unsqueeze(1)   # [batch_size, 1, len_k], True where seq_k is padding (id 0)
pad_attn_mask = pad_attn_mask.expand(batch_size, len_q, len_k)
return pad_attn_mask
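# A quick sanity check of get_attn_pad_mask (a minimal sketch; the toy ids below are
# arbitrary, with 0 assumed to be the padding token, as in the verification code later):
_q = torch.LongTensor([[5, 7]])          # [batch_size=1, len_q=2]
_k = torch.LongTensor([[5, 7, 9, 0]])    # [batch_size=1, len_k=4], last position is padding
print(get_attn_pad_mask(_q, _k).shape)   # torch.Size([1, 2, 4])
print(get_attn_pad_mask(_q, _k))         # last column is True: every query position ignores the padded key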
class ScaledDotProductAttention(nn.Module):
def __init__(self):
super(ScaledDotProductAttention, self).__init__()
def forward(self, Q, K, V, attn_mask):
"""
Q:[batch_size, n_heads, len_q, d_k]
K:[batch_size, n_heads, len_k, d_k]
V:[batch_size, n_heads, len_k, d_v]
KQ的编码尺寸d_k相同,KV的长度相同,这都是潜在的信息
"""
d_k = K.size(-1)
scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)
scores.masked_fill_(attn_mask, -1e9)
attn = nn.Softmax(dim=-1)(scores)
context = torch.matmul(attn, V)
return context
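# A minimal shape check of ScaledDotProductAttention; the sizes used here
# (batch_size=2, n_heads=8, len_q=len_k=5, d_k=d_v=64) are illustrative assumptions:
_Q = torch.randn(2, 8, 5, 64)
_K = torch.randn(2, 8, 5, 64)
_V = torch.randn(2, 8, 5, 64)
_no_mask = torch.zeros(2, 8, 5, 5, dtype=torch.bool)             # nothing is masked out
print(ScaledDotProductAttention()(_Q, _K, _V, _no_mask).shape)   # torch.Size([2, 8, 5, 64])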
class MultiHeadAttention(nn.Module):
def __init__(self,config):
super(MultiHeadAttention, self).__init__()
self.d_model = config.d_model
self.d_k = config.d_k
self.d_v = config.d_v
self.n_heads = config.n_heads
self.W_Q = nn.Linear(self.d_model, self.d_k * self.n_heads)
self.W_K = nn.Linear(self.d_model, self.d_k * self.n_heads)
self.W_V = nn.Linear(self.d_model, self.d_v * self.n_heads)
self.linear = nn.Linear(self.n_heads * self.d_v, self.d_model)
self.layer_norm = nn.LayerNorm(self.d_model)
def forward(self, Q, K, V, attn_mask):
"""
数据形状:
Q: [batch_size, len_q, d_model]
K: [batch_size, len_k, d_model]
V: [batch_size, len_k, d_model]
"""
residual, batch_size = Q, Q.size(0)
q_s = self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
k_s = self.W_K(K).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
v_s = self.W_V(V).view(batch_size, -1, self.n_heads, self.d_v).transpose(1, 2)
attn_mask = attn_mask.unsqueeze(1).repeat(1, self.n_heads, 1, 1)
context = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_v)
"""
contiguous()的作用要从pytorch多维数组的低层存储开始说起,
它一般是配合torch.permute()、torch.transpose()、torch.view()一起使用
以上的方法对张量改变“形状”其实并没有改变张量在内存中真正的形状,只是改变了访问策略罢了
而torch.contiguous()方法首先拷贝了一份张量在内存中的地址,然后将地址按照形状改变后的张量的语义进行排列
也就是说它改变了内存中的存储方式。
"""
        output = self.linear(context)   # project the concatenated heads back to d_model
        output = output + residual      # residual connection around the attention sub-layer
output = self.layer_norm(output)
return output
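# The role of contiguous() described above can be seen on a small tensor (a minimal
# sketch): transpose() only changes the stride metadata, so view() fails on the
# transposed result until contiguous() copies it into the new memory layout.
_t = torch.randn(2, 3, 4).transpose(1, 2)    # shape [2, 4, 3], memory layout unchanged
print(_t.is_contiguous())                    # False; _t.view(2, 12) would raise a RuntimeError
print(_t.contiguous().view(2, 12).shape)     # torch.Size([2, 12])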
class PoswiseFeedForwardNet(nn.Module):
def __init__(self,config):
super(PoswiseFeedForwardNet, self).__init__()
self.d_model = config.d_model
self.d_ff = config.d_ff
self.conv1 = nn.Conv1d(in_channels=self.d_model, out_channels=self.d_ff, kernel_size=1)
self.conv2 = nn.Conv1d(in_channels=self.d_ff, out_channels=self.d_model, kernel_size=1)
self.layer_norm = nn.LayerNorm(self.d_model)
def forward(self, inputs):
residual = inputs
output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))
output = self.conv2(output).transpose(1, 2)
output = self.layer_norm(output + residual)
return output
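# PoswiseFeedForwardNet uses Conv1d with kernel_size=1, which acts on each position
# independently and is numerically equivalent to a position-wise Linear layer.
# A minimal check of that equivalence (the sizes d_model=512, d_ff=2048 are the
# defaults assumed here):
_conv = nn.Conv1d(in_channels=512, out_channels=2048, kernel_size=1)
_lin = nn.Linear(512, 2048)
_lin.weight.data = _conv.weight.data.squeeze(-1)    # share the same parameters
_lin.bias.data = _conv.bias.data
_x = torch.randn(2, 5, 512)                         # [batch_size, seq_len, d_model]
print(torch.allclose(_conv(_x.transpose(1, 2)).transpose(1, 2), _lin(_x), atol=1e-5))   # True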
class PositionalEncoding(nn.Module):
def __init__(self,config):
super(PositionalEncoding, self).__init__()
        self.d_model = config.d_model
        self.max_len = config.max_len
        self.dropout = nn.Dropout(p=config.dropout)
pe = torch.zeros(self.max_len, self.d_model)
position = torch.arange(0, self.max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(-1 * math.log(10000.0) * (torch.arange(0, self.d_model, 2).float() / self.d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:, :x.size(1), :]
return self.dropout(x)
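# A spot check of the encoding against the closed form from the Transformer paper,
# PE(pos, 2i) = sin(pos / 10000^(2i / d_model)); the config values below are assumptions:
from types import SimpleNamespace
_pe = PositionalEncoding(SimpleNamespace(dropout=0.1, d_model=512, max_len=5000)).pe
_pos, _i = 3, 10
print(float(_pe[0, _pos, 2 * _i]))                # entry at position 3, dimension 20
print(math.sin(_pos / 10000 ** (2 * _i / 512)))   # agrees up to float32 precision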
class EncoderLayer(nn.Module):
def __init__(self,config):
super(EncoderLayer, self).__init__()
self.enc_self_attn = MultiHeadAttention(config)
self.pos_ffn = PoswiseFeedForwardNet(config)
def forward(self, enc_inputs, enc_self_attn_mask):
enc_outputs = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
enc_outputs = self.pos_ffn(enc_outputs)
return enc_outputs
class Encoder(nn.Module):
def __init__(self,config):
super(Encoder, self).__init__()
self.d_model = config.d_model
self.src_vocab_size = config.src_vocab_size
self.enc_n_layers = config.enc_n_layers
self.src_emb = nn.Embedding(self.src_vocab_size, self.d_model)
self.pos_emb = PositionalEncoding(config)
self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(self.enc_n_layers)])
def forward(self, enc_inputs):
enc_outputs = self.src_emb(enc_inputs)
enc_outputs = self.pos_emb(enc_outputs)
enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)
for layer in self.layers:
enc_outputs = layer(enc_outputs, enc_self_attn_mask)
return enc_outputs
def get_attn_subsequent_mask(tgt_len):
    # Look-ahead mask for the decoder: ones above the main diagonal, so that
    # position i cannot attend to any position j > i.
    attn_shape = [1, tgt_len, tgt_len]
    subsequence_mask = np.triu(np.ones(attn_shape), k=1)
    subsequence_mask = torch.from_numpy(subsequence_mask).byte()
    return subsequence_mask
class DecoderLayer(nn.Module):
def __init__(self,config):
super(DecoderLayer, self).__init__()
self.dec_self_attn = MultiHeadAttention(config)
self.dec_enc_attn = MultiHeadAttention(config)
self.pos_ffn = PoswiseFeedForwardNet(config)
def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
dec_outputs = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
dec_outputs = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)
dec_outputs = self.pos_ffn(dec_outputs)
return dec_outputs
class Decoder(nn.Module):
def __init__(self,config):
super(Decoder, self).__init__()
self.d_model = config.d_model
self.tgt_vocab_size = config.tgt_vocab_size
self.dec_n_layers = config.dec_n_layers
self.tgt_emb = nn.Embedding(self.tgt_vocab_size, self.d_model)
self.pos_emb = PositionalEncoding(config)
self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(self.dec_n_layers)])
def forward(self, dec_inputs, enc_inputs, enc_outputs):
dec_outputs = self.tgt_emb(dec_inputs)
dec_outputs = self.pos_emb(dec_outputs)
dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
tgt_len = dec_inputs.size(1)
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(tgt_len).to(dec_inputs.device)
dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)
dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)
for layer in self.layers:
dec_outputs = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
return dec_outputs
class Transformer(nn.Module):
def __init__(self,config):
super(Transformer, self).__init__()
self.d_model = config.d_model
self.tgt_vocab_size = config.tgt_vocab_size
self.encoder = Encoder(config)
self.decoder = Decoder(config)
self.projection = nn.Linear(self.d_model, self.tgt_vocab_size, bias=False)
def forward(self, enc_inputs, dec_inputs):
enc_outputs = self.encoder(enc_inputs)
dec_outputs = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        dec_logits = self.projection(dec_outputs)   # [batch_size, tgt_len, tgt_vocab_size]
        # Return raw logits: nn.CrossEntropyLoss in the verification code applies
        # log-softmax internally, so no softmax is applied here.
        return dec_logits
2. Centralized Management of Model Parameters
import json
class ModelConfig:
def __init__(self,
d_k = None,
d_v = None,
d_model = None,
src_vocab_size = None,
tgt_vocab_size = None,
enc_n_layers = 6,
dec_n_layers = 6,
dropout = 0.1,
max_len = 5000,
n_heads = 8,
d_ff = 2048,
):
self.d_k = d_k
self.d_v = d_v
self.d_model = d_model
self.src_vocab_size = src_vocab_size
self.tgt_vocab_size = tgt_vocab_size
self.enc_n_layers = enc_n_layers
self.dec_n_layers = dec_n_layers
self.dropout = dropout
self.max_len = max_len
self.n_heads = n_heads
self.d_ff = d_ff
    def save(self, save_path):
        d = {
            "d_k": self.d_k,
            "d_v": self.d_v,
            "d_model": self.d_model,
            "src_vocab_size": self.src_vocab_size,
            "tgt_vocab_size": self.tgt_vocab_size,
            "enc_n_layers": self.enc_n_layers,
            "dec_n_layers": self.dec_n_layers,
            "dropout": self.dropout,
            "max_len": self.max_len,
            "n_heads": self.n_heads,
            "d_ff": self.d_ff
        }
        with open(save_path, "w") as f:
            f.write(json.dumps(d))
    def load(self, load_path):
        with open(load_path) as f:
            d = json.loads(f.read())
        self.d_k = d["d_k"]
        self.d_v = d["d_v"]
        self.d_model = d["d_model"]
        self.src_vocab_size = d["src_vocab_size"]
        self.tgt_vocab_size = d["tgt_vocab_size"]
        self.enc_n_layers = d["enc_n_layers"]
        self.dec_n_layers = d["dec_n_layers"]
        self.dropout = d["dropout"]
        self.max_len = d["max_len"]
        self.n_heads = d["n_heads"]
        self.d_ff = d["d_ff"]
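A short usage sketch of ModelConfig: construct it with the model sizes, persist it to JSON, and reload it into a fresh instance (the file name is an arbitrary example):
# Minimal usage sketch; the file name is an illustrative assumption.
cfg = ModelConfig(d_k=64, d_v=64, d_model=512, src_vocab_size=5, tgt_vocab_size=7)
cfg.save("transformer_config.json")
restored = ModelConfig()
restored.load("transformer_config.json")
print(restored.d_model, restored.n_heads, restored.d_ff)   # 512 8 2048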
3. Verification Code
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import math
def make_batch(sentences):
    # sentences[0] is the source sentence, sentences[1] the decoder input
    # (prefixed with 'S'), and sentences[2] the expected output (suffixed with 'E').
    input_batch = [[src_vocab[n] for n in sentences[0].split()]]
    output_batch = [[tgt_vocab[n] for n in sentences[1].split()]]
    target_batch = [[tgt_vocab[n] for n in sentences[2].split()]]
    return torch.LongTensor(input_batch), torch.LongTensor(output_batch), torch.LongTensor(target_batch)
if __name__ == '__main__':
sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']
src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}
src_vocab_size = len(src_vocab)
tgt_vocab = {'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4, 'S': 5, 'E': 6}
tgt_vocab_size = len(tgt_vocab)
src_len = 5
tgt_len = 5
config = ModelConfig(d_k=64, d_v=64, d_model=512, src_vocab_size=src_vocab_size, tgt_vocab_size=tgt_vocab_size)
model = Transformer(config)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
enc_inputs, dec_inputs, target_batch = make_batch(sentences)
for epoch in range(50):
optimizer.zero_grad()
outputs = model(enc_inputs, dec_inputs)
outputs = outputs.squeeze(0)
loss = criterion(outputs, target_batch.contiguous().view(-1))
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))
loss.backward()
optimizer.step()
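Once the loss has converged, the trained model can be sanity-checked with step-by-step greedy decoding. Below is a minimal sketch, continuing inside the if __name__ == '__main__': block; it reuses the model, enc_inputs, tgt_vocab and tgt_len defined above:
    # Greedy-decoding sketch: start from 'S' and repeatedly append the most likely
    # next token until 'E' is produced or tgt_len tokens have been generated.
    model.eval()
    idx2word = {i: w for w, i in tgt_vocab.items()}
    dec_seq = [tgt_vocab['S']]
    with torch.no_grad():
        for _ in range(tgt_len):
            dec_in = torch.LongTensor([dec_seq])    # [1, current_length]
            logits = model(enc_inputs, dec_in)      # [1, current_length, tgt_vocab_size]
            next_id = logits[0, -1].argmax().item()
            dec_seq.append(next_id)
            if idx2word[next_id] == 'E':
                break
    print([idx2word[i] for i in dec_seq[1:]])       # typically ['i', 'want', 'a', 'beer', 'E'] after training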
4. Details of the Masking Mechanism
- In the Encoder, the mask mechanism removes the influence of the padding symbols introduced when batching sentences; see the get_attn_pad_mask function.
- In the Decoder, the mask additionally hides the correct answers that come after the current position, so the model cannot peek ahead during training; see the get_attn_subsequent_mask function (both masks are illustrated below).
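For a concrete picture, here is what the two mask functions return for small inputs (a quick check that runs with the functions defined in section 1; the sizes are illustrative):
print(get_attn_subsequent_mask(4))
# tensor([[[0, 1, 1, 1],
#          [0, 0, 1, 1],
#          [0, 0, 0, 1],
#          [0, 0, 0, 0]]], dtype=torch.uint8)
print(get_attn_pad_mask(torch.LongTensor([[1, 2, 0]]), torch.LongTensor([[1, 2, 0]])))
# tensor([[[False, False,  True],
#          [False, False,  True],
#          [False, False,  True]]])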