Transformer Study Notes
Overall architecture diagram
Worked code examples for the embedding and positional encoding
## Word embedding, using sequence modeling as the example
## Consider a source sentence and a target sentence
## Build the sequences; each token in a sequence is represented by its index in the vocabulary
batch_size = 2
# vocabulary size
max_num_src_words = 8
max_num_tgt_words = 8
# embedding dimension
model_dim = 8
# maximum sequence length
max_src_seq_len = 5
max_tgt_seq_len = 5
# maximum length used for the positional encoding
max_position_len = 5
#src_len = torch.randint(2,5,(batch_size,))
#tgt_len = torch.randint(2,5,(batch_size,))
src_len = torch.Tensor([2,4]).to(torch.int32)
tgt_len = torch.Tensor([4,3]).to(torch.int32)
# print(src_len)  # the first sentence has length 2, the second has length 4
# tensor([2, 4], dtype=torch.int32)
# sentences represented as sequences of word indices
# src_seq = [torch.randint(1,max_num_src_words,(L,)) for L in src_len]
# tgt_seq = [torch.randint(1,max_num_tgt_words,(L,)) for L in tgt_len]
# print(src_seq)  # each sentence is L random indices drawn from [1, max_num_src_words); L comes from src_len, so 2 tokens for the first sentence and 4 for the second
# [tensor([6, 1]), tensor([7, 7, 4, 4])]
# the sentences in a batch have different lengths, so each is padded to a common length before being stacked into one tensor
# src_seq = [F.pad(torch.randint(1,max_num_src_words,(L,)),(0,max_src_seq_len-L)) for L in src_len]
# tgt_seq = [F.pad(torch.randint(1,max_num_tgt_words,(L,)),(0,max_tgt_seq_len-L)) for L in tgt_len]
# print(src_seq)  # F.pad fills with 0 by default up to the fixed sentence length
# [tensor([6, 7, 0, 0, 0]), tensor([5, 2, 3, 5, 0])]
# turn the list into a single tensor of shape batch_size × max_seq_len
src_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1,max_num_src_words,(L,)),(0,max(src_len)-L)),0)\
for L in src_len])
tgt_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1,max_num_tgt_words,(L,)),(0,max(tgt_len)-L)),0)\
for L in tgt_len])
#print(src_seq)
# tensor([[3, 4, 0, 0, 0],
# [2, 3, 5, 6, 0]])
# build the word embedding tables
src_embedding_table = nn.Embedding(max_num_src_words+1,model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words+1,model_dim)
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)
# print(src_embedding_table.weight)
# Parameter containing:
# tensor([[-1.1492, 0.3123, 0.3638, -1.2436, 0.8841, -1.3182, -0.8704, 1.5792],
# [ 1.2460, -0.1750, -0.3522, -1.3436, -0.8057, -0.0135, 0.1811, -0.1513],
# [-0.6754, 0.7832, -0.2679, 0.1484, -0.7402, 1.5407, 0.2918, -0.7214],
# [ 1.0883, -0.2671, -0.0641, 0.6298, 0.8708, 1.3060, -1.0503, 2.8721],
# [-0.1510, -1.1240, -1.3180, -1.6120, 0.9943, -0.2064, -0.4703, 0.6330],
# [ 0.0785, -0.1495, 1.3904, 1.0681, -0.1346, 0.1174, 0.6686, -0.6506],
# [ 0.2084, -1.5857, 1.2090, -1.6490, -2.5377, -1.7984, -0.4302, 0.1567],
# [ 1.7047, -0.7803, -0.5925, -0.8134, 1.6833, 1.4084, -0.4258, -1.0197],
# [-1.2266, 0.4068, 0.7601, -1.0661, 0.4481, -1.2276, 1.1827, -1.5780]],
# requires_grad=True)
# row 0 is the embedding of the padding index; each of the following 8 rows is the vector for the corresponding word index
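# A hedged alternative (reusing max_num_src_words and model_dim from above): passing
# padding_idx=0 to nn.Embedding pins row 0 to all zeros and excludes it from gradient
# updates, which is the usual way to treat the padding index.
# src_embedding_table = nn.Embedding(max_num_src_words + 1, model_dim, padding_idx=0)
# print(src_embedding_table.weight[0])  # all zeros, never updated during training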
# print(src_embedding_table.weight)
# print(src_seq)
# print(src_embedding)
# build the positional encoding table
pos_mat = torch.arange(max_position_len).reshape((-1,1))
i_mat = torch.pow(10000,torch.arange(0,8,2).reshape((1,-1))/model_dim)
pe_embedding_table = torch.zeros(max_position_len,model_dim)
pe_embedding_table[:,0::2] = torch.sin(pos_mat/i_mat)
pe_embedding_table[:,1::2] = torch.cos(pos_mat/i_mat)
pe_embedding = nn.Embedding(max_position_len,model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table,requires_grad=False)
src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)),0) for _ in src_len]).to(torch.int32)
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)),0) for _ in tgt_len]).to(torch.int32)
# print(src_pos)
# tensor([[0, 1, 2, 3],
#         [0, 1, 2, 3]], dtype=torch.int32)
src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)
# print(pos_mat)
# tensor([[0], [1], [2], [3], [4]])  (a column vector, because of the reshape((-1, 1)))
# print(pe_embedding_table)
# tensor([[ 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
# 1.0000e+00, 0.0000e+00, 1.0000e+00],
# [ 8.4147e-01, 5.4030e-01, 9.9833e-02, 9.9500e-01, 9.9998e-03,
# 9.9995e-01, 1.0000e-03, 1.0000e+00],
# [ 9.0930e-01, -4.1615e-01, 1.9867e-01, 9.8007e-01, 1.9999e-02,
# 9.9980e-01, 2.0000e-03, 1.0000e+00],
# [ 1.4112e-01, -9.8999e-01, 2.9552e-01, 9.5534e-01, 2.9995e-02,
# 9.9955e-01, 3.0000e-03, 1.0000e+00],
# [-7.5680e-01, -6.5364e-01, 3.8942e-01, 9.2106e-01, 3.9989e-02,
# 9.9920e-01, 4.0000e-03, 9.9999e-01]])
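The table above matches the sinusoidal positional encoding from "Attention Is All You Need"; what the code computes is exactly

$$
PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right), \qquad
PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)
$$

where pos indexes the position (the rows built from pos_mat) and i indexes the dimension pair (the columns built from i_mat).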
Source code analysis
nn.Transformer
Main parameters
d_model: int = 512, the feature dimension used throughout the model; every token representation, including the output, is 512-dimensional
nhead: int = 8, the number of heads in multi-head attention
num_encoder_layers: int = 6, six encoder layers
num_decoder_layers: int = 6, six decoder layers
dim_feedforward: int = 2048, the hidden dimension of the feed-forward network; the multi-head attention output is first projected up to 2048 dimensions and then projected back to 512 (a short usage sketch follows this list)
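A minimal usage sketch with exactly these default hyperparameters; the tensors are random stand-ins, and with the default batch_first=False the layout is (sequence length, batch size, d_model):

import torch
import torch.nn as nn

model = nn.Transformer(d_model=512, nhead=8,
                       num_encoder_layers=6, num_decoder_layers=6,
                       dim_feedforward=2048)
src = torch.rand(10, 32, 512)   # (source length, batch size, d_model)
tgt = torch.rand(20, 32, 512)   # (target length, batch size, d_model)
out = model(src, tgt)
print(out.shape)                # torch.Size([20, 32, 512])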
Main classes
if custom_encoder is not None:
    self.encoder = custom_encoder
else:
    encoder_layer = TransformerEncoderLayer(
        d_model,
        nhead,
        dim_feedforward,
        dropout,
        activation,
        layer_norm_eps,
        batch_first,
        norm_first,
        bias,
        **factory_kwargs,
    )
    encoder_norm = LayerNorm(
        d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs
    )
    self.encoder = TransformerEncoder(
        encoder_layer, num_encoder_layers, encoder_norm
    )

if custom_decoder is not None:
    self.decoder = custom_decoder
else:
    decoder_layer = TransformerDecoderLayer(
        d_model,
        nhead,
        dim_feedforward,
        dropout,
        activation,
        layer_norm_eps,
        batch_first,
        norm_first,
        bias,
        **factory_kwargs,
    )
    decoder_norm = LayerNorm(
        d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs
    )
    self.decoder = TransformerDecoder(
        decoder_layer, num_decoder_layers, decoder_norm
    )
encoder_layer: multi-head attention, residual add, LayerNorm, and an FFN (feed-forward network)
TransformerEncoder: a stack of 6 encoder_layers
decoder_layer: self-attention, cross-attention, and an FFN, each followed by a residual add and LayerNorm
TransformerDecoder: a stack of 6 decoder_layers (a quick structural check follows this list)
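A small, hedged check of this structure; the encoder/decoder attribute names are the ones shown in the source quoted above:

import torch.nn as nn

model = nn.Transformer()                         # defaults: d_model=512, nhead=8, 6 + 6 layers
print(len(model.encoder.layers))                 # 6 TransformerEncoderLayer modules
print(len(model.decoder.layers))                 # 6 TransformerDecoderLayer modules
print(type(model.encoder.layers[0]).__name__)    # TransformerEncoderLayer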
def forward
memory = self.encoder(
    src,
    mask=src_mask,
    src_key_padding_mask=src_key_padding_mask,
    is_causal=src_is_causal,
)
output = self.decoder(
    tgt,
    memory,
    tgt_mask=tgt_mask,
    memory_mask=memory_mask,
    tgt_key_padding_mask=tgt_key_padding_mask,
    memory_key_padding_mask=memory_key_padding_mask,
    tgt_is_causal=tgt_is_causal,
    memory_is_causal=memory_is_causal,
)
return output
The forward function first runs the encoder on src to produce memory, then feeds memory together with tgt into the decoder to obtain output.
Note that output is a tensor of d_model-dimensional features, not yet token probabilities; a linear projection onto the vocabulary plus a softmax is still needed, as sketched below.
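A minimal sketch of that output head; the vocab_size value and the generator linear layer are illustrative additions of these notes, not part of nn.Transformer:

import torch
import torch.nn as nn

model = nn.Transformer(d_model=512, nhead=8)
src = torch.rand(10, 32, 512)
tgt = torch.rand(20, 32, 512)
features = model(src, tgt)                 # (20, 32, 512): d_model features per target position
vocab_size = 10000                         # illustrative vocabulary size (an assumption)
generator = nn.Linear(512, vocab_size)     # output head mapping features to vocabulary logits
probs = torch.softmax(generator(features), dim=-1)
print(probs.shape)                         # torch.Size([20, 32, 10000])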
TransformerEncoderLayer
self.self_attn = MultiheadAttention(
    d_model,
    nhead,
    dropout=dropout,
    bias=bias,
    batch_first=batch_first,
    **factory_kwargs,
)
# Implementation of Feedforward model
self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs)
self.dropout = Dropout(dropout)
self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs)
self.norm_first = norm_first
self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
self.dropout1 = Dropout(dropout)
self.dropout2 = Dropout(dropout)
So the layer contains multi-head attention, an FFN made of two linear layers with an activation in between, and two LayerNorms: the first applied after self-attention, the second after the FFN.
x = src
# (the pre-norm branch taken when self.norm_first is True is omitted in these notes)
else:
    x = self.norm1(
        x + self._sa_block(x, src_mask, src_key_padding_mask, is_causal=is_causal)
    )
    x = self.norm2(x + self._ff_block(x))
return x
In forward, x is the input sentence. x is used as the query, key, and value of self-attention; the attention output is added back to x through a residual connection and then layer-normalized.
That result is passed through the FFN, added back to its input via another residual connection, and layer-normalized again.
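A minimal usage sketch of a single encoder layer (default post-norm, i.e. norm_first=False, matching the branch quoted above):

import torch
import torch.nn as nn

encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
src = torch.rand(10, 32, 512)        # (sequence length, batch size, d_model)
out = encoder_layer(src)
print(out.shape)                     # torch.Size([10, 32, 512]); the shape is preserved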
TransformerEncoder
def __init__(
    self,
    encoder_layer: "TransformerEncoderLayer",
    num_layers: int,
    norm: Optional[Module] = None,
    enable_nested_tensor: bool = True,
    mask_check: bool = True,
)
The key arguments are encoder_layer and num_layers.
for mod in self.layers:
    output = mod(
        output,
        src_mask=mask,
        is_causal=is_causal,
        src_key_padding_mask=src_key_padding_mask_for_layers,
    )

if self.norm is not None:
    output = self.norm(output)

return output
Each layer's output becomes the next layer's input, as the sketch below illustrates.
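A hedged sketch of that stacking behaviour: with the default norm=None and the module put in eval() mode (so dropout is inactive), running the stacked encoder should match looping over its layers by hand.

import torch
import torch.nn as nn

encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=6).eval()  # norm defaults to None
x = torch.rand(10, 2, 512)

with torch.no_grad():
    out_stacked = encoder(x)
    out_manual = x
    for layer in encoder.layers:          # each layer's output feeds the next layer
        out_manual = layer(out_manual)

print(torch.allclose(out_stacked, out_manual))  # expected: True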
TransformerDecoderLayer
self.self_attn = MultiheadAttention(
    d_model,
    nhead,
    dropout=dropout,
    batch_first=batch_first,
    bias=bias,
    **factory_kwargs,
)
self.multihead_attn = MultiheadAttention(
    d_model,
    nhead,
    dropout=dropout,
    batch_first=batch_first,
    bias=bias,
    **factory_kwargs,
)
Self-attention (self_attn): computed over tgt.
Cross-attention (multihead_attn): computed between tgt and the memory coming from the encoder.
x = tgt
# (the pre-norm branch taken when self.norm_first is True is omitted in these notes)
else:
    x = self.norm1(
        x + self._sa_block(x, tgt_mask, tgt_key_padding_mask, tgt_is_causal)
    )
    x = self.norm2(
        x
        + self._mha_block(
            x, memory, memory_mask, memory_key_padding_mask, memory_is_causal
        )
    )
    x = self.norm3(x + self._ff_block(x))
return x
First block: self-attention over tgt, a residual connection with x, then LayerNorm.
Second block: cross-attention between the x from the first block and memory, followed by a residual connection and LayerNorm.
Third block: the x from the second block goes through the FFN, a residual connection, and LayerNorm. A usage sketch follows.
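A minimal usage sketch of a single decoder layer; memory here is a random stand-in for the encoder output:

import torch
import torch.nn as nn

decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
memory = torch.rand(10, 32, 512)     # encoder output (source length, batch size, d_model)
tgt = torch.rand(20, 32, 512)        # target-side input (target length, batch size, d_model)
out = decoder_layer(tgt, memory)     # self-attention on tgt, cross-attention with memory, FFN
print(out.shape)                     # torch.Size([20, 32, 512])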
def _sa_block(
    self,
    x: Tensor,
    attn_mask: Optional[Tensor],
    key_padding_mask: Optional[Tensor],
    is_causal: bool = False,
) -> Tensor:
    x = self.self_attn(
        x,
        x,
        x,
        attn_mask=attn_mask,
        key_padding_mask=key_padding_mask,
        is_causal=is_causal,
        need_weights=False,
    )[0]
    return self.dropout1(x)
# multihead attention block
def _mha_block(
    self,
    x: Tensor,
    mem: Tensor,
    attn_mask: Optional[Tensor],
    key_padding_mask: Optional[Tensor],
    is_causal: bool = False,
) -> Tensor:
    x = self.multihead_attn(
        x,
        mem,
        mem,
        attn_mask=attn_mask,
        key_padding_mask=key_padding_mask,
        is_causal=is_causal,
        need_weights=False,
    )[0]
    return self.dropout2(x)
In the self-attention block x is passed three times (query = key = value = x), while in the cross-attention block the call is (x, mem, mem): the query is x and the keys/values come from memory, as illustrated below.
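A small illustration of that query/key/value split using nn.MultiheadAttention directly; the tensors are random stand-ins for the decoder input and the encoder memory:

import torch
import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=512, num_heads=8)
x = torch.rand(20, 2, 512)           # decoder-side queries (target length, batch size, d_model)
mem = torch.rand(10, 2, 512)         # encoder memory supplies the keys and values
out, _ = mha(x, mem, mem)            # query=x, key=value=mem, exactly as in _mha_block
print(out.shape)                     # torch.Size([20, 2, 512]); the output length follows the query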
TransformerDecoder
output = tgt

for mod in self.layers:
    output = mod(
        output,
        memory,
        tgt_mask=tgt_mask,
        memory_mask=memory_mask,
        tgt_key_padding_mask=tgt_key_padding_mask,
        memory_key_padding_mask=memory_key_padding_mask,
        tgt_is_causal=tgt_is_causal,
        memory_is_causal=memory_is_causal,
    )

if self.norm is not None:
    output = self.norm(output)

return output
Again, each layer's output is fed to the next layer as its input; a usage sketch follows.
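A minimal usage sketch of the stacked decoder; the causal tgt_mask built with torch.triu below matches the standard "subsequent" mask (0 on and below the diagonal, -inf above it):

import torch
import torch.nn as nn

decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
memory = torch.rand(10, 32, 512)     # encoder output
tgt = torch.rand(20, 32, 512)        # target-side input
tgt_mask = torch.triu(torch.full((20, 20), float('-inf')), diagonal=1)  # causal mask
out = decoder(tgt, memory, tgt_mask=tgt_mask)
print(out.shape)                     # torch.Size([20, 32, 512])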
Computing the attention formula
query: what we are searching with; key: what each position is matched against; value: the information carried at each position
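The scaled dot-product attention this refers to, from the original paper, is

$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{T}}{\sqrt{d_k}}\right)V
$$

where d_k is the dimension of the keys; dividing by sqrt(d_k) keeps the dot products in a range where the softmax does not saturate.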