Transformer Study Notes

Overall architecture diagram

(figure: overall Transformer pipeline/architecture diagram, image not included here)

Code example for word embedding and positional encoding

import torch
import torch.nn as nn
import torch.nn.functional as F

## word embedding for sequence modeling
## source sentence and target sentence
## a sequence is built from the indices of its tokens in the vocabulary
batch_size = 2

# vocabulary sizes
max_num_src_words = 8
max_num_tgt_words = 8
# embedding dimension (d_model)
model_dim = 8

# maximum sequence lengths
max_src_seq_len = 5
max_tgt_seq_len = 5
# maximum length used for positional encoding
max_position_len = 5
#src_len = torch.randint(2,5,(batch_size,))
#tgt_len = torch.randint(2,5,(batch_size,))

src_len = torch.Tensor([2,4]).to(torch.int32)
tgt_len = torch.Tensor([4,3]).to(torch.int32)
# print(src_len)  # the first sentence has length 2, the second has length 4
# tensor([2, 4], dtype=torch.int32)

# sentences built from word indices
# src_seq = [torch.randint(1,max_num_src_words,(L,)) for L in src_len]
# tgt_seq = [torch.randint(1,max_num_tgt_words,(L,)) for L in tgt_len]
# print(src_seq)  # random indices in [1, max_num_src_words), L of them per sentence, with L taken from src_len (2 for the first sentence, 4 for the second)
# [tensor([6, 1]), tensor([7, 7, 4, 4])]

# the source and target sequences have different lengths, so pad them to a fixed length
# src_seq = [F.pad(torch.randint(1,max_num_src_words,(L,)),(0,max_src_seq_len-L)) for L in src_len]
# tgt_seq = [F.pad(torch.randint(1,max_num_tgt_words,(L,)),(0,max_tgt_seq_len-L)) for L in tgt_len]
# print(src_seq)  # pads with 0 (the default fill value) up to the required sentence length
# [tensor([6, 7, 0, 0, 0]), tensor([5, 2, 3, 5, 0])]

# turn the list into a single tensor of shape batch_size × max_seq_len
src_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1,max_num_src_words,(L,)),(0,max(src_len)-L)),0)\
                     for L in src_len])
tgt_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1,max_num_tgt_words,(L,)),(0,max(tgt_len)-L)),0)\
                     for L in tgt_len])
#print(src_seq)
# tensor([[3, 4, 0, 0, 0],
#         [2, 3, 5, 6, 0]])

# build the word embedding tables
src_embedding_table = nn.Embedding(max_num_src_words+1,model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words+1,model_dim)
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)
# print(src_embedding_table.weight)
# Parameter containing:
# tensor([[-1.1492,  0.3123,  0.3638, -1.2436,  0.8841, -1.3182, -0.8704,  1.5792],
#         [ 1.2460, -0.1750, -0.3522, -1.3436, -0.8057, -0.0135,  0.1811, -0.1513],
#         [-0.6754,  0.7832, -0.2679,  0.1484, -0.7402,  1.5407,  0.2918, -0.7214],
#         [ 1.0883, -0.2671, -0.0641,  0.6298,  0.8708,  1.3060, -1.0503,  2.8721],
#         [-0.1510, -1.1240, -1.3180, -1.6120,  0.9943, -0.2064, -0.4703,  0.6330],
#         [ 0.0785, -0.1495,  1.3904,  1.0681, -0.1346,  0.1174,  0.6686, -0.6506],
#         [ 0.2084, -1.5857,  1.2090, -1.6490, -2.5377, -1.7984, -0.4302,  0.1567],
#         [ 1.7047, -0.7803, -0.5925, -0.8134,  1.6833,  1.4084, -0.4258, -1.0197],
#         [-1.2266,  0.4068,  0.7601, -1.0661,  0.4481, -1.2276,  1.1827, -1.5780]],
#        requires_grad=True)
# row 0 is the padding vector; each of the following 8 rows is the vector for the corresponding word index
# print(src_embedding_table.weight)
# print(src_seq)
# print(src_embedding)

# build the position embedding table
pos_mat = torch.arange(max_position_len).reshape((-1,1))
i_mat = torch.pow(10000,torch.arange(0,8,2).reshape((1,-1))/model_dim)
pe_embedding_table = torch.zeros(max_position_len,model_dim)
pe_embedding_table[:,0::2] = torch.sin(pos_mat/i_mat)
pe_embedding_table[:,1::2] = torch.cos(pos_mat/i_mat)

pe_embedding = nn.Embedding(max_position_len,model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table,requires_grad=False)

src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)),0) for _ in src_len]).to(torch.int32)
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)),0) for _ in tgt_len]).to(torch.int32)
# print(src_pos)
# tensor([[0, 1, 2, 3],
#         [0, 1, 2, 3]], dtype=torch.int32)

src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)
#print(pos_mat)
# tensor([0, 1, 2, 3, 4])
# print(pe_embedding_table)
# tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
#           1.0000e+00,  0.0000e+00,  1.0000e+00],
#         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
#           9.9995e-01,  1.0000e-03,  1.0000e+00],
#         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
#           9.9980e-01,  2.0000e-03,  1.0000e+00],
#         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
#           9.9955e-01,  3.0000e-03,  1.0000e+00],
#         [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
#           9.9920e-01,  4.0000e-03,  9.9999e-01]])
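
The input actually fed into the model is the element-wise sum of the word embedding and the positional embedding; a minimal sketch reusing the tensors built above (src_input / tgt_input are just illustrative names):

src_input = src_embedding + src_pe_embedding   # (batch_size, max(src_len), model_dim)
tgt_input = tgt_embedding + tgt_pe_embedding   # (batch_size, max(tgt_len), model_dim)
# print(src_input.shape)
# torch.Size([2, 4, 8])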

Source code analysis

nn.Transformer

Main parameters

d_model: int = 512, the model (output) dimension; every token representation is 512-dimensional
nhead: int = 8, the number of heads in multi-head attention
num_encoder_layers: int = 6, the number of encoder layers
num_decoder_layers: int = 6, the number of decoder layers
dim_feedforward: int = 2048, the hidden dimension of the feed-forward network; the attention output is first projected up to 2048 and then projected back to 512 (see the usage sketch below)
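
A minimal sketch of constructing nn.Transformer with these default values and running one forward pass (shapes assume the default batch_first=False, i.e. (seq_len, batch, d_model); the random tensors are just placeholders):

import torch
import torch.nn as nn

model = nn.Transformer(d_model=512, nhead=8,
                       num_encoder_layers=6, num_decoder_layers=6,
                       dim_feedforward=2048)

src = torch.rand(10, 32, 512)   # (src_seq_len, batch_size, d_model)
tgt = torch.rand(20, 32, 512)   # (tgt_seq_len, batch_size, d_model)
out = model(src, tgt)           # (tgt_seq_len, batch_size, d_model) -> torch.Size([20, 32, 512])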

Main classes

        if custom_encoder is not None:
            self.encoder = custom_encoder
        else:
            encoder_layer = TransformerEncoderLayer(
                d_model,
                nhead,
                dim_feedforward,
                dropout,
                activation,
                layer_norm_eps,
                batch_first,
                norm_first,
                bias,
                **factory_kwargs,
            )
            encoder_norm = LayerNorm(
                d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs
            )
            self.encoder = TransformerEncoder(
                encoder_layer, num_encoder_layers, encoder_norm
            )

        if custom_decoder is not None:
            self.decoder = custom_decoder
        else:
            decoder_layer = TransformerDecoderLayer(
                d_model,
                nhead,
                dim_feedforward,
                dropout,
                activation,
                layer_norm_eps,
                batch_first,
                norm_first,
                bias,
                **factory_kwargs,
            )
            decoder_norm = LayerNorm(
                d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs
            )
            self.decoder = TransformerDecoder(
                decoder_layer, num_decoder_layers, decoder_norm
            )

encoder_layer: multi-head attention, add (residual), layernorm, FFN (feed-forward network)
TransformerEncoder: a stack of 6 encoder_layers
decoder_layer: self-attention, cross-attention, FFN, add (residual), layernorm
TransformerDecoder: a stack of 6 decoder_layers (a small construction sketch follows)
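
These building blocks can also be instantiated directly; a minimal sketch (hyper-parameters chosen only for illustration):

import torch
import torch.nn as nn

encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=2048)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)

decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, dim_feedforward=2048)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)

src = torch.rand(10, 32, 512)   # (src_len, batch, d_model)
tgt = torch.rand(20, 32, 512)   # (tgt_len, batch, d_model)
memory = encoder(src)           # same shape as src
out = decoder(tgt, memory)      # (20, 32, 512)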

def forward

        memory = self.encoder(
            src,
            mask=src_mask,
            src_key_padding_mask=src_key_padding_mask,
            is_causal=src_is_causal,
        )
        output = self.decoder(
            tgt,
            memory,
            tgt_mask=tgt_mask,
            memory_mask=memory_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
            tgt_is_causal=tgt_is_causal,
            memory_is_causal=memory_is_causal,
        )
        return output

The forward function runs src through the encoder to obtain memory, then feeds memory together with tgt into the decoder to produce output.
Note that output still has dimension d_model; turning it into token probabilities requires a separate output projection (linear layer plus softmax), which is not part of nn.Transformer. A sketch of that step follows.
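
A minimal sketch of that projection (vocab_size and generator are hypothetical names, not attributes of nn.Transformer):

import torch
import torch.nn as nn

vocab_size = 10000                       # hypothetical target vocabulary size
generator = nn.Linear(512, vocab_size)   # maps d_model -> vocabulary logits

out = torch.rand(20, 32, 512)            # stand-in for the decoder output
logits = generator(out)                  # (tgt_len, batch, vocab_size)
probs = torch.softmax(logits, dim=-1)    # per-position token probabilities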

TransformerEncoderLayer

        self.self_attn = MultiheadAttention(
            d_model,
            nhead,
            dropout=dropout,
            bias=bias,
            batch_first=batch_first,
            **factory_kwargs,
        )
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs)

        self.norm_first = norm_first
        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

Multi-head self-attention, plus the FFN made of two linear layers with an activation and dropout in between. LayerNorm is applied twice: norm1 after self-attention and norm2 after the FFN (in the default post-norm configuration).

        x = src
        if self.norm_first:
            x = x + self._sa_block(
                self.norm1(x), src_mask, src_key_padding_mask, is_causal=is_causal
            )
            x = x + self._ff_block(self.norm2(x))
        else:
            x = self.norm1(
                x + self._sa_block(x, src_mask, src_key_padding_mask, is_causal=is_causal)
            )
            x = self.norm2(x + self._ff_block(x))

        return x

In forward, x is the input sequence. x serves as the query, key, and value of self-attention; the attention output is added back to x via a residual connection and then layer-normalized.
That result is passed through the FFN, added back via another residual connection, and layer-normalized again. A stand-alone sketch of the FFN dataflow follows.
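
A minimal sketch of the post-norm FFN sub-block (linear1, activation, dropout, linear2, residual, LayerNorm); this mirrors the dataflow of _ff_block rather than quoting the library code:

import torch
import torch.nn as nn
import torch.nn.functional as F

d_model, dim_feedforward = 512, 2048
linear1 = nn.Linear(d_model, dim_feedforward)
linear2 = nn.Linear(dim_feedforward, d_model)
dropout = nn.Dropout(0.1)
dropout2 = nn.Dropout(0.1)
norm2 = nn.LayerNorm(d_model)

x = torch.rand(10, 32, d_model)
ff = dropout2(linear2(dropout(F.relu(linear1(x)))))   # project up, activate, project back
x = norm2(x + ff)                                     # residual connection + LayerNorm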

TransformerEncoder

    def __init__(
        self,
        encoder_layer: "TransformerEncoderLayer",
        num_layers: int,
        norm: Optional[Module] = None,
        enable_nested_tensor: bool = True,
        mask_check: bool = True,
    ) 

Its constructor takes an encoder_layer, the number of layers (num_layers), and an optional final norm.

        for mod in self.layers:
            output = mod(
                output,
                src_mask=mask,
                is_causal=is_causal,
                src_key_padding_mask=src_key_padding_mask_for_layers,
            )

        if self.norm is not None:
            output = self.norm(output)

        return output

The output of each layer is the input to the next layer, and an optional final LayerNorm is applied at the end. A small usage sketch with a padding mask follows.
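
The padding positions produced earlier (token index 0) are usually passed to the encoder through src_key_padding_mask, where True marks positions that attention should ignore. A minimal sketch, assuming index 0 is the padding token as in the embedding example above:

import torch
import torch.nn as nn

encoder_layer = nn.TransformerEncoderLayer(d_model=8, nhead=2, batch_first=True)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

src_seq = torch.tensor([[3, 4, 0, 0], [2, 3, 5, 6]])   # 0 = padding
src_key_padding_mask = src_seq.eq(0)                    # True where padded
src = torch.rand(2, 4, 8)                               # stand-in for the embedded input
memory = encoder(src, src_key_padding_mask=src_key_padding_mask)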

TransformerDecoderLayer

        self.self_attn = MultiheadAttention(
            d_model,
            nhead,
            dropout=dropout,
            batch_first=batch_first,
            bias=bias,
            **factory_kwargs,
        )
        self.multihead_attn = MultiheadAttention(
            d_model,
            nhead,
            dropout=dropout,
            batch_first=batch_first,
            bias=bias,
            **factory_kwargs,
        )

self_attn: self-attention computed over tgt
multihead_attn: cross-attention computed between tgt and the memory coming from the encoder

        x = tgt
        if self.norm_first:
            x = x + self._sa_block(self.norm1(x), tgt_mask, tgt_key_padding_mask, tgt_is_causal)
            x = x + self._mha_block(
                self.norm2(x), memory, memory_mask, memory_key_padding_mask, memory_is_causal
            )
            x = x + self._ff_block(self.norm3(x))
        else:
            x = self.norm1(
                x + self._sa_block(x, tgt_mask, tgt_key_padding_mask, tgt_is_causal)
            )
            x = self.norm2(
                x
                + self._mha_block(
                    x, memory, memory_mask, memory_key_padding_mask, memory_is_causal
                )
            )
            x = self.norm3(x + self._ff_block(x))

        return x

tgt first goes through self-attention; the result is added to x via a residual connection and layer-normalized.
The output of the first block then attends over memory in the cross-attention block, again followed by a residual connection and layer norm.
Finally, the output of the second block passes through the FFN, a residual connection, and a third layer norm.

    def _sa_block(
        self,
        x: Tensor,
        attn_mask: Optional[Tensor],
        key_padding_mask: Optional[Tensor],
        is_causal: bool = False,
    ) -> Tensor:
        x = self.self_attn(
            x,
            x,
            x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            is_causal=is_causal,
            need_weights=False,
        )[0]
        return self.dropout1(x)

    # multihead attention block
    def _mha_block(
        self,
        x: Tensor,
        mem: Tensor,
        attn_mask: Optional[Tensor],
        key_padding_mask: Optional[Tensor],
        is_causal: bool = False,
    ) -> Tensor:
        x = self.multihead_attn(
            x,
            mem,
            mem,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            is_causal=is_causal,
            need_weights=False,
        )[0]
        return self.dropout2(x)

_sa_block passes x three times (as query, key, and value), while _mha_block passes x as the query and mem (the encoder memory) as the key and value. A stand-alone sketch of this call pattern follows.
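
A minimal sketch of the same call pattern with nn.MultiheadAttention (one module is reused here only to show the argument order; the decoder layer keeps two separate modules, self_attn and multihead_attn):

import torch
import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=512, num_heads=8, batch_first=True)

x = torch.rand(2, 20, 512)       # decoder-side input
mem = torch.rand(2, 10, 512)     # encoder memory

self_out, _ = mha(x, x, x)       # self-attention: query = key = value = x
cross_out, _ = mha(x, mem, mem)  # cross-attention: query = x, key = value = memory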

TransformerDecoder

        output = tgt
        for mod in self.layers:
            output = mod(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask,
                tgt_is_causal=tgt_is_causal,
                memory_is_causal=memory_is_causal,
            )

        if self.norm is not None:
            output = self.norm(output)

        return output

Each layer's output is fed as the input to the next layer, with an optional final norm at the end; a small usage sketch with a causal mask follows.
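
In practice the decoder is run with a causal tgt_mask so that each position can only attend to earlier positions; recent PyTorch versions expose nn.Transformer.generate_square_subsequent_mask for this. A minimal sketch:

import torch
import torch.nn as nn

decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=True)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)

tgt = torch.rand(2, 20, 512)      # (batch, tgt_len, d_model)
memory = torch.rand(2, 10, 512)   # stand-in for the encoder output
tgt_mask = nn.Transformer.generate_square_subsequent_mask(20)   # causal (20 x 20) mask
out = decoder(tgt, memory, tgt_mask=tgt_mask)                   # (2, 20, 512)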
Computing the attention formula
query: the query (what to look up); key: the keyword it is matched against; value: the information (content)
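
The standard scaled dot-product attention is Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V. A minimal sketch of that computation (shapes chosen only for illustration):

import math
import torch

def scaled_dot_product_attention(q, k, v):
    # q: (batch, q_len, d_k); k, v: (batch, k_len, d_k)
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)   # (batch, q_len, k_len)
    weights = torch.softmax(scores, dim=-1)                          # attention weights
    return torch.matmul(weights, v)                                  # weighted sum of the values

q = torch.rand(2, 5, 64)
k = torch.rand(2, 7, 64)
v = torch.rand(2, 7, 64)
out = scaled_dot_product_attention(q, k, v)   # (2, 5, 64)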
