Transformer and Self-Attention(Pytorch实现)

1. Attention

Attention函数的本质可以描述为:将查询(query)和一组键值(key-value)对映射到输出(output)。
在这里插入图片描述
从本质上理解,Attention是从大量信息中筛选出少量重要信息,并聚焦到这些重要信息上,忽略大多不重要的信息。权重越大越聚焦于其对应的Value值上,即权重代表了信息的重要性,而Value是其对应的信息。

2. Self-Attention

而自注意力机制是注意力机制的变体,其减少了对外部信息的依赖,更擅长捕捉数据或特征的内部相关性。具体讲就是,一个序列中的每个词都要和该序列中的所有词进行attention计算,目的是学习序列内部的词依赖关系,捕获句子的内部结构和相关性。这里主要参考Attention is all you need这篇论文中的实现方式进行介绍。

论文中,具体实现的self-attention机制为 Scaled Dot-Product Attention.
在这里插入图片描述
在这里插入图片描述
自注意力机制计算过程可以归纳为如下步骤:

  1. 将输入单词转化成嵌入向量表示形式;
  2. 根据每个词的嵌入向量分别得到q, k, v三个向量;
  3. 计算score:这里采用点积的计算方式得到value对应的分数 s c o r e = q ⋅ k score = q \cdot k score=qk;
  4. 为了梯度的稳定,得到分数后,Transformer对每一个权重分数除以 d k \sqrt{d_k} dk ;
  5. softmax激活函数,得到value对应的权重分数;
  6. 结果与v进行点积,得到每个输入向量的评分v,最后对v进行求和得到最终的输出结果。
    李宏毅老师课件
    实际的计算过程中,q, k, v会以矩阵的形式进行计算,分别对应公式中的Q,K和V。
    实现代码如下:
class ScaledDotProductAttention(nn.Module):
	def __init__(self):
		super (ScaledDotProductAttention, self).__init__()
	
	def forward(self, Q, K, V, attn_mask):
		scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) #size:[batch_size, n_heads, len_q, len_k], 这里len_q和len_k相等,为序列长度
		scores.masked_fill_(attn_mask, -1e9)	#对mask中数值为1的位置,填充数值
		attn = nn.Softmax(dim=-1)(scores)
		context = torch.matmul(attn, V)
		return context, attn

3. Multi-Head Attention

多头注意力机制可以让网络模型关注到来自不同位置的不同表示子空间的信息,
多头注意力机制
在这里插入图片描述
在多头注意力机制中,Q,K,V的形状大小分别为:
Q: [len_q, d_model]
K: [len_k, d_model]
V: [len_k, d_model]
其中设定len_q=len_k,对应序列长度,d_model对应特征维度。
此外,上式中参数 W i Q , W i K , W i V W_i^Q, W_i^K, W_i^V WiQ,WiK,Wi

以下是使用PyTorch实现TransformerSelf-Attention的示例代码: ## Self-Attention ```python import torch import torch.nn as nn class SelfAttention(nn.Module): def __init__(self, embed_size, heads): super(SelfAttention, self).__init__() self.embed_size = embed_size self.heads = heads self.head_dim = embed_size // heads assert (self.head_dim * heads == embed_size), "Embed size needs to be divisible by heads" self.values = nn.Linear(self.head_dim, self.head_dim, bias=False) self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False) self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False) self.fc_out = nn.Linear(heads * self.head_dim, embed_size) def forward(self, values, keys, queries, mask): # Get number of training examples N = queries.shape[0] value_len, key_len, query_len = values.shape[1], keys.shape[1], queries.shape[1] # Split embedding into self.heads pieces values = values.reshape(N, value_len, self.heads, self.head_dim) keys = keys.reshape(N, key_len, self.heads, self.head_dim) queries = queries.reshape(N, query_len, self.heads, self.head_dim) # Transpose to get dimensions batch_size * self.heads * seq_len * self.head_dim values = values.permute(0, 2, 1, 3) keys = keys.permute(0, 2, 1, 3) queries = queries.permute(0, 2, 1, 3) # Calculate energy energy = torch.matmul(queries, keys.permute(0, 1, 3, 2)) if mask is not None: energy = energy.masked_fill(mask == 0, float("-1e20")) # Apply softmax to get attention scores attention = torch.softmax(energy / (self.embed_size ** (1/2)), dim=-1) # Multiply attention scores with values out = torch.matmul(attention, values) # Concatenate and linearly transform output out = out.permute(0, 2, 1, 3).reshape(N, query_len, self.heads * self.head_dim) out = self.fc_out(out) return out ``` ## Transformer ```python import torch import torch.nn as nn from torch.nn.modules.activation import MultiheadAttention class TransformerBlock(nn.Module): def __init__(self, embed_size, heads, dropout, forward_expansion): super(TransformerBlock, self).__init__() self.attention = MultiheadAttention(embed_dim=embed_size, num_heads=heads) self.norm1 = nn.LayerNorm(embed_size) self.norm2 = nn.LayerNorm(embed_size) self.feed_forward = nn.Sequential( nn.Linear(embed_size, forward_expansion * embed_size), nn.ReLU(), nn.Linear(forward_expansion * embed_size, embed_size) ) self.dropout = nn.Dropout(dropout) def forward(self, value, key, query, mask): attention_output, _ = self.attention(query, key, value, attn_mask=mask) x = self.dropout(self.norm1(attention_output + query)) forward_output = self.feed_forward(x) out = self.dropout(self.norm2(forward_output + x)) return out class Encoder(nn.Module): def __init__(self, src_vocab_size, embed_size, num_layers, heads, device, forward_expansion, dropout, max_length): super(Encoder, self).__init__() self.embed_size = embed_size self.device = device self.word_embedding = nn.Embedding(src_vocab_size, embed_size) self.position_embedding = nn.Embedding(max_length, embed_size) self.layers = nn.ModuleList([ TransformerBlock(embed_size, heads, dropout, forward_expansion) for _ in range(num_layers) ]) self.dropout = nn.Dropout(dropout) def forward(self, x, mask): N, seq_length = x.shape positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device) out = self.dropout(self.word_embedding(x) + self.position_embedding(positions)) for layer in self.layers: out = layer(out, out, out, mask) return out class DecoderBlock(nn.Module): def __init__(self, embed_size, heads, forward_expansion, dropout, device): super(DecoderBlock, self).__init__() self.norm = nn.LayerNorm(embed_size) self.attention = MultiheadAttention(embed_size, heads) self.transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion) self.dropout = nn.Dropout(dropout) def forward(self, x, value, key, src_mask, trg_mask): attention_output, _ = self.attention(x, x, x, attn_mask=trg_mask) query = self.dropout(self.norm(attention_output + x)) out = self.transformer_block(value, key, query, src_mask) return out class Decoder(nn.Module): def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length): super(Decoder, self).__init__() self.embed_size = embed_size self.device = device self.word_embedding = nn.Embedding(trg_vocab_size, embed_size) self.position_embedding = nn.Embedding(max_length, embed_size) self.layers = nn.ModuleList([ DecoderBlock(embed_size, heads, forward_expansion, dropout, device) for _ in range(num_layers) ]) self.fc_out = nn.Linear(embed_size, trg_vocab_size) self.dropout = nn.Dropout(dropout) def forward(self, x, enc_out, src_mask, trg_mask): N, seq_length = x.shape positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device) x = self.dropout(self.word_embedding(x) + self.position_embedding(positions)) for layer in self.layers: x = layer(x, enc_out, enc_out, src_mask, trg_mask) out = self.fc_out(x) return out ``` 这些代码可以用于实现TransformerSelf-Attention模型。但这只是示例,你需要根据你的数据和任务来调整这些代码中的各种超参数和结构。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Aidanmomo

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值