1. In the code, the output of multi-head attention, multihead_attention(X), has the same shape as X, which is exactly what makes the residual addition straightforward. The multi-head attention and (scaled dot-product) attention code is given below. In that code, w_q, w_k and w_v all have shape (d_model, d_model); after Q, K and V are computed, the d_model dimension is split into (h, d_k = d_model // h) heads and finally concatenated back to d_model. Note that K and V always have the same shape, while Q's shape does not necessarily match that of K and V.
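Before the full code, a minimal sketch of the split-and-merge described above (the sizes and tensor names are illustrative only, not taken from the original code), confirming that the shape of X is preserved:

import torch

batch, L, h, d_model = 2, 5, 8, 512   # illustrative sizes
d_k = d_model // h
x = torch.randn(batch, L, d_model)

# split d_model into h heads of size d_k and move the head dimension forward
x_heads = x.view(batch, L, h, d_k).transpose(1, 2)            # (batch, h, L, d_k)
# merge the heads back: the inverse of the split above
x_merged = x_heads.transpose(1, 2).contiguous().view(batch, L, d_model)

assert x_merged.shape == x.shape   # multi-head attention keeps X's shape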
import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


def attention(query, key, value, mask=None, dropout=None):
    """
    Scaled Dot-Product Attention
    :param query: the input multiplied by the Q projection, size = (batch, h, L, d_model//h)
    :param key: the input multiplied by the K projection, same layout as query
                (its L may differ from query's in encoder-decoder attention)
    :param value: the input multiplied by the V projection, same size as key
    :param mask: mask matrix
    :param dropout: dropout layer
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    # QK^T / sqrt(d_k), size = (batch, h, L, L)
    if mask is not None:
        # mask matrix: encoder mask size = [batch, 1, 1, src_L]
        # decoder mask size = [batch, 1, tgt_L, tgt_L]
        scores = scores.masked_fill(mask, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    # softmax over the last dimension (the innermost rows), size = (batch, h, L, L)
    if dropout is not None:
        p_attn = dropout(p_attn)
    # multiply by V: the first output has size (batch, h, L, d_model//h),
    # the second output (the attention weights) has size (batch, h, L, L)
    return torch.matmul(p_attn, value), p_attn
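A short usage sketch of the attention function above, with random tensors and illustrative sizes, to check the shapes stated in the comments; the future mask here is just one possible mask and follows the convention of the code above (True marks positions to be masked out):

# assumes the imports and the attention() function defined above
batch, h, L, d_k = 2, 8, 5, 64        # illustrative sizes
q = torch.randn(batch, h, L, d_k)
k = torch.randn(batch, h, L, d_k)
v = torch.randn(batch, h, L, d_k)

# boolean mask, True at positions to be masked out (a decoder-style "future" mask)
mask = torch.triu(torch.ones(L, L), diagonal=1).bool().view(1, 1, L, L)

out, p_attn = attention(q, k, v, mask=mask)
print(out.shape)      # torch.Size([2, 8, 5, 64]) -> (batch, h, L, d_k)
print(p_attn.shape)   # torch.Size([2, 8, 5, 5])  -> (batch, h, L, L)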
def clones(module, N):
    "Helper function: create N identical (independently deep-copied) modules."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
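A quick check (with illustrative sizes) of what clones returns: because it deep-copies the module, each of the N copies gets its own parameters instead of sharing one Linear layer:

layers = clones(nn.Linear(512, 512), 4)        # 4 independent Linear layers
print(len(layers))                             # 4
print(layers[0].weight is layers[1].weight)    # False: deepcopy means no parameter sharing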
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout):
        """
        Multi-head attention
        :param h: number of heads
        :param d_model: word embedding dimension
        :param dropout: dropout rate
        """
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # the word embedding dimension must be divisible by h
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h  # number of heads
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # four linear transforms: the first three are the Q, K, V projection matrices,
        # the last one is applied to the output after attention
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """
        :param query: the input x, i.e. (word embedding + positional embedding), size = [batch, L, d_model];
                      note that the encoder and decoder inputs may have different L
        :param key: same as above, same size
        :param value: same as above, same size
        :param mask: mask matrix; encoder mask size =