用tensorflow的layers.Layer模块改写class SelfAttention(nn.Module): def init(self,in_c,out_c,fm_sz,pos_bias = False): super(SelfAttention,self).init() self.w_q = nn.Conv2d(in_channels = in_c,out_channels = out_c,kernel_size = 1) self.w_k = nn.Conv2d(in_channels = in_c,out_channels = out_c,kernel_size = 1) self.w_v = nn.Conv2d(in_channels = in_c,out_channels = out_c,kernel_size = 1) self.pos_code = self.__getPosCode(fm_sz,out_c) self.softmax = nn.Softmax(dim = 2) self.pos_bias = pos_bias def __getPosCode(self,fm_sz,out_c): x = [] for i in range(fm_sz): x.append([np.sin,np.cos][i % 2](1 / (10000 ** (i // 2 / fm_sz)))) x = torch.from_numpy(np.array([x])).float() return torch.cat([(x + x.t()).unsqueeze(0) for i in range(out_c)]) def forward(self,x): q,k,v = self.w_q(x),self.w_k(x),self.w_v(x) pos_code = torch.cat([self.pos_code.unsqueeze(0) for i in range(x.shape[0])]).to(x.device) if self.pos_bias: att_map = torch.matmul(q,k.permute(0,1,3,2)) + pos_code else: att_map = torch.matmul(q,k.permute(0,1,3,2)) + torch.matmul(q,pos_code.permute(0,1,3,2)) am_shape = att_map.shape att_map = self.softmax(att_map.view(am_shape[0],am_shape[1],am_shape[2] * am_shape[3])).view(am_shape) return att_map * v