A TensorFlow implementation of mask softmax

This post looks at a problem with the stock TensorFlow softmax: values at masked positions still take part in the computation. It proposes an improved mask_softmax that pushes the weights at masked positions essentially to zero, and uses an example to show how the new version excludes the masked entries and yields the expected softmax output.


The stock TensorFlow implementation (the `call` method of `tf.keras.layers.Softmax`) works by mapping each masked position to a very large negative number and adding that to the original logits:

    # mask is 1.0 at positions to keep and 0.0 at positions to drop.
    adder = (1.0 - tf.cast(mask, inputs.dtype)) * (
        _large_compatible_negative(inputs.dtype))

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    inputs += adder
    if isinstance(self.axis, (tuple, list)):
        if len(self.axis) > 1:
            return tf.exp(inputs - tf.reduce_logsumexp(
                inputs, axis=self.axis, keepdims=True))
        else:
            return backend.softmax(inputs, axis=self.axis[0])
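
To make the trick concrete, here is a minimal sketch (assuming TF 2.x; the `-1e9` constant stands in for `_large_compatible_negative`, and the mask is 1.0 at kept positions, 0.0 at masked ones):

```python
import tensorflow as tf

logits = tf.constant([[4.0, -1.0, -1.0]])
mask = tf.constant([[1.0, 0.0, 0.0]])  # 1 = keep, 0 = masked

adder = (1.0 - mask) * -1e9             # 0 at kept slots, -1e9 at masked slots
masked_logits = logits + adder          # masked logits become hugely negative,
probs = tf.nn.softmax(masked_logits)    # but they still sit inside the softmax sum
```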

In my test, however, the masked positions still end up participating in the softmax computation. For example:

    import tensorflow as tf
    from keras import backend as K

    key = tf.convert_to_tensor([[1, 2, 3], [4, -1, -1], [3, 1, -1]], dtype=K.floatx())  # shape (3, 3); -1 marks padding
    mask = tf.cast(tf.not_equal(key, -1), K.floatx())  # 1.0 where the value is kept, 0.0 where it is masked

The softmax output computed this way is:

[[0.09003057 0.24472848 0.66524094]
 [0.9867033  0.00664835 0.00664835]
 [0.8668133  0.11731042 0.01587624]]

The masked positions still receive non-zero weight, which is not ideal, so here is a more thorough mask softmax that drives the weight at each masked position essentially to zero:

    from keras import backend as K

    def mask_softmax(x, mask, axis=-1):
        # Subtract the per-row max for numerical stability, as in a standard softmax.
        x_max = K.max(x, axis=axis, keepdims=True)
        e = K.exp(x - x_max)
        # Zero out the exponentials at masked positions so they drop out of the sum.
        e_mask = e * mask
        masked_sums = K.sum(e_mask, axis=axis, keepdims=True) + K.epsilon()
        return e_mask / masked_sums

    my_softmax = mask_softmax(key, mask, axis=-1)

The corresponding output:

[[0.09003057 0.24472846 0.6652409 ]
 [0.9999999  0.         0.        ]
 [0.88079697 0.1192029  0.        ]]
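
The masked entries are now exactly zero and each row still sums to 1 (up to K.epsilon()). For completeness, here is a minimal sketch of the same idea written with plain TensorFlow ops instead of the Keras backend; it assumes TF 2.x, and the name tf_mask_softmax is mine:

```python
import tensorflow as tf

def tf_mask_softmax(x, mask, axis=-1):
    """Masked softmax with plain tf ops: exponentials at masked positions are zeroed out."""
    x_max = tf.reduce_max(x, axis=axis, keepdims=True)
    e = tf.exp(x - x_max) * tf.cast(mask, x.dtype)  # drop masked positions from the sum
    return e / (tf.reduce_sum(e, axis=axis, keepdims=True) + 1e-9)

# tf_mask_softmax(key, mask) should match my_softmax above up to the epsilon term.
```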

Here is a simple TensorFlow implementation of a Transformer encoder, for reference only (note the `mask * -1e9` line in `scaled_dot_product_attention`, which is the same masking-by-large-negative trick discussed above):

```python
import numpy as np  # needed by positional_encoding / get_angles
import tensorflow as tf


class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)
        return output, attention_weights


def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)
    return output, attention_weights


class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2


def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])


class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
        return x


def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    # apply sin to even indices in the array; 2i
    sines = np.sin(angle_rads[:, 0::2])
    # apply cos to odd indices in the array; 2i+1
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    pos_encoding = pos_encoding[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)


def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates


class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Transformer, self).__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, maximum_position_encoding, rate)
        self.final_layer = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inp, training, enc_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)
        final_output = self.final_layer(enc_output)
        return final_output
```

This model includes the Transformer self-attention and feed-forward layers and can be used as an encoder; modify and extend it as needed.
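
A quick smoke test of the encoder above; the hyperparameters and the create_padding_mask helper are my own choices for illustration (in this code a mask value of 1 marks a padded position, matching the `mask * -1e9` line in `scaled_dot_product_attention`):

```python
import numpy as np
import tensorflow as tf

def create_padding_mask(seq):
    # 1.0 where the token id is 0 (treated as padding), shaped to broadcast over the attention logits.
    mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]  # (batch, 1, 1, seq_len)

model = Transformer(num_layers=2, d_model=64, num_heads=4, dff=128,
                    input_vocab_size=1000, maximum_position_encoding=100)

tokens = tf.constant(np.random.randint(1, 1000, size=(8, 20)))
out = model(tokens, training=False, enc_padding_mask=create_padding_mask(tokens))
print(out.shape)  # (8, 20, 1)
```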