1. The simplest attention
1.1 Formula
The formula is as follows:
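Written to match the code in 1.2 below, where q is the decoder query of shape [bs, hs] and K, V are the encoder outputs of shape [bs, sl, hs]:

$$\mathrm{score} = \mathrm{softmax}\left(qK^{\top}\right), \qquad c = \mathrm{score}\cdot V$$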
1.2 Code implementation
Below is a simplified version of Seq2Seq Attention.
import tensorflow as tf

def attention_layer(query, key, value):
    # query: [bs, hidden_size]; key/value: [bs, seq_len, hidden_size]
    query = tf.expand_dims(query, axis=1)  # [bs, 1, hs]
    # [bs, 1, hs] * [bs, hs, sl] -> [bs, 1, sl]
    score = tf.nn.softmax(tf.matmul(query, key, transpose_b=True), axis=-1)
    # [bs, 1, sl] * [bs, sl, hs] -> [bs, 1, hs]
    content = tf.matmul(score, value)
    content = tf.squeeze(content, axis=1)  # [bs, hs]
    return content
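A quick usage sketch (TF 1.x graph mode; the shapes bs=4, seq_len=20, hidden_size=128 are made up for illustration):

import numpy as np

query = tf.placeholder(tf.float32, shape=[None, 128])      # [bs, hs]
key = tf.placeholder(tf.float32, shape=[None, 20, 128])    # [bs, sl, hs]
value = tf.placeholder(tf.float32, shape=[None, 20, 128])  # [bs, sl, hs]
context = attention_layer(query, key, value)               # [bs, hs]

with tf.Session() as sess:
    out = sess.run(context, feed_dict={query: np.random.rand(4, 128).astype(np.float32),
                                       key: np.random.rand(4, 20, 128).astype(np.float32),
                                       value: np.random.rand(4, 20, 128).astype(np.float32)})
    print(out.shape)  # (4, 128)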
1.3 Conclusions
- tf.matmul()
  Matrix multiplication.
- tf.multiply()
  Element-wise multiplication (it does broadcast NumPy-style; the tf.tile below is only there to make the shapes explicit).
- The matrix multiplication here can be replaced by an element-wise multiplication plus a sum,
  but matmul is clearly less work. A quick numerical check of the equivalence follows the snippet below.
# [batch_size, 1, hidden_size] * [batch_size, hidden_size, seq_len] -> [batch_size, 1, seq_len]
tf.matmul(query, key, transpose_b=True)
is equivalent to the following:
query = tf.tile(query, [1, key.get_shape().as_list()[1], 1])  # [batch_size, seq_len, hidden_size]
score = tf.multiply(query, key)                               # [batch_size, seq_len, hidden_size]
score = tf.reduce_sum(score, axis=-1)                         # [batch_size, seq_len]
score = tf.expand_dims(score, axis=1)                         # [batch_size, 1, seq_len]
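A quick numerical check of this equivalence (a sketch, reusing the tf and np imports above; the shapes bs=2, sl=5, hs=8 are arbitrary):

q = tf.constant(np.random.rand(2, 1, 8).astype(np.float32))  # [bs, 1, hs]
k = tf.constant(np.random.rand(2, 5, 8).astype(np.float32))  # [bs, sl, hs]

a = tf.matmul(q, k, transpose_b=True)          # [2, 1, 5]
b = tf.tile(q, [1, 5, 1])                      # [2, 5, 8]
b = tf.reduce_sum(tf.multiply(b, k), axis=-1)  # [2, 5]
b = tf.expand_dims(b, axis=1)                  # [2, 1, 5]

with tf.Session() as sess:
    a_val, b_val = sess.run([a, b])
    print(np.allclose(a_val, b_val, atol=1e-5))  # True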
2. Multi-head self-attention
2.1 Formula
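In the notation of the code in 2.2 below, with h = num_heads heads and per-head size d_k = units / num_heads (the usual output projection after the concat is omitted, as it is in the code):

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V$$

$$\mathrm{head}_i = \mathrm{Attention}\!\left(QW_i^{Q},\ KW_i^{K},\ VW_i^{V}\right), \qquad \mathrm{MultiHead}(Q, K, V) = \mathrm{Concat}\!\left(\mathrm{head}_1, \ldots, \mathrm{head}_h\right)$$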
2.2 Code implementation
Since this is multi-head attention, look at the formula: it really just splits the weight matrix W into several pieces, one per head.
def multi_self_attention(Q, K, V, units, num_heads):
    """
    :param Q: [bs, seq_len, hidden_size]
    :param K: [bs, seq_len, hidden_size]
    :param V: [bs, seq_len, hidden_size]
    :param units: total projection size, must be divisible by num_heads
    :param num_heads: number of attention heads
    :return: [bs, seq_len, units]
    """
    with tf.name_scope(name="multi_head"):
        Q = tf.layers.dense(inputs=Q, units=units, activation=tf.nn.relu)  # [bs, seq_len, units]
        K = tf.layers.dense(inputs=K, units=units, activation=tf.nn.relu)
        V = tf.layers.dense(inputs=V, units=units, activation=tf.nn.relu)
        # Split Q, K, V into num_heads pieces along the last dimension.
        # [bs, seq_len, num_heads, units/num_heads]
        Q = tf.reshape(Q, [-1, Q.get_shape().as_list()[1], num_heads, units // num_heads])
        K = tf.reshape(K, [-1, K.get_shape().as_list()[1], num_heads, units // num_heads])
        V = tf.reshape(V, [-1, V.get_shape().as_list()[1], num_heads, units // num_heads])
        Q = tf.transpose(Q, [0, 2, 1, 3])  # [batch_size, num_heads, seq_len, units/num_heads]
        K = tf.transpose(K, [0, 2, 1, 3])
        V = tf.transpose(V, [0, 2, 1, 3])
    with tf.name_scope(name="self-attention"):
        score = tf.matmul(Q, K, transpose_b=True)  # [bs, num_heads, seq_len, seq_len]
        score /= tf.math.sqrt(tf.cast(Q.get_shape().as_list()[-1], tf.float32))
        score = tf.nn.softmax(score, axis=-1)
        content = tf.matmul(score, V)  # [bs, num_heads, seq_len, units/num_heads]
        content = tf.transpose(content, [0, 2, 1, 3])  # [bs, seq_len, num_heads, units/num_heads]
        content = tf.reshape(content, [-1, content.get_shape().as_list()[1], units])  # [bs, seq_len, units]
    return content
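A usage sketch (TF 1.x; seq_len=20, hidden_size=128, units=64, num_heads=8 are arbitrary choices for illustration):

X = tf.placeholder(tf.float32, shape=[None, 20, 128])  # [bs, seq_len, hidden_size]
# Self-attention: Q, K and V are all the same sequence.
context = multi_self_attention(X, X, X, units=64, num_heads=8)
print(context.get_shape().as_list())  # [None, 20, 64]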
2.3 Conclusions
- Put the dimensions that do not take part in the computation first.
  Matrix multiplication only uses the last two dimensions, so for [batch_size, seq_len, hidden_size, num_head], consider moving num_head forward to get [batch_size, num_head, seq_len, hidden_size].
- Use tf.shape() less; prefer .get_shape().as_list().
  Don't pass the result of tf.shape() into tf.reshape(); it causes problems, because tf.shape() only yields a value when the graph is run (see the sketch after the snippet below).
input = tf.placeholder(tf.float32, shape=[None, 3, 4])
tf.shape(input)[1]           # the result is ?: tf.shape() returns a Tensor object, so there is no value until it is evaluated
input.get_shape().as_list()  # the result is [None, 3, 4]
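A minimal sketch (continuing with the placeholder input and the tf / np imports above) showing that tf.shape() only yields a value when the graph is run, while .get_shape() is available while building the graph:

dynamic_len = tf.shape(input)[1]
print(dynamic_len)                     # e.g. Tensor("strided_slice:0", shape=(), dtype=int32) -- no concrete value yet
print(input.get_shape().as_list()[1])  # 3 -- known while building the graph

with tf.Session() as sess:
    print(sess.run(dynamic_len, feed_dict={input: np.zeros((2, 3, 4), dtype=np.float32)}))  # 3 -- only known at run time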