Text Classification: Building a Transformer Model
Reference for this article: https://blog.youkuaiyun.com/qq_44574333/article/details/109637755
Note: this series is only meant to help you quickly understand, learn, and independently use the relevant frameworks for deep-learning research; please supplement the theory on your own. The official classic examples of each framework are very well written and well worth studying. Once you fully understand an official example, a few modifications are usually enough to solve most common related tasks.
Abstract: build a Transformer model to solve a text classification problem.
1 Description
Overview: implement the Transformer as a Keras layer and use it to solve a text classification problem.
2 Setup
Import the required packages.
#! -*- coding: utf-8 -*-
# Note: the imports below target standalone Keras 2.x with the TensorFlow backend.
from __future__ import print_function
from keras import backend as K
from keras.engine.topology import Layer
from keras.preprocessing import sequence
from keras.datasets import imdb
from keras.layers import Embedding, Dropout, Dense
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.layers import *
from keras_layer_normalization import LayerNormalization  # third-party package: pip install keras-layer-normalization
3 Implement multi head self attention as a Keras layer
Here we implement multi-head self-attention as a Keras layer. Fig. 1 shows the overall structure of the Transformer and Fig. 2 gives a worked example; readers who have forgotten the Transformer or are unfamiliar with it should first review the theory. I would also recommend the clearest lecture on Transformer theory that I have come across.
The code below is much easier to digest after that lecture.
Fig. 1 Overall architecture of the Transformer
Fig. 2 A Transformer example
Points worth noting in the code below (a short shape sketch follows this list):
- MultiHeadSelfAttention: multi-head attention is simply several self-attention heads run in parallel, and each head computes self-attention = softmax(Q·K^T / sqrt(d_k))·V, where d_k is size_per_head.
- The many transpose operations (K.permute_dimensions here, the Keras equivalent of tf.transpose(attention, perm=[0, 2, 1, 3])): they only serve to line up tensor shapes for the subsequent matrix multiplications.
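Before reading the layer itself, here is a minimal NumPy sketch (illustration only, not part of the model code; all names are made up) of scaled dot-product attention plus the reshape/transpose used to split and merge the heads:
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, seq_len, nb_head, size_per_head = 2, 5, 8, 16
d_model = nb_head * size_per_head
Q = np.random.randn(batch, seq_len, d_model)
K_mat = np.random.randn(batch, seq_len, d_model)
V = np.random.randn(batch, seq_len, d_model)

def split_heads(x):
    # (batch, seq_len, d_model) -> (batch, nb_head, seq_len, size_per_head)
    return x.reshape(batch, seq_len, nb_head, size_per_head).transpose(0, 2, 1, 3)

Qh, Kh, Vh = split_heads(Q), split_heads(K_mat), split_heads(V)
# scaled dot-product attention per head: softmax(Q K^T / sqrt(d_k)) V
A = softmax(Qh @ Kh.transpose(0, 1, 3, 2) / np.sqrt(size_per_head))
O = A @ Vh                                    # (batch, nb_head, seq_len, size_per_head)
O = O.transpose(0, 2, 1, 3).reshape(batch, seq_len, d_model)
print(O.shape)                                # (2, 5, 128)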
class MultiHeadSelfAttention(Layer):
def __init__(self, nb_head, size_per_head, **kwargs):
self.nb_head = nb_head
self.size_per_head = size_per_head
self.output_dim = nb_head * size_per_head
super(MultiHeadSelfAttention, self).__init__(**kwargs)
def build(self, input_shape):
self.WQ = self.add_weight(name='WQ',
shape=(input_shape[0][-1], self.output_dim),
initializer='glorot_uniform',
trainable=True)
self.WK = self.add_weight(name='WK',
shape=(input_shape[1][-1], self.output_dim),
initializer='glorot_uniform',
trainable=True)
self.WV = self.add_weight(name='WV',
shape=(input_shape[2][-1], self.output_dim),
initializer='glorot_uniform',
trainable=True)
super(MultiHeadSelfAttention, self).build(input_shape)
def Mask(self, inputs, seq_len, mode='mul'):
        if seq_len is None:
return inputs
else:
mask = K.one_hot(seq_len[:, 0], K.shape(inputs)[1])
mask = 1 - K.cumsum(mask, 1)
for _ in range(len(inputs.shape) - 2):
mask = K.expand_dims(mask, 2)
if mode == 'mul':
return inputs * mask
if mode == 'add':
return inputs - (1 - mask) * 1e12
def call(self, x):
        # If only Q_seq, K_seq, V_seq are passed in, no mask is applied
        # If Q_seq, K_seq, V_seq, Q_len, V_len are all passed in, the padded (extra) part is masked
if len(x) == 3:
Q_seq, K_seq, V_seq = x
Q_len, V_len = None, None
elif len(x) == 5:
Q_seq, K_seq, V_seq, Q_len, V_len = x
        # Apply the linear projections to Q, K and V
Q_seq = K.dot(Q_seq, self.WQ)
Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
K_seq = K.dot(K_seq, self.WK)
K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
V_seq = K.dot(V_seq, self.WV)
V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))
        # Compute the dot-product attention scores, apply the mask, then softmax
A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
A = K.permute_dimensions(A, (0, 3, 2, 1))
A = self.Mask(A, V_len, 'add')
A = K.permute_dimensions(A, (0, 3, 2, 1))
A = K.softmax(A)
        # Compute the output and mask it
O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
O_seq = self.Mask(O_seq, Q_len, 'mul')
return O_seq
def compute_output_shape(self, input_shape):
return (input_shape[0][0], input_shape[0][1], self.output_dim)
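As a quick sanity check (a sketch only, using the symbolic Input already imported above): with nb_head=8 and size_per_head=16 the last output dimension is 8 * 16 = 128.
# Shape check (illustrative): the output's last dimension equals nb_head * size_per_head
x_in = Input(shape=(None, 128))
y_out = MultiHeadSelfAttention(8, 16)([x_in, x_in, x_in])
print(K.int_shape(y_out))   # (None, None, 128)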
4 Implement a Transformer block as a layer
TransformerBlock implements the left half of Fig. 1, i.e. the encoder side of the Transformer. Points worth noting in the code (a small NumPy illustration follows this list):
- The layer normalization in call: the Transformer uses LayerNormalization, not BatchNormalization. BatchNormalization normalizes each feature across the samples of a batch (statistics along axis 0), whereas LayerNormalization normalizes across the features of each individual sample.
- inputs + attn_output and out1 + ffn_output in call: these are the residual (Add) connections.
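To make the normalization axes concrete, a tiny NumPy illustration (not part of the model; the learned scale/shift parameters are omitted):
import numpy as np

x = np.random.randn(4, 10)                      # (batch, features)
# BatchNorm-style statistics: one mean/std per feature, computed across the 4 samples
bn = (x - x.mean(axis=0)) / (x.std(axis=0) + 1e-6)
# LayerNorm-style statistics: one mean/std per sample, computed across its 10 features
ln = (x - x.mean(axis=1, keepdims=True)) / (x.std(axis=1, keepdims=True) + 1e-6)
print(bn.shape, ln.shape)                       # (4, 10) (4, 10)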
# Custom TransformerBlock module: multi-head attention + feed-forward network, each followed by a residual connection and layer normalization
class TransformerBlock(Layer):
def __init__(self, nb_head, size_per_head, ff_dim=128, rate=0.1):
super(TransformerBlock, self).__init__()
self.att = MultiHeadSelfAttention(nb_head, size_per_head)
self.ffn = Sequential(
[Dense(ff_dim, activation="relu"), Dense(nb_head*size_per_head),]
)
self.layernorm1 = LayerNormalization(epsilon=1e-6)
self.layernorm2 = LayerNormalization(epsilon=1e-6)
self.dropout1 = Dropout(rate)
self.dropout2 = Dropout(rate)
def call(self, inputs, training=True):
attn_output = self.att([inputs, inputs, inputs])
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
5 Implement embedding layer
Positional embeddings based on sin/cos functions.
# Sinusoidal (sin/cos) positional embedding
class Position_Embedding(Layer):
def __init__(self, size=None, mode='sum', **kwargs):
        self.size = size  # must be an even number
self.mode = mode
super(Position_Embedding, self).__init__(**kwargs)
def call(self, x):
        if (self.size is None) or (self.mode == 'sum'):
self.size = int(x.shape[-1])
batch_size, seq_len = K.shape(x)[0], K.shape(x)[1]
        '''Compute the denominator inside the sin and cos functions'''
position_j = 1. / K.pow(10000., \
2 * K.arange(self.size / 2, dtype='float32') / self.size)
position_j = K.expand_dims(position_j, 0)
        '''Compute the numerator inside the sin and cos functions'''
        position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1  # K.arange does not support variable length, so positions are generated this way
position_i = K.expand_dims(position_i, 2)
        '''Combine numerator and denominator to get the arguments of sin and cos'''
position_ij = K.dot(position_i, position_j)
        '''Concatenate the two halves to obtain the positional encoding vector'''
position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
if self.mode == 'sum':
return position_ij + x
elif self.mode == 'concat':
return K.concatenate([position_ij, x], 2)
def compute_output_shape(self, input_shape):
if self.mode == 'sum':
return input_shape
elif self.mode == 'concat':
return (input_shape[0], input_shape[1], input_shape[2] + self.size)
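For intuition, the encoding computed by the layer can be reproduced in a few lines of NumPy for a fixed length (a check sketch only; seq_len and size here are example values matching the experiment below):
import numpy as np

seq_len, size = 80, 128
position_j = 1.0 / np.power(10000.0, 2 * np.arange(size // 2) / size)    # (size/2,)
position_i = np.arange(seq_len, dtype='float32')[:, None]                # (seq_len, 1)
angles = position_i * position_j[None, :]                                 # (seq_len, size/2)
pos_encoding = np.concatenate([np.cos(angles), np.sin(angles)], axis=-1)  # cos first, as in the layer above
print(pos_encoding.shape)   # (80, 128)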
Alternatively, you can build two separate embedding layers, one for the tokens and one for the token indices (positions).
TokenAndPositionEmbedding: unlike the original Transformer paper, which constructs the positional embedding from sin/cos functions, this version simply learns an embedding of the position index.
import tensorflow as tf                 # this alternative class relies on tf.keras
from tensorflow.keras import layers

class TokenAndPositionEmbedding(layers.Layer):
def __init__(self, maxlen, vocab_size, embed_dim):
super(TokenAndPositionEmbedding, self).__init__()
self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
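A quick usage sketch (illustration only; maxlen, vocab_size and embed_dim are placeholder values matching the experiment below):
emb_layer = TokenAndPositionEmbedding(maxlen=80, vocab_size=20000, embed_dim=128)
dummy_ids = tf.zeros((32, 80), dtype=tf.int32)   # a fake batch of padded token ids
print(emb_layer(dummy_ids).shape)                # (32, 80, 128)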
6 Download and prepare dataset
max_features = 20000
maxlen = 80
batch_size = 32
print('Loading data...')
'''The IMDB dataset contains 50,000 reviews from the Internet Movie Database: 25,000 for training
and 25,000 for testing, with positive and negative reviews each making up 50%; the labels are 1 and 0.
load_data() automatically loads the pre-split training and test sets:
    x_train, labels_train = f['x_train'], f['y_train']
    x_test, labels_test = f['x_test'], f['y_test']
num_words keeps only the num_words most frequent words of the training set.'''
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Pad sequences (samples x time)')
'''Pad/truncate every review to the same length maxlen'''
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
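Optionally, you can sanity-check the data by mapping the integer ids of a review back to words (a sketch only; by default load_data() shifts word indices by 3, reserving 0 for padding, 1 for start-of-sequence and 2 for out-of-vocabulary):
word_index = imdb.get_word_index()
id_to_word = {i + 3: w for w, i in word_index.items()}
id_to_word.update({0: '<pad>', 1: '<start>', 2: '<unk>'})
print(' '.join(id_to_word.get(i, '<unk>') for i in x_train[0]))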
7 Create classifier model using transformer layer
The Transformer layer outputs one vector for every timestep of the input sequence. Here we take the mean over all timesteps and feed it to a feed-forward network to perform the text classification.
S_inputs = Input(shape=(None,), dtype='int32')
'''Map the input token-id sequences to (randomly initialized, learned) embedding vectors'''
embeddings = Embedding(max_features, 128)(S_inputs)
embeddings = Position_Embedding()(embeddings)  # adding Position_Embedding slightly improves accuracy
# O_seq = MultiHeadSelfAttention(8,16)([embeddings,embeddings,embeddings])
O_seq = TransformerBlock(8,16)(embeddings)
O_seq = BatchNormalization()(O_seq)
O_seq = TransformerBlock(8,16)(O_seq)
O_seq = GlobalAveragePooling1D()(O_seq)  # the Transformer layers output one vector per timestep; take the mean over all timesteps
O_seq = Dropout(0.5)(O_seq)
outputs = Dense(1, activation='sigmoid')(O_seq)
model = Model(inputs=S_inputs, outputs=outputs)
# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.summary()
8 Train and Evaluate
Train the model and evaluate it on the test set.
print('Train...')
model.fit(x_train, y_train,
batch_size=batch_size,
epochs=5,
validation_data=(x_test, y_test))
The network structure and training output are as follows:
Using TensorFlow backend.
Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) (None, None) 0
_________________________________________________________________
embedding_1 (Embedding) (None, None, 128) 2560000
_________________________________________________________________
position__embedding_1 (Posit (None, None, 128) 0
_________________________________________________________________
transformer_block_1 (Transfo (None, None, 128) 0
_________________________________________________________________
batch_normalization_1 (Batch (None, None, 128) 512
_________________________________________________________________
transformer_block_2 (Transfo (None, None, 128) 0
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128) 0
_________________________________________________________________
dropout_5 (Dropout) (None, 128) 0
_________________________________________________________________
dense_5 (Dense) (None, 1) 129
=================================================================
Total params: 2,560,641
Trainable params: 2,560,385
Non-trainable params: 256
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
25000/25000 [==============================] - 243s 10ms/step - loss: 0.6753 - acc: 0.6006 - val_loss: 0.5253 - val_acc: 0.7857
Epoch 2/5
25000/25000 [==============================] - 221s 9ms/step - loss: 0.4461 - acc: 0.8016 - val_loss: 0.3850 - val_acc: 0.8318
Epoch 3/5
25000/25000 [==============================] - 223s 9ms/step - loss: 0.3391 - acc: 0.8584 - val_loss: 0.3517 - val_acc: 0.8447
Epoch 4/5
25000/25000 [==============================] - 222s 9ms/step - loss: 0.2818 - acc: 0.8869 - val_loss: 0.3460 - val_acc: 0.8486
Epoch 5/5
25000/25000 [==============================] - 221s 9ms/step - loss: 0.2426 - acc: 0.9066 - val_loss: 0.3481 - val_acc: 0.8483
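As a follow-up (not part of the original run), the trained model can be evaluated and used for prediction with a short sketch; the sigmoid output is the predicted probability that a review is positive:
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test loss:', score)
print('Test accuracy:', acc)
print(model.predict(x_test[:3]))   # probabilities for the first three test reviews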