Seq2Seq_Attention

This post walks through a neural English-Spanish translation model built with TensorFlow and Keras, covering data preprocessing, model construction, training, and evaluation. The model uses an encoder-decoder architecture with an attention mechanism, which effectively improves translation quality.
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import numpy as np
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
print(sys.version_info)
2.0.0
sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)
for i in mpl,np,pd,sklearn,tf,keras:
    print(i.__name__,i.__version__)
matplotlib 3.1.3
numpy 1.18.1
pandas 1.0.3
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
## 1. Preprocess data
## 2. Build model
## 2.1 Encoder
## 2.2 Attention
## 2.3 Decoder
## 2.4 Loss & optimizer
## 2.5 Training
## 3. Evaluation
## 3.1 Given a sentence, return the translated result
## 3.2 Visualize the results (attention weights)
en_spa_file_path = './spa.txt'
en_spa_file_path
'./spa.txt'
import unicodedata
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')  # strip accents (drop combining marks)
en_sentence = 'Then what?'
sp_sentence = '¿Entonces qué?'
print(unicode_to_ascii(en_sentence))
print(unicode_to_ascii(sp_sentence))
Then what?
¿Entonces que?
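
What makes this work: NFD normalization splits an accented character into a base letter plus a combining mark (Unicode category 'Mn'), and the generator above simply drops those marks. A quick illustrative check:

```python
import unicodedata

# 'é' decomposes into a plain 'e' followed by a combining acute accent (category 'Mn')
for c in unicodedata.normalize('NFD', 'é'):
    print(repr(c), unicodedata.category(c))
```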
import re
def preprocess_sentence(s):
    s = unicode_to_ascii(s.lower().strip())
    # add a space before and after each punctuation mark
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    # collapse runs of spaces into a single space
    s = re.sub(r'[" "]+', " ", s)
    # replace everything except letters and the punctuation above with a space
    s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
    # remove leading and trailing spaces
    s = s.strip()
    s = '<start> ' + s + ' <end>'
    return s
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence))
<start> then what ? <end>
<start> ¿ entonces que ? <end>
def parse_data(filename):
    lines = open(filename,mode='r',encoding = 'UTF-8').read().strip().split('\n')
    sentence_pairs = [line.split('\t') for line in lines]
    preprocessed_sentence_pairs = [
        (preprocess_sentence(en),preprocess_sentence(sp)) for en,sp in sentence_pairs]
    return zip (*preprocessed_sentence_pairs)
a = [(1,2),(3,4),(5,6)]
c,d = zip(*a)
print(c,d)
(1, 3, 5) (2, 4, 6)
en_dataset,sp_dataset = parse_data(en_spa_file_path)
print(en_dataset[-1])
<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
print(sp_dataset[-1])
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>

Word-to-ID conversion: tokenizer

def tokenizer(lang):
    lang_tokenizer = keras.preprocessing.text.Tokenizer(
        num_words=None,filters = '',split=' ')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)  # convert texts to id sequences
    tensor = keras.preprocessing.sequence.pad_sequences(tensor, padding='post')  # pad at the end of each sequence
    return tensor,lang_tokenizer
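
As a quick illustration of what `tokenizer` returns, here is a minimal sketch on a tiny made-up corpus (the real inputs are the preprocessed sentences above):

```python
# Toy corpus, only to show the output format of tokenizer().
toy_corpus = ['<start> hola mundo <end>', '<start> hola <end>']
toy_tensor, toy_tok = tokenizer(toy_corpus)
print(toy_tok.word_index)  # word -> id mapping (more frequent words get smaller ids)
print(toy_tensor)          # id sequences, zero-padded at the end ('post')
```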
input_tensor,input_tokenizer = tokenizer(sp_dataset[0:30000])
output_tensor,output_tokenizer = tokenizer(en_dataset[0:30000])
input_tensor[0:1]
array([[6722,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]])
output_tensor[0:1]
array([[2224,    0,    0,    0,    0,    0,    0]])
def max_lenth(tensor):
    return max(len(t) for t in tensor)
max_lenth_input = max_lenth(input_tensor)
max_lenth_output = max_lenth(output_tensor)
print(max_lenth_input,max_lenth_output)
12 7
from sklearn.model_selection import train_test_split
input_train,input_eval,output_train,output_eval = train_test_split(input_tensor,output_tensor,test_size = 0.2)
len(input_train),len(input_eval),len(output_train),len(output_eval)
(24000, 6000, 24000, 6000)
## For every non-zero id in example, print the id and the word it maps to in the tokenizer
def convert(example,tokenizer):
    for t in example:
        if t != 0:
            print('%d-->%s' %(t,tokenizer.index_word[t]))
convert(input_train[0],input_tokenizer)
print()
7--><start>el
12-->no
39-->tiene
9853-->remordimiento.<end>
convert(output_train[0],output_tokenizer)
8--><start>he
50-->has
95-->no
4062-->remorse.<end>
batch_size = 64
epochs = 20
def make_dataset(input_tensor, output_tensor,
                 batch_size,epochs,shuffle):
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor, output_tensor))
    if shuffle:
        dataset = dataset.shuffle(30000)
    dataset = dataset.repeat(epochs).batch(
        batch_size,drop_remainder = True)
    return dataset
train_dataset = make_dataset(input_train, output_train,batch_size,epochs,True)
eval_dataset = make_dataset(input_eval, output_eval,batch_size,1,True)
for x,y in train_dataset.take(1):
    print (x.shape)
    print (y.shape)
(64, 12)
(64, 7)
embedding_units = 256
#embedding_dim = 20000
units = 1024
input_vocab_size = len(input_tokenizer.word_index)+1
output_vocab_size = len(output_tokenizer.word_index)+1
input_vocab_size
14551
output_vocab_size
8023

Encoder

class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_units, encoding_units, batch_size):  # encoding_units: size of the GRU state
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoding_units = encoding_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_units)
        self.gru = tf.keras.layers.GRU(self.encoding_units,
                                       return_sequences=True,  # return the output at every time step
                                       return_state=True,      # also return the final hidden state
                                       recurrent_initializer='glorot_uniform')  # recurrent weight initialization
    
    def call(self,x,hidden):
        x = self.embedding(x)
        output,state = self.gru(x,initial_state = hidden)
        return output,state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size,self.encoding_units))        
tf.keras.Model()
<tensorflow.python.keras.engine.training.Model at 0x1eacf1d9128>
encoder = Encoder(input_vocab_size,embedding_units,units,batch_size)
sample_hidden = encoder.initialize_hidden_state()
sample_hidden.shape
TensorShape([64, 1024])
sample_output, sample_hidden = encoder(x,sample_hidden)
print (sample_output.shape)
print (sample_hidden.shape)
(64, 12, 1024)
(64, 1024)

Attention
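
In symbols, the additive (Bahdanau) attention implemented below scores each encoder output against the current decoder state and builds the context vector as a weighted sum over the input length:

$$
e_t = V^{\top}\tanh\left(W_1 h_t^{enc} + W_2 h^{dec}\right),\qquad
\alpha_t = \mathrm{softmax}_t(e_t),\qquad
c = \sum_t \alpha_t\, h_t^{enc}
$$

The softmax runs over the time axis (axis 1), so `attention_weights` has shape (batch_size, length, 1) and the context vector has shape (batch_size, units), matching the shape comments in the code.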

class BahdanauAttention(keras.Model):
    def __init__(self,units):
        super (BahdanauAttention,self).__init__()
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)
        
    def call(self, decoder_hidden, encoder_outputs):  # forward pass
        #decoder_hidden.shape:(batch_size,units)
        #encoder_outputs.shape:(batch_size,length,units))
        decoder_hidden_with_time_axis = tf.expand_dims(decoder_hidden,1)
        
        # before V: (batch_size,length,units)
        # after V: (batch_size,length,1)
        score = self.V(
            tf.nn.tanh(
                self.W1(encoder_outputs)+self.W2(decoder_hidden_with_time_axis)))
        # shape:(batch_size,length,1)
        attention_weights = tf.nn.softmax(score,axis=1)
        # context_vector.shape: (batch_size, length, units)
        context_vector = attention_weights * encoder_outputs
        
        # context_vector.shape: (batch_size, units)
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        return context_vector,attention_weights
attention_model = BahdanauAttention(units = 10)
attention_result,attention_weights = attention_model(sample_hidden,sample_output)
attention_result.shape,attention_weights.shape
(TensorShape([64, 1024]), TensorShape([64, 12, 1]))

Decoder

class Decoder(keras.Model):
    def __init__(self, vocab_size, embedding_units, decoding_units, batch_size):
        super (Decoder,self).__init__()
        self.batch_size = batch_size
        self.decoding_units = decoding_units
        self.embedding = keras.layers.Embedding(vocab_size,embedding_units)
        self.gru = keras.layers.GRU(self.decoding_units,return_sequences=True,
                                    return_state=True,recurrent_initializer='glorot_uniform')
        self.fc = keras.layers.Dense(vocab_size)
        
        self.attention = BahdanauAttention(self.decoding_units)
    
    def call(self,x,hidden,encoding_outputs):
        #context_vector.shape :(batch_size,units)
        context_vector,attention_weights = self.attention(hidden,encoding_outputs)
        
        # before embedding: x.shape: (batch_size, 1)
        # after embedding:  x.shape: (batch_size, 1, embedding_units)
        x = self.embedding(x)
        combined_x = tf.concat([tf.expand_dims(context_vector,1),x],axis=-1)
        
        #output.shape : [batch_size,1,decoding_units]
        #state.shape: [batch_size,decoding_units]
        output,state = self.gru(combined_x)
        
        #output.shape : [batch_size,decoding_units]
        output = tf.reshape(output,(-1,output.shape[2]))
        
        #output.shape: [batch_size,vocab_size]
        output = self.fc(output)
        
        return output,state,attention_weights
decoder = Decoder(output_vocab_size,embedding_units,units,batch_size)
outputs = decoder(tf.random.uniform((batch_size,1)),sample_hidden,sample_output)
decoder_output,decoder_hidden,decoder_aw = outputs
decoder_output.shape,decoder_hidden.shape,decoder_aw.shape
(TensorShape([64, 8023]), TensorShape([64, 1024]), TensorShape([64, 12, 1]))

Loss & optimizer

optimizer = keras.optimizers.Adam()

loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')  # reduction='none' keeps per-example losses so we can mask the padding ourselves

def loss_function(real,pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # True for real tokens, False for padding (id 0)
    loss_ = loss_object(real,pred)
    
    mask = tf.cast(mask,dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)
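
A minimal sanity check of the padding mask on made-up values (the toy vocabulary size of 10 is arbitrary):

```python
# toy_real: two real tokens (ids 3 and 5) and one padded position (id 0)
toy_real = tf.constant([[3, 5, 0]])            # shape (batch=1, time=3)
toy_pred = tf.random.uniform((1, 3, 10))       # random logits over a toy vocabulary
print(loss_function(toy_real[:, 0], toy_pred[:, 0]).numpy())  # real token -> positive loss
print(loss_function(toy_real[:, 2], toy_pred[:, 2]).numpy())  # padded position -> 0.0
```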

Training

@tf.function  # compile the training step into a graph for speed
def train_step(inp,targ,encoding_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        encoding_outputs,encoding_hidden = encoder(inp,encoding_hidden)
        decoding_hidden = encoding_hidden
        
        # Teacher forcing, e.g. for the target "<start> i am here <end>":
        # 1. <start> -> i
        # 2. i -> am
        # 3. am -> here
        # 4. here -> <end>
        for t in range(0,targ.shape[1]-1):
            decoding_input = tf.expand_dims(targ[:,t],1)
            
            predictions , decoding_hidden, _ = decoder(decoding_input,decoding_hidden,encoding_outputs)
            loss += loss_function(targ[:,t+1],predictions)
    batch_loss = loss/int(targ.shape[0])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
epochs = 10
steps_per_epoch = len(input_tensor)//batch_size

for epoch in range(epochs):
    start = time.time()
    
    encoding_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch,(inp,targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp,targ,encoding_hidden)
        total_loss +=batch_loss
        
        if batch % 100 ==0:
            print('Epoch{} Batch{} Loss{:4f}'.format(epoch+1,batch,batch_loss.numpy()))
    
    print('Epoch{} Loss{:4f}'.format(epoch+1,total_loss/steps_per_epoch))
    print('Time take for 1 epoch{}sec\n'.format(time.time()-start))
Epoch1 Batch0 Loss0.431429
Epoch1 Batch100 Loss0.342188
Epoch1 Batch200 Loss0.326750
Epoch1 Batch300 Loss0.356125
Epoch1 Batch400 Loss0.340040
Epoch1 Loss0.332392
Time take for 1 epoch52.96938753128052sec

Epoch2 Batch0 Loss0.303429
Epoch2 Batch100 Loss0.345815
Epoch2 Batch200 Loss0.320014
Epoch2 Batch300 Loss0.339335
Epoch2 Batch400 Loss0.307118
Epoch2 Loss0.323753
Time take for 1 epoch39.86134743690491sec

Epoch3 Batch0 Loss0.320290
Epoch3 Batch100 Loss0.322101
Epoch3 Batch200 Loss0.292475
Epoch3 Batch300 Loss0.290332
Epoch3 Batch400 Loss0.267710
Epoch3 Loss0.295802
Time take for 1 epoch39.10341835021973sec

Epoch4 Batch0 Loss0.267328
Epoch4 Batch100 Loss0.275814
Epoch4 Batch200 Loss0.255140
Epoch4 Batch300 Loss0.242726
Epoch4 Batch400 Loss0.249246
Epoch4 Loss0.254090
Time take for 1 epoch41.05219793319702sec

Epoch5 Batch0 Loss0.225599
Epoch5 Batch100 Loss0.221862
Epoch5 Batch200 Loss0.233836
Epoch5 Batch300 Loss0.218250
Epoch5 Batch400 Loss0.209672
Epoch5 Loss0.218951
Time take for 1 epoch40.538575172424316sec

Epoch6 Batch0 Loss0.211435
Epoch6 Batch100 Loss0.191422
Epoch6 Batch200 Loss0.179851
Epoch6 Batch300 Loss0.194333
Epoch6 Batch400 Loss0.169027
Epoch6 Loss0.187684
Time take for 1 epoch42.59706950187683sec

Epoch7 Batch0 Loss0.178268
Epoch7 Batch100 Loss0.166835
Epoch7 Batch200 Loss0.179573
Epoch7 Batch300 Loss0.151579
Epoch7 Batch400 Loss0.156302
Epoch7 Loss0.158616
Time take for 1 epoch40.71410608291626sec

Epoch8 Batch0 Loss0.136262
Epoch8 Batch100 Loss0.139565
Epoch8 Batch200 Loss0.135129
Epoch8 Batch300 Loss0.132116
Epoch8 Batch400 Loss0.108007
Epoch8 Loss0.132370
Time take for 1 epoch42.345741987228394sec

Epoch9 Batch0 Loss0.114414
Epoch9 Batch100 Loss0.107000
Epoch9 Batch200 Loss0.110198
Epoch9 Batch300 Loss0.115807
Epoch9 Batch400 Loss0.102093
Epoch9 Loss0.106728
Time take for 1 epoch40.09575963020325sec

Epoch10 Batch0 Loss0.083485
Epoch10 Batch100 Loss0.088291
Epoch10 Batch200 Loss0.097636
Epoch10 Batch300 Loss0.081532
Epoch10 Batch400 Loss0.062543
Epoch10 Loss0.084491
Time take for 1 epoch41.05120515823364sec

Prediction

def evaluate(input_sentence):
    attention_matrix = np.zeros((max_lenth_output,max_lenth_input))
    input_sentence = preprocess_sentence(input_sentence)
    
    inputs = [input_tokenizer.word_index[token] for token in input_sentence.split(' ')]
    inputs = keras.preprocessing.sequence.pad_sequences([inputs],maxlen=max_lenth_input,padding='post')
    inputs = tf.convert_to_tensor(inputs)
    
    results = ''
    #encoding_hidden = encoder.initialize_hidden_state()
    encoding_hidden = tf.zeros((1,units))
    
    encoding_outputs , encoding_hidden = encoder(inputs,encoding_hidden)
    decoding_hidden = encoding_hidden
    
    # e.g. <start> -> A, then A -> B, B -> C, ... (each predicted word is fed back in as the next input)
    
    #decoding_input.shape:(1,1)
    decoding_input = tf.expand_dims([output_tokenizer.word_index['<start>']],0)
    for t in range(max_lenth_output):
        predictions,decoding_hidden,attention_weights = decoder(decoding_input,decoding_hidden,encoding_outputs)
        
        # attention_weights.shape: (batch_size, input_length, 1)
        attention_weights = tf.reshape(attention_weights,(-1,))
        attention_matrix[t] = attention_weights.numpy()
        
        # predictions.shape: (batch_size, vocab_size)
        predicted_id = tf.argmax(predictions[0]).numpy()
        
        results += output_tokenizer.index_word[predicted_id] + ' '
        
        if output_tokenizer.index_word[predicted_id] == '<end>':
            return results,input_sentence,attention_matrix
        
        decoding_input = tf.expand_dims([predicted_id],0)
    return results,input_sentence,attention_matrix

def plot_attention(attention_matrix,input_sentence,predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1,1,1)
    ax.matshow(attention_matrix,cmap = 'viridis')
    
    font_dict = {'fontsize':14}
    ax.set_xticklabels([''] + input_sentence,fontdict=font_dict,rotation = 90)
    ax.set_yticklabels([''] + predicted_sentence,fontdict=font_dict)
    plt.show()

def translate(input_sentence):
    results,input_sentence,attention_matrix = evaluate(input_sentence)
    
    print('Input: %s' %(input_sentence))
    print('predicted translation: %s' %(results))
    
    attention_matrix = attention_matrix[:len(results.split(' ')),:len(input_sentence.split(' '))]
    
    plot_attention(attention_matrix,input_sentence.split(' '),results.split(' '))
translate(u'¿Todavía estás en casa?')
Input: <start>¿todavia estas en casa?<end>
predicted translation: home?<end> now?<end> now?<end> now?<end> now?<end> now?<end> now?<end> 

[Attention heatmap for the example translation (output_61_1.png)]

