import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import numpy as np
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
2.0.0
sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
matplotlib 3.1.3
numpy 1.18.1
pandas 1.0.3
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
en_spa_file_path = './spa.txt'
en_spa_file_path
'./spa.txt'
import unicodedata

def unicode_to_ascii(s):
    # Strip accents: decompose to NFD form, then drop the combining marks ('Mn').
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')
en_sentence = 'Then what?'
sp_sentence = '¿Entonces qué?'
print(unicode_to_ascii(en_sentence))
print(unicode_to_ascii(sp_sentence))
Then what?
¿Entonces que?
import re

def preprocess_sentence(s):
    s = unicode_to_ascii(s.lower().strip())
    # Put spaces around punctuation so each mark becomes its own token.
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    # Collapse runs of quotes/spaces into a single space.
    s = re.sub(r'[" "]+', " ", s)
    # Replace any character that is not a letter or kept punctuation with a space.
    s = re.sub(r'[^a-zA-Z?.!,¿]', " ", s)
    s = s.strip()
    s = '<start> ' + s + ' <end>'
    return s
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence))
<start> then what ? <end>
<start> ¿ entonces que ? <end>
def parse_data(filename):
    lines = open(filename, mode='r', encoding='UTF-8').read().strip().split('\n')
    sentence_pairs = [line.split('\t') for line in lines]
    preprocessed_sentence_pairs = [
        (preprocess_sentence(en), preprocess_sentence(sp)) for en, sp in sentence_pairs]
    # Unzip the pairs into one tuple of English sentences and one of Spanish sentences.
    return zip(*preprocessed_sentence_pairs)
a = [(1, 2), (3, 4), (5, 6)]
c, d = zip(*a)
print(c, d)
(1, 3, 5) (2, 4, 6)
en_dataset, sp_dataset = parse_data(en_spa_file_path)
print(en_dataset[-1])
<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
print(sp_dataset[-1])
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>
Word-to-ID conversion: tokenizer
def tokenizer(lang):
    lang_tokenizer = keras.preprocessing.text.Tokenizer(
        num_words=None, filters='', split=' ')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    # Pad every sequence with trailing zeros to the length of the longest one.
    tensor = keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer
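As a quick, self-contained illustration (the two-sentence corpus below is made up purely for demonstration), this is what the same Tokenizer settings and pad_sequences produce:
from tensorflow import keras

toy = ['<start> hola . <end>', '<start> hola amigo . <end>']
toy_tokenizer = keras.preprocessing.text.Tokenizer(num_words=None, filters='', split=' ')
toy_tokenizer.fit_on_texts(toy)
print(toy_tokenizer.word_index)      # word -> integer id; more frequent words get smaller ids
seqs = toy_tokenizer.texts_to_sequences(toy)
print(keras.preprocessing.sequence.pad_sequences(seqs, padding='post'))
# The shorter sentence is padded with trailing zeros ('post').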
input_tensor, input_tokenizer = tokenizer(sp_dataset[0:30000])
output_tensor, output_tokenizer = tokenizer(en_dataset[0:30000])
input_tensor[0:1]
array([[6722, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0]])
output_tensor[0:1]
array([[2224, 0, 0, 0, 0, 0, 0]])
def max_length(tensor):
    return max(len(t) for t in tensor)

max_length_input = max_length(input_tensor)
max_length_output = max_length(output_tensor)
print(max_length_input, max_length_output)
12 7
from sklearn.model_selection import train_test_split

input_train, input_eval, output_train, output_eval = train_test_split(
    input_tensor, output_tensor, test_size=0.2)
len(input_train), len(input_eval), len(output_train), len(output_eval)
(24000, 6000, 24000, 6000)
def convert(example, tokenizer):
    # Print each non-padding token id and the word it maps back to.
    for t in example:
        if t != 0:
            print('%d-->%s' % (t, tokenizer.index_word[t]))

convert(input_train[0], input_tokenizer)
print()
7--><start>el
12-->no
39-->tiene
9853-->remordimiento.<end>
convert(output_train[0], output_tokenizer)
8--><start>he
50-->has
95-->no
4062-->remorse.<end>
batch_size = 64
epochs = 20

def make_dataset(input_tensor, output_tensor,
                 batch_size, epochs, shuffle):
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor, output_tensor))
    if shuffle:
        dataset = dataset.shuffle(30000)
    # drop_remainder discards the final incomplete batch so every batch
    # holds exactly batch_size examples.
    dataset = dataset.repeat(epochs).batch(
        batch_size, drop_remainder=True)
    return dataset
train_dataset = make_dataset(input_train, output_train, batch_size, epochs, True)
eval_dataset = make_dataset(input_eval, output_eval, batch_size, 1, True)

for x, y in train_dataset.take(1):
    print(x.shape)
    print(y.shape)
(64, 12)
(64, 7)
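A minimal, standalone illustration of what drop_remainder does, independent of the translation data:
import tensorflow as tf

ds = tf.data.Dataset.range(10)
print(len(list(ds.batch(3))))                        # 4 batches; the last one holds a single element
print(len(list(ds.batch(3, drop_remainder=True))))   # 3 batches, all of size 3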
embedding_units = 256
units = 1024
input_vocab_size = len(input_tokenizer.word_index) + 1
output_vocab_size = len(output_tokenizer.word_index) + 1
input_vocab_size
14551
output_vocab_size
8023
Encoder
class Encoder(keras.Model):
    def __init__(self, vocab_size, embedding_units, encoding_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoding_units = encoding_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_units)
        self.gru = keras.layers.GRU(self.encoding_units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        # output: the hidden state at every timestep; state: the final hidden state.
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.encoding_units))
encoder = Encoder(input_vocab_size, embedding_units, units, batch_size)
sample_hidden = encoder.initialize_hidden_state()
sample_hidden.shape
TensorShape([64, 1024])
sample_output, sample_hidden = encoder(x, sample_hidden)
print(sample_output.shape)
print(sample_hidden.shape)
(64, 12, 1024)
(64, 1024)
Attention
class BahdanauAttention(keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)

    def call(self, decoder_hidden, encoder_outputs):
        # (batch, units) -> (batch, 1, units) so it broadcasts over the time axis.
        decoder_hidden_with_time_axis = tf.expand_dims(decoder_hidden, 1)
        # score shape: (batch, input_length, 1)
        score = self.V(
            tf.nn.tanh(
                self.W1(encoder_outputs) + self.W2(decoder_hidden_with_time_axis)))
        # Normalize the scores over the time axis.
        attention_weights = tf.nn.softmax(score, axis=1)
        # Weight each encoder output by its attention weight, then sum over time.
        context_vector = attention_weights * encoder_outputs
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
attention_model = BahdanauAttention(units=10)
attention_result, attention_weights = attention_model(sample_hidden, sample_output)
attention_result.shape, attention_weights.shape
(TensorShape([64, 1024]), TensorShape([64, 12, 1]))
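For reference, the computation above is standard additive (Bahdanau) attention: writing h_t for the encoder output at time t and s for the current decoder hidden state,

e_t = v^{\top} \tanh\bigl(W_1 h_t + W_2 s\bigr), \qquad
\alpha_t = \operatorname{softmax}_t(e_t), \qquad
c = \sum_t \alpha_t \, h_t,

where c is the context vector handed to the decoder and the \alpha_t are the attention weights that get plotted during prediction.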
Decoder
class Decoder(keras.Model):
    def __init__(self, vocab_size, embedding_units, decoding_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.decoding_units = decoding_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_units)
        self.gru = keras.layers.GRU(self.decoding_units, return_sequences=True,
                                    return_state=True, recurrent_initializer='glorot_uniform')
        self.fc = keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.decoding_units)

    def call(self, x, hidden, encoding_outputs):
        # context_vector shape: (batch, encoding_units)
        context_vector, attention_weights = self.attention(hidden, encoding_outputs)
        # x: (batch, 1) -> (batch, 1, embedding_units)
        x = self.embedding(x)
        # Concatenate the context vector with the embedded input token.
        combined_x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(combined_x)
        # (batch, 1, decoding_units) -> (batch, decoding_units)
        output = tf.reshape(output, (-1, output.shape[2]))
        # Project onto vocabulary logits.
        output = self.fc(output)
        return output, state, attention_weights
decoder = Decoder(output_vocab_size, embedding_units, units, batch_size)
outputs = decoder(tf.random.uniform((batch_size, 1)), sample_hidden, sample_output)
decoder_output, decoder_hidden, decoder_aw = outputs
decoder_output.shape, decoder_hidden.shape, decoder_aw.shape
(TensorShape([64, 8023]), TensorShape([64, 1024]), TensorShape([64, 12, 1]))
Loss and optimizer
optimizer = keras.optimizers.Adam()
loss_object = keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    # Mask positions where the target is the padding id 0 so they add no loss.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)
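A minimal standalone check of the masking behaviour, using made-up labels and random logits (illustrative only, with the same loss settings as above):
import tensorflow as tf

real = tf.constant([[3, 5, 0, 0]])                   # the trailing zeros are padding
pred = tf.random.uniform((1, 4, 10))                 # (batch, time, vocab) logits
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
per_token = loss_obj(real, pred)                     # per-position loss, shape (1, 4)
mask = tf.cast(tf.math.not_equal(real, 0), per_token.dtype)
print(per_token * mask)                              # padded positions contribute exactly 0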
Training
@tf.function
def train_step(inp, targ, encoding_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        encoding_outputs, encoding_hidden = encoder(inp, encoding_hidden)
        decoding_hidden = encoding_hidden
        # Teacher forcing: feed the ground-truth token at step t and
        # predict the token at step t + 1.
        for t in range(0, targ.shape[1] - 1):
            decoding_input = tf.expand_dims(targ[:, t], 1)
            predictions, decoding_hidden, _ = decoder(
                decoding_input, decoding_hidden, encoding_outputs)
            loss += loss_function(targ[:, t + 1], predictions)
    batch_loss = loss / int(targ.shape[0])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
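With the masking inside loss_function, the quantity that gets differentiated is roughly

\mathcal{L} = \sum_{t=1}^{T-1} \frac{1}{B} \sum_{b=1}^{B} m_{b,\,t+1} \, \mathrm{CE}\bigl(y_{b,\,t+1}, \hat{y}_{b,\,t+1}\bigr),

where B is the batch size, T the padded target length, m the padding mask, and CE the sparse categorical cross-entropy on logits. The reported batch_loss divides this by B once more for readability; the gradients are taken on \mathcal{L} itself.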
epochs = 10
steps_per_epoch = len(input_tensor) // batch_size

for epoch in range(epochs):
    start = time.time()
    encoding_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, encoding_hidden)
        total_loss += batch_loss
        if batch % 100 == 0:
            print('Epoch{} Batch{} Loss{:4f}'.format(epoch + 1, batch, batch_loss.numpy()))
    print('Epoch{} Loss{:4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time take for 1 epoch{}sec\n'.format(time.time() - start))
Epoch1 Batch0 Loss0.431429
Epoch1 Batch100 Loss0.342188
Epoch1 Batch200 Loss0.326750
Epoch1 Batch300 Loss0.356125
Epoch1 Batch400 Loss0.340040
Epoch1 Loss0.332392
Time take for 1 epoch52.96938753128052sec
Epoch2 Batch0 Loss0.303429
Epoch2 Batch100 Loss0.345815
Epoch2 Batch200 Loss0.320014
Epoch2 Batch300 Loss0.339335
Epoch2 Batch400 Loss0.307118
Epoch2 Loss0.323753
Time take for 1 epoch39.86134743690491sec
Epoch3 Batch0 Loss0.320290
Epoch3 Batch100 Loss0.322101
Epoch3 Batch200 Loss0.292475
Epoch3 Batch300 Loss0.290332
Epoch3 Batch400 Loss0.267710
Epoch3 Loss0.295802
Time take for 1 epoch39.10341835021973sec
Epoch4 Batch0 Loss0.267328
Epoch4 Batch100 Loss0.275814
Epoch4 Batch200 Loss0.255140
Epoch4 Batch300 Loss0.242726
Epoch4 Batch400 Loss0.249246
Epoch4 Loss0.254090
Time take for 1 epoch41.05219793319702sec
Epoch5 Batch0 Loss0.225599
Epoch5 Batch100 Loss0.221862
Epoch5 Batch200 Loss0.233836
Epoch5 Batch300 Loss0.218250
Epoch5 Batch400 Loss0.209672
Epoch5 Loss0.218951
Time take for 1 epoch40.538575172424316sec
Epoch6 Batch0 Loss0.211435
Epoch6 Batch100 Loss0.191422
Epoch6 Batch200 Loss0.179851
Epoch6 Batch300 Loss0.194333
Epoch6 Batch400 Loss0.169027
Epoch6 Loss0.187684
Time take for 1 epoch42.59706950187683sec
Epoch7 Batch0 Loss0.178268
Epoch7 Batch100 Loss0.166835
Epoch7 Batch200 Loss0.179573
Epoch7 Batch300 Loss0.151579
Epoch7 Batch400 Loss0.156302
Epoch7 Loss0.158616
Time take for 1 epoch40.71410608291626sec
Epoch8 Batch0 Loss0.136262
Epoch8 Batch100 Loss0.139565
Epoch8 Batch200 Loss0.135129
Epoch8 Batch300 Loss0.132116
Epoch8 Batch400 Loss0.108007
Epoch8 Loss0.132370
Time take for 1 epoch42.345741987228394sec
Epoch9 Batch0 Loss0.114414
Epoch9 Batch100 Loss0.107000
Epoch9 Batch200 Loss0.110198
Epoch9 Batch300 Loss0.115807
Epoch9 Batch400 Loss0.102093
Epoch9 Loss0.106728
Time take for 1 epoch40.09575963020325sec
Epoch10 Batch0 Loss0.083485
Epoch10 Batch100 Loss0.088291
Epoch10 Batch200 Loss0.097636
Epoch10 Batch300 Loss0.081532
Epoch10 Batch400 Loss0.062543
Epoch10 Loss0.084491
Time take for 1 epoch41.05120515823364sec
Prediction
def evaluate(input_sentence):
    attention_matrix = np.zeros((max_length_output, max_length_input))
    input_sentence = preprocess_sentence(input_sentence)
    inputs = [input_tokenizer.word_index[token] for token in input_sentence.split(' ')]
    inputs = keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_input, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    results = ''
    # Batch size is 1 at inference time.
    encoding_hidden = tf.zeros((1, units))
    encoding_outputs, encoding_hidden = encoder(inputs, encoding_hidden)
    decoding_hidden = encoding_hidden
    decoding_input = tf.expand_dims([output_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_output):
        predictions, decoding_hidden, attention_weights = decoder(
            decoding_input, decoding_hidden, encoding_outputs)
        # Keep this step's attention weights for plotting.
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_matrix[t] = attention_weights.numpy()
        # Greedy decoding: pick the most likely next token.
        predicted_id = tf.argmax(predictions[0]).numpy()
        results += output_tokenizer.index_word[predicted_id] + ' '
        if output_tokenizer.index_word[predicted_id] == '<end>':
            return results, input_sentence, attention_matrix
        # Feed the predicted token back in as the next decoder input.
        decoding_input = tf.expand_dims([predicted_id], 0)
    return results, input_sentence, attention_matrix
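In other words, evaluate decodes greedily: starting from <start>, each step picks

\hat{y}_t = \arg\max_{w \in V} P\bigl(w \mid \hat{y}_{<t}, x\bigr)

and feeds that token back as the next decoder input, stopping at <end> or after max_length_output steps. Beam search would usually give better translations, but greedy decoding keeps the example simple.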
def plot_attention(attention_matrix, input_sentence, predicted_sentence):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention_matrix, cmap='viridis')
    font_dict = {'fontsize': 14}
    ax.set_xticklabels([''] + input_sentence, fontdict=font_dict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=font_dict)
    plt.show()
def translate(input_sentence):
    results, input_sentence, attention_matrix = evaluate(input_sentence)
    print('Input: %s' % (input_sentence))
    print('predicted translation: %s' % (results))
    # Trim the attention matrix to the actual output and input lengths before plotting.
    attention_matrix = attention_matrix[:len(results.split(' ')),
                                        :len(input_sentence.split(' '))]
    plot_attention(attention_matrix, input_sentence.split(' '), results.split(' '))

translate(u'¿Todavía estás en casa?')
Input: <start>¿todavia estas en casa?<end>
predicted translation: home?<end> now?<end> now?<end> now?<end> now?<end> now?<end> now?<end>