### Implementing an LSTM Seq2Seq Model for Prediction with TensorFlow
Building and applying an LSTM-based Sequence-to-Sequence (Seq2Seq) model for prediction with TensorFlow involves several stages: data preparation, model definition, compilation and configuration, the training loop, and finally running predictions with the trained model.
#### Data Preparation
Before starting, make sure the training dataset is ready and the necessary preprocessing has been done. This typically includes, but is not limited to, cleaning the text, tokenizing it, and building vocabulary index mappings[^1]:
```python
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Load and clean the raw parallel corpus...
def preprocess_data(english_sentences, german_sentences):
    # Clean each sentence...
    # Build the vocabulary index mappings...
    return input_tensor_train, target_tensor_train, inp_lang_tokenizer, targ_lang_tokenizer

input_tensor_train, target_tensor_train, inp_lang, targ_lang = preprocess_data(en_sents, de_sents)

BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index) + 1   # +1 for the padding index 0
vocab_tar_size = len(targ_lang.word_index) + 1
max_length_inp = input_tensor_train.shape[1]    # padded input length, needed again at inference time
max_length_targ = target_tensor_train.shape[1]  # padded target length

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
```
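The bodies of the cleaning and tokenization helpers are elided above. For reference, here is a minimal sketch of what they might look like, assuming `<start>`/`<end>` sentence markers, word-level tokenization with `tf.keras`'s `Tokenizer`, and a 20% validation hold-out; the helper names `preprocess_sentence` and `tokenize` match the calls used later, but their bodies here are an illustration, not the original implementation:
```python
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_sentence(s):
    # Lowercase, isolate punctuation, collapse whitespace, add sequence markers.
    s = s.lower().strip()
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return '<start> ' + s + ' <end>'

def tokenize(sentences):
    # Fit a word-level tokenizer and pad every sequence to the longest one.
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(sentences)
    tensor = tokenizer.texts_to_sequences(sentences)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, tokenizer

def preprocess_data(english_sentences, german_sentences):
    english = [preprocess_sentence(s) for s in english_sentences]
    german = [preprocess_sentence(s) for s in german_sentences]
    input_tensor, inp_lang_tokenizer = tokenize(english)
    target_tensor, targ_lang_tokenizer = tokenize(german)
    # Hold out 20% for validation; only the training split is returned here.
    input_train, _, target_train, _ = train_test_split(input_tensor, target_tensor, test_size=0.2)
    return input_train, target_train, inp_lang_tokenizer, targ_lang_tokenizer
```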
#### Defining the Encoder and Decoder
Next comes the core of the encoder-decoder architecture. Here a bidirectional LSTM is used in the encoder to capture context from both directions of the input sequence, while an attention mechanism is added at the decoding stage to improve performance:
```python
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.bi_lstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(self.enc_units,
                                 return_sequences=True,
                                 return_state=True))

    def call(self, x, hidden):
        x = self.embedding(x)
        output, forward_h, forward_c, backward_h, backward_c = self.bi_lstm(x, initial_state=hidden)
        # Concatenate the forward and backward states into single 2*enc_units vectors.
        state_h = tf.concat([forward_h, backward_h], axis=-1)
        state_c = tf.concat([forward_c, backward_c], axis=-1)
        return output, [state_h, state_c]

    def initialize_hidden_state(self):
        # The bidirectional LSTM expects four initial states: forward (h, c) and backward (h, c).
        return [tf.zeros((self.batch_sz, self.enc_units)) for _ in range(4)]

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
```
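As a quick sanity check (assuming the `dataset` pipeline above), the encoder's output shapes can be inspected on a single batch; because the two directions are concatenated, the feature dimension is `2 * units`:
```python
example_input_batch, example_target_batch = next(iter(dataset))

sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_state = encoder(example_input_batch, sample_hidden)

print(sample_output.shape)    # (BATCH_SIZE, max_length_inp, 2 * units)
print(sample_state[0].shape)  # (BATCH_SIZE, 2 * units), concatenated h state
print(sample_state[1].shape)  # (BATCH_SIZE, 2 * units), concatenated c state
```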
On the decoder side, take care to add an attention layer so the model can focus on the relevant parts of the source sentence[^2]:
```python
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query: decoder hidden state, (batch, hidden); values: encoder outputs, (batch, max_len, hidden).
        query_with_time_axis = tf.expand_dims(query, 1)
        # Additive (Bahdanau) score, normalized over the time axis.
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        # Weighted sum of the encoder outputs.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        ...

decoder = Decoder(vocab_tar_size, embedding_dim, units*2, BATCH_SIZE)  # the bidirectional encoder doubles the state dimension
attention_layer = BahdanauAttention(units)
```
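The `Decoder` body is elided above. Here is a minimal sketch of what it might contain, following the common pattern of concatenating the attention context with the embedded input token before a unidirectional LSTM; the internal layer names (`self.lstm`, `self.fc`, `self.attention`) are assumptions, chosen so the return signature matches the calls made in the training and inference code below:
```python
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.dec_units,
                                         return_sequences=True,
                                         return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # hidden is [state_h, state_c]; the h state serves as the attention query.
        context_vector, attention_weights = self.attention(hidden[0], enc_output)
        x = self.embedding(x)
        # Prepend the context vector to the embedded input token.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state_h, state_c = self.lstm(x, initial_state=hidden)
        output = tf.reshape(output, (-1, output.shape[2]))
        logits = self.fc(output)  # unnormalized scores over the target vocabulary
        return logits, [state_h, state_c], attention_weights
```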
#### Compilation Settings and the Training Loop
With the components above in place, the full network can be assembled and its optimization settings chosen. Here the Adam optimizer is used; its adaptive per-parameter learning rates generally speed up convergence[^3]:
```python
import os
import time

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        ...

EPOCHS = 10  # number of passes over the data; tune for your dataset

for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
    # Save a checkpoint every other epoch.
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    print('Epoch {} Loss {:.4f} Time {:.1f} sec'.format(
        epoch + 1, total_loss / steps_per_epoch, time.time() - start))
```
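The body of `train_step` is elided above. A minimal sketch of what it might contain is the standard teacher-forcing update, plus a `loss_function` helper that masks out padding tokens; both the helper and the exact update loop below are illustrative assumptions, not the original code:
```python
def loss_function(real, pred):
    # Mask padding positions (token id 0) so they do not contribute to the loss.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    loss_ *= tf.cast(mask, dtype=loss_.dtype)
    return tf.reduce_mean(loss_)

@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # Every target sequence starts with the <start> token.
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
        # Teacher forcing: feed the ground-truth token as the next decoder input.
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
```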
#### Running Inference
Once the model has been trained for enough epochs and checkpointed, restore the latest weights and run inference in a real application setting. Calling the `evaluate()` function with a new sample returns the corresponding output:
```python
# Restore the most recent checkpoint before running inference.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)  # apply the same cleaning as during training
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_inp,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    # Four zero states for the bidirectional encoder: forward (h, c) and backward (h, c).
    hidden = [tf.zeros((1, units)) for _ in range(4)]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        # Record this step's attention weights so they can be visualized later.
        attention_plot[t] = tf.reshape(attention_weights, (-1,)).numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '
        # Stop as soon as the model emits the end-of-sequence marker.
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        # Greedy decoding: the predicted token becomes the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot

result, sentence, _ = evaluate(u'hello .')
print(f'{sentence} -> {result}')
```