import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import os
import sys
import time
import sklearn
from tensorflow import keras
import tensorflow as tf
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
2.0.0
sys.version_info(major=3, minor=7, micro=3, releaselevel='final', serial=0)
matplotlib 3.0.3
numpy 1.16.2
pandas 0.24.2
sklearn 0.20.3
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf
7-3 Movie Review Data
Embedding Overview
Before building an RNN, the data first needs to be embedded. Embedding converts the one-hot encoding of each word or character into a dense encoding. In practice the steps are:
1. Build the vocabulary.
2. Convert each word or character in the dataset to its index in the vocabulary; vocab_size is the largest index used.
3. Choose the dimension of the dense vectors, embedding_dim, by hand:
keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length)
There are two ways to do embedding, as shown in the figure below.
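To make the idea concrete, here is a small sketch (not part of the original notebook; it assumes the imports at the top have run, and the toy sizes are made up) showing that an Embedding layer is a trainable lookup table that replaces a one-hot vector with a dense vector:
# toy_vocab_size and toy_embedding_dim are illustrative values, not the ones used below
toy_vocab_size = 10
toy_embedding_dim = 4
toy_ids = np.array([[1, 3, 5]])                          # one sample with three word ids
emb = keras.layers.Embedding(toy_vocab_size, toy_embedding_dim)
print(tf.one_hot(toy_ids, depth=toy_vocab_size).shape)   # one-hot view: (1, 3, 10)
print(emb(toy_ids).shape)                                # dense view:   (1, 3, 4)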
imdb = keras.datasets.imdb
vocab_size = 10000
index_from = 3
(train_data, train_labels),(test_data, test_labels) = imdb.load_data(num_words=vocab_size,
index_from = index_from)
print(train_data.shape, train_labels.shape)
print(train_data[0], train_labels[0])
print(len(train_data[0]), len(train_data[1] ))
print(test_data.shape, test_labels.shape)
word_index = imdb.get_word_index()
word_index = {k:(v+3) for k,v in word_index.items()}
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
word_index['<END>'] = 3
reverse_word_index = dict([(v, k) for k,v in word_index.items()])
def decode_review(text_ids):
    return " ".join([reverse_word_index.get(word_id, '<UNK>') for word_id in text_ids])
decode_review(train_data[0])
# Pad the sequences to the same length (padding)
max_length = 500
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index['<PAD>'],
                                                        padding='post',  # 'post': pad at the end of the sequence; 'pre': pad at the beginning
                                                        maxlen=max_length)
test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_index['<PAD>'],
                                                       padding='post',
                                                       maxlen=max_length)
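A quick sanity check (an addition, assuming the cells above have run): after padding, every review has length max_length, so the data is now a regular 2-D array.
print(train_data.shape, test_data.shape)       # both should be (25000, 500)
print(len(train_data[0]), len(train_data[1]))  # both 500 after padding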
# https://yq.aliyun.com/articles/221681 Embedding
embedding_dim = 16 # each word is represented by a vector of length 16
batch_size = 128
model = keras.models.Sequential([
    # Embedding layer: 1. defines a matrix of shape [vocab_size, embedding_dim];
    # 2. turns one sample [vocab1, vocab2, ...] into a max_length * embedding_dim matrix;
    # 3. with batched input the output shape is batch_size * max_length * embedding_dim
    keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    # GlobalAveragePooling1D: reduces batch_size * max_length * embedding_dim to batch_size * embedding_dim.
    # Since the sequences are already padded to a fixed length, keras.layers.Flatten() could be used here instead.
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(train_data, train_labels, epochs=20, batch_size = batch_size, validation_split=0.2)
# Reading the Output Shape (None, 500, 16) in the summary:
# None: number of samples (the batch dimension, left unspecified here)
# 500:  number of words per sequence (set by input_length)
# 16:   dense encoding dimension of each word (set by embedding_dim)
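plot_learning_curving is presumably defined in an earlier section of the notebook and is not shown here; a minimal sketch of such a helper, plotting the per-epoch metrics stored in history.history, could look like this:
def plot_learning_curving(history):
    # plot loss/accuracy per epoch from the Keras History object
    pd.DataFrame(history.history).plot(figsize=(8, 5))
    plt.grid(True)
    plt.show()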
plot_learning_curving(history)  # the learning curves show overfitting
The model above uses pooling plus padding; its drawbacks, listed below, are why we turn to recurrent neural networks:
- Information loss:
  - GlobalAveragePooling1D averages all the word embeddings, and a padded sequence may contain many pad tokens, which act as noise.
  - Even without pad tokens, averaging treats every word equally, so the important words are not emphasized. For a movie-review sentiment dataset like this one, subjects and predicates matter less than the words that carry sentiment (pad noise, no word weighting).
- Too much wasted computation, hence inefficiency:
  - Most of each padded sequence is padding.
The model above also overfits; next we try a single-layer RNN and a bidirectional, stacked RNN.
7-5 RNN
# Forward pass of a simple RNN implemented in numpy
timesteps = 100
input_features = 32
output_features = 64
# The input has 100 time steps, each with 32 features
inputs = np.random.random((timesteps, input_features))    # (100, 32)
state_t = np.zeros((output_features,))                    # (64,)
W = np.random.random((output_features, input_features))   # weights for the input (64, 32)
U = np.random.random((output_features, output_features))  # weights for the state (64, 64)
b = np.random.random((output_features,))                  # bias (64,)
successive_outputs = []
for input_t in inputs:
    # iterate over the time steps; output_t is a 64-dimensional vector
    output_t = np.tanh(np.dot(W, input_t) + np.dot(U, state_t) + b)  # (64,)
    # save the output of the current time step
    successive_outputs.append(output_t)
    # the output of the current time step becomes the state for the next time step
    state_t = output_t
# stack the per-step outputs so the result has shape (timesteps, output_features) = (100, 64)
final_output_sequence = np.stack(successive_outputs, axis=0)
final_output_sequence.shape
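For comparison (a sketch added here, not in the original), keras.layers.SimpleRNN implements the same recurrence output_t = tanh(W·input_t + U·state_t + b); feeding the same input as a batch of one sample gives the per-step outputs directly:
keras_inputs = inputs[np.newaxis, ...].astype('float32')             # (1, 100, 32): one sample
simple_rnn = keras.layers.SimpleRNN(output_features, return_sequences=True)
print(simple_rnn(keras_inputs).shape)  # (1, 100, 64); weights are randomly initialized, so the values differ from the loop above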
RNN hidden units
Single-layer RNN
# https://yq.aliyun.com/articles/221681 Embedding
embedding_dim = 16 # each word is represented by a vector of length 16
batch_size = 128
single_rnn_model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    # return_sequences=False: only the output of the last time step is returned
    keras.layers.SimpleRNN(units=64, return_sequences = False),  # 5184 = 16*64 + 64*64 + 64; the 64 here is units=64, not the 64 of the Dense layer below
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
single_rnn_model.summary()
single_rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
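The training call is not shown in this section; it would presumably mirror the model in 7-3 (the epoch count here is illustrative), and the same pattern applies to the two bidirectional models below:
history = single_rnn_model.fit(train_data, train_labels, epochs=20,
                               batch_size=batch_size, validation_split=0.2)
plot_learning_curving(history)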
Bidirectional stacked RNN
# A bidirectional stacked RNN improves validation accuracy but still overfits; next we try a single-layer bidirectional RNN with fewer parameters
embedding_dim = 16
batch_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    # keras.layers.LSTM could be used inside Bidirectional instead of SimpleRNN
    keras.layers.Bidirectional(keras.layers.SimpleRNN(units=64, return_sequences = True)),   # 10368 = (16*64 + 64*64 + 64) * 2
    keras.layers.Bidirectional(keras.layers.SimpleRNN(units=64, return_sequences = False)),  # 24704 = (128*64 + 64*64 + 64) * 2; 128 = 64 units * 2 directions from the previous layer
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
Single-layer bidirectional RNN
# A single-layer bidirectional RNN with fewer parameters still overfits, and does worse than the plain feed-forward model in 7-3. RNNs are very expressive and overfit easily;
# dropout or regularization can be used to counter this (see the sketch after the compile call below)
embedding_dim = 16
batch_size = 512
model = keras.models.Sequential([
keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
keras.layers.Bidirectional(keras.layers.SimpleRNN(units=32, return_sequences = False)),
keras.layers.Dense(32, activation='relu'),
keras.layers.Dense(1, activation='sigmoid')
])
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
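As noted above, dropout or regularization can counter the overfitting. A minimal sketch of that idea (the name regularized_model and all of the rates are illustrative, not from the original notebook):
regularized_model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # dropout is applied to the layer inputs, recurrent_dropout to the recurrent state
    keras.layers.Bidirectional(keras.layers.SimpleRNN(units=32, dropout=0.5,
                                                      recurrent_dropout=0.5)),
    keras.layers.Dense(32, activation='relu',
                       kernel_regularizer=keras.regularizers.l2(1e-4)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid')
])
regularized_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])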
7-6 Text Generation with a Character-Level Model
# Shakespeare dataset: https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
input_filepath = "./shakespeare.txt"
text = open(input_filepath).read()
print(len(text))
print(text[:30])
Model-building steps
- generate vocab
- build mapping: char -> id, a dict of the form {char: id}
- data -> id_data: convert the text data to ids using the mapping from step 2
- define inputs and outputs: for an input abcd the output is bcde, obtained by slicing id_data
- train the model on the inputs (training data) and outputs (labels), and save the weights
- load the saved weights and generate text
# 1. generate vocab
vocab = sorted(set(text))
print(len(vocab))
print(vocab)
# 2 . build mapping : char -> id
char2idx = {char:idx for idx,char in enumerate(vocab)}
print(char2idx)
idx2char = np.array(vocab)
# 3. data -> id_data
text_as_int = np.array([char2idx[x] for x in text])
print(text_as_int[0:10])
print(text[0:10])
def split_input_target(id_text):
    # abcde -> abcd, bcde
    return id_text[0:-1], id_text[1:]
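A quick check of the split on a toy id sequence (illustrative, not from the original):
print(split_input_target(np.arange(5)))   # (array([0, 1, 2, 3]), array([1, 2, 3, 4]))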
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 100
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)  # drop_remainder: drop the final chunk if it has fewer than seq_length + 1 characters
for seq in seq_dataset.take(2):
    print(seq)
    print(repr("".join(idx2char[seq.numpy()])))
seq_dataset = seq_dataset.map(split_input_target)
for item_input, item_output in seq_dataset.take(1):
    print(item_input.numpy(), item_input.shape)
    print(item_output.numpy())
batch_size = 64
seq_dataset = seq_dataset.shuffle(10000).batch(batch_size, drop_remainder=True)
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        # return_sequences=True: output at every time step; adding stateful=True,
        # recurrent_initializer='glorot_uniform' works even better
        keras.layers.SimpleRNN(units=rnn_units, return_sequences=True),
        keras.layers.Dense(vocab_size)
    ])
    return model
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size )
model.summary()
# Reading the Output Shape (64, None, 256) in the summary:
# 64:   number of samples (set by batch_input_shape)
# None: number of characters per sequence (variable length)
# 256:  dense encoding dimension of each character (set by embedding_dim)
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, target_example_batch.shape)
(64, 100, 65) (64, 100)
# Two sampling strategies: random sampling draws num_samples values from the output distribution;
# greedy sampling always takes the most probable value. We use random sampling.
samples_indices = tf.random.categorical(logits=example_batch_predictions[0], num_samples=1)  # (100, 65) -> (100, 1)
samples_indices = tf.squeeze(samples_indices, axis=-1)
samples_indices
# Inspect the result; the prediction is gibberish because the model has not been trained yet
print("input:   ", repr("".join(idx2char[input_example_batch[0]])))
print("output:  ", repr("".join(idx2char[target_example_batch[0]])))
print("predict: ", repr("".join(idx2char[samples_indices])))
# Custom loss function
def loss(labels, logits):
    # from_logits defaults to False, which expects probabilities (values already passed through sigmoid/softmax).
    # The last Dense layer of the model has no activation and outputs raw logits,
    # so we set from_logits=True and let the loss apply the softmax internally.
    return keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
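This is also why the last layer has no activation: with from_logits=True the loss applies the softmax itself, which is numerically more stable. A quick check with made-up numbers (an illustrative addition) shows the two formulations agree:
demo_logits = tf.constant([[2.0, 1.0, 0.1]])
demo_labels = tf.constant([0])
print(keras.losses.sparse_categorical_crossentropy(demo_labels, demo_logits, from_logits=True).numpy())
print(keras.losses.sparse_categorical_crossentropy(demo_labels, tf.nn.softmax(demo_logits), from_logits=False).numpy())  # approximately the same value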
model.compile(optimizer='adam', loss=loss)
example_loss = loss(target_example_batch, example_batch_predictions)
# target_example_batch: (64, 100); example_batch_predictions: (64, 100, 65)
# sparse_categorical_crossentropy expects integer labels; it converts them to one-hot internally
# and computes the cross entropy against the logits
print(example_loss.shape)
print(example_loss.numpy().mean())
output_dir = "./text_generation_checkpoints"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
checkpoint_prefix = os.path.join(output_dir, 'model_{epoch:02d}')
checkpoint_callbacks = [
    keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)
]
epochs = 20
history = model.fit(seq_dataset, epochs = epochs, callbacks=checkpoint_callbacks)
# Find the latest saved checkpoint
tf.train.latest_checkpoint(output_dir)
'./text_generation_checkpoints/model_20'
# Load the model from the checkpoint
model2 = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)  # batch_size=1: generate one sequence at a time
model2.load_weights(tf.train.latest_checkpoint(output_dir))
model2.build(tf.TensorShape([1, None]))  # 1: one sample (batch_size=1); None: the input sequence can have variable length. On build() see: https://blog.youkuaiyun.com/qq_34964399/article/details/104084070
# Text generation flow
# start with a character sequence A, e.g. a single character
# A -> model -> b : feed A into the model; the output character b is sampled with tf.random.categorical
# A.append(b) -> B : append b to A to get the sequence B = Ab
# B -> model -> c
# B.append(c) -> C  # C = Abc
# ... so the input sequence grows and has variable length, which is why build() above accepts variable-length input
model2.summary()
def generate_text(model, start_string, num_generate=1000):
    input_val = [char2idx[x] for x in start_string]
    input_val = tf.expand_dims(input_val, 0)
    text = []
    model.reset_states()
    # temperature > 1: sampling becomes more random; temperature < 1: closer to greedy (argmax),
    # because predictions / temperature makes the distribution sharper, so random sampling
    # tends to pick the higher-probability characters
    temperature = 0.5
    for _ in range(num_generate):
        # dim: (batch_size, input_len, vocab_size) with batch_size = 1; equivalent to model.predict(input_val)
        predictions = model(input_val)
        predictions = predictions / temperature
        predictions = tf.squeeze(predictions, axis=0)  # drop the batch dimension; dim is now (input_len, vocab_size)
        # sample the next character from the distribution of the last time step
        predict_id = tf.random.categorical(predictions, num_samples=1)[-1][0].numpy()
        text.append(predict_id)
        input_val = tf.expand_dims([predict_id], 0)
        # input_val = tf.concat([input_val, tf.expand_dims([predict_id], 0)], axis=1)  # appending the predicted character to the full context for the next step gives better results
    return start_string + "".join(idx2char[text])
print(generate_text(model2, 'All we', num_generate=500 ))