【Taming Transformers】A text-generation example built with seq2seq and a Transformer
🎯 1. Problem overview
Text generation is nowadays mostly done with large models such as GPT. This post shows how to implement it with a seq2seq model and with a Transformer instead.
💡 2. seq2seq
- The code is split into two files: data_process.py for corpus preparation and seq2seq.py for model training and sampling.
Data-processing code, data_process.py:
import random
import re

import jieba
import numpy as np

DATA_PATH = '../data/'


def get_single_corpus(file_path):
    """
    Read and clean the file at file_path.
    :return: list of tokens obtained by segmenting the cleaned text
    """
    # Filter out characters that are not useful for modelling
    r1 = u'[a-zA-Z0-9’!"#$%&\'()*+,-./::;<=>?@★、…【】《》‘’[\\]^_`{|}~「」『』()]+'
    # with open('../stopwords.txt', 'r', encoding='utf8') as f:
    #     stop_words = [word.strip('\n') for word in f.readlines()]
    # 'ANSI' resolves to the local Windows code page (GBK on Chinese Windows); adjust if needed
    with open(file_path, 'r', encoding='ANSI') as f:
        corpus = f.read()
        corpus = re.sub(r1, '', corpus)
        corpus = corpus.replace('\n', '')
        corpus = corpus.replace('\u3000', '')
        corpus = corpus.replace('本书来自免费小说下载站更多更新免费电子书请关注', '')
    words = list(jieba.cut(corpus))
    print("Corpus length: {}".format(len(words)))
    return words
    # return [word for word in words if word not in stop_words]


def get_dataset(data):
    """
    :param data: the segmented corpus (list of tokens)
    :return: index sequences, the next token of each sequence, the vocabulary and the token-to-index mapping
    """
    max_len = 60
    step = 3
    sentences = []
    next_tokens = []

    tokens = list(set(data))
    tokens_indices = {token: idx for idx, token in enumerate(tokens)}
    print('Unique tokens:', len(tokens))

    # Slide a window of max_len tokens over the corpus with stride `step`;
    # the token right after each window is the prediction target.
    for i in range(0, len(data) - max_len, step):
        sentences.append(
            list(map(lambda t: tokens_indices[t], data[i: i + max_len])))
        next_tokens.append(tokens_indices[data[i + max_len]])
    print('Number of sequences:', len(sentences))

    print('Vectorization...')
    # One-hot encode the target tokens
    next_tokens_one_hot = []
    for i in next_tokens:
        y = np.zeros((len(tokens),))
        y[i] = 1
        next_tokens_one_hot.append(y)

    return sentences, next_tokens_one_hot, tokens, tokens_indices


if __name__ == '__main__':
    file = DATA_PATH + '笑傲江湖.txt'
    d = get_single_corpus(file)
    _x, _y, _tokens, _tokens_indices = get_dataset(d)
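To make the windowing in get_dataset concrete, here is a minimal sketch; the toy token list and the small max_len/step values below are made up for illustration only (the real script uses max_len=60 and step=3):

# Toy illustration of the sliding-window logic used by get_dataset
data = ['青衣', '剑士', '连', '劈', '三', '剑', ',', '锦衫', '剑士']
max_len, step = 4, 2
for i in range(0, len(data) - max_len, step):
    window = data[i: i + max_len]   # model input
    target = data[i + max_len]      # token to predict
    print(window, '->', target)
# ['青衣', '剑士', '连', '劈'] -> 三
# ['连', '劈', '三', '剑'] -> ,
# ['三', '剑', ',', '锦衫'] -> 剑士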
- Model code: seq2seq.py
import jieba
import numpy as np
import torch
import torch.nn as nn
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import optimizers, layers, models

from data_process import *

callbacks_list = [
    keras.callbacks.ModelCheckpoint(    # save weights after each epoch
        filepath='text_gen.h5',
        monitor='loss',
        save_best_only=True,
    ),
    keras.callbacks.ReduceLROnPlateau(  # reduce the learning rate when the loss stops improving
        monitor='loss',
        factor=0.5,
        patience=1,
    ),
    keras.callbacks.EarlyStopping(      # stop training when the loss stops improving
        monitor='loss',
        patience=3,
    ),
]


class SeqToSeq(nn.Module):
    """PyTorch sketch of the same embedding -> LSTM -> decoder idea; it is not used by train() below."""

    def __init__(self, len_token, embedding_size):
        super(SeqToSeq, self).__init__()
        self.encode = nn.Embedding(len_token, embedding_size)
        self.lstm = nn.LSTM(embedding_size, embedding_size, 2, batch_first=True)
        self.decode = nn.Sequential(
            nn.Linear(embedding_size, len_token),
            nn.Sigmoid()
        )

    def forward(self, x):
        em = self.encode(x).unsqueeze(dim=1)
        mid, _ = self.lstm(em)
        res = self.decode(mid[:, 0, :])
        return res


def sample(preds, temperature=1.0):
    """
    Reweight the raw probability distribution produced by the model and draw one token index from it.
    :param preds: predicted probabilities over the vocabulary
    :param temperature: sampling temperature (lower = more conservative, higher = more random)
    :return: index of the sampled token
    """
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def train(x, y, tokens, tokens_indices, epochs=200):
    x = np.asarray(x)
    y = np.asarray(y)
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    dataset = dataset.shuffle(buffer_size=4096)
    dataset = dataset.batch(128)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    # Embedding -> LSTM -> softmax over the vocabulary
    model = models.Sequential([
        layers.Embedding(len(tokens), 256),
        layers.LSTM(256),
        layers.Dense(len(tokens), activation='softmax')
    ])
    optimizer = optimizers.RMSprop(learning_rate=0.1)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    for e in range(epochs):
        model.fit(dataset, epochs=1, callbacks=callbacks_list)
        # text = '令狐冲这时已退到殿口,与教主的座位相距已遥,灯光又暗,远远望见去,任我行的容貌已颇为朦胧,心下忽想:“坐在这位子上的,是任我行还是东方不败,却有什么分别?”'
        text = '青衣剑士连劈三剑,锦衫剑士一一格开。青衣剑士一声吒喝,长剑从左上角直划而下,势劲力急。锦衫剑士身手矫捷,向后跃开,避过了这剑。他左足刚着地,身子跟着弹起,刷刷两剑,向对手攻去。青衣剑士凝里不动,嘴角边微微冷笑,长剑轻摆,挡开来剑。'
        print(text, end='')
        if e % 20 == 0:
            # Every 20 epochs, sample 100 tokens at several temperatures from the same seed text
            for temperature in [0.2, 0.5, 1.0, 1.2]:
                text_cut = list(jieba.cut(text))[:60]
                print('\n temperature: ', temperature)
                print(''.join(text_cut), end='')
                for i in range(100):
                    sampled = np.zeros((1, 60))
                    for idx, token in enumerate(text_cut):
                        if token in tokens_indices:
                            sampled[0, idx] = tokens_indices[token]
                    preds = model.predict(sampled, verbose=0)[0]
                    next_index = sample(preds, temperature=temperature)
                    next_token = tokens[next_index]
                    print(next_token, end='')
                    text_cut = text_cut[1: 60] + [next_token]


if __name__ == '__main__':
    file = DATA_PATH + '越女剑.txt'
    d = get_single_corpus(file)
    _x, _y, _tokens, _tokens_indices = get_dataset(d)
    train(_x, _y, _tokens, _tokens_indices)
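The effect of the temperature in sample() can be seen on a toy distribution. This is only an illustrative sketch with made-up probabilities, not part of the training script:

import numpy as np

# Made-up distribution over a 4-token vocabulary
preds = np.array([0.5, 0.3, 0.15, 0.05])

for t in [0.2, 1.0, 1.2]:
    reweighted = np.exp(np.log(preds) / t)
    reweighted /= reweighted.sum()
    # Low temperature sharpens the distribution (almost greedy);
    # high temperature flattens it (more diverse, more mistakes).
    print(t, np.round(reweighted, 3))
# 0.2 ≈ [0.926 0.072 0.002 0.   ]
# 1.0 ≈ [0.5   0.3   0.15  0.05 ]
# 1.2 ≈ [0.462 0.301 0.169 0.068]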
💡 3. Transformer code
The second approach builds a small Transformer-style model with Keras, trains it, and then uses it for prediction; the full code is shown below.
- In the code, start_string is the seed text to start from and num_generate is the number of tokens to generate.
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Sample text data
text_data = ''.join([i.strip('\n').replace(' ', '') for i in open('总.txt', 'r', encoding='utf-8').readlines()])

# Character-level tokenisation: the Chinese text contains no whitespace,
# so the default word-level Tokenizer would otherwise treat it as one token
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text_data])
seq = tokenizer.texts_to_sequences([text_data])[0]

# Vocabulary size (+1 for the padding index 0)
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)

# Build fixed-length training windows: each sample is `window` input tokens,
# and the target is the same window shifted one position to the right,
# so every position learns to predict the next token.
window = 10
samples = np.array([seq[i: i + window + 1] for i in range(len(seq) - window)])
X, y = samples[:, :-1], samples[:, 1:]
print("Number of training windows:", len(X))


def create_transformer_model(vocab_size, embedding_dim, hidden_dim, num_heads, num_layers):
    # num_layers is kept in the signature, but only a single attention block is stacked here
    inputs = Input(shape=(window,))
    embedding_layer = Embedding(vocab_size, embedding_dim)(inputs)
    transformer_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=hidden_dim)(embedding_layer, embedding_layer)
    lstm_layer = LSTM(hidden_dim, return_sequences=True)(transformer_layer)
    outputs = Dense(vocab_size, activation='softmax')(lstm_layer)
    model = Model(inputs=inputs, outputs=outputs)
    return model


model = create_transformer_model(vocab_size, embedding_dim=64, hidden_dim=64, num_heads=2, num_layers=1)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

# Train the model
model.fit(X, y, epochs=10)


def generate_text(model, start_string, num_generate=10):
    # Convert the seed string into an index sequence of length `window`
    input_seq = tokenizer.texts_to_sequences([start_string])[0]
    input_seq = pad_sequences([input_seq], maxlen=window, padding='pre')
    generated = []
    for _ in range(num_generate):
        predictions = model.predict(input_seq, verbose=0)
        # Greedily take the most likely token at the last time step
        predicted_id = int(np.argmax(predictions[0, -1]))
        generated.append(tokenizer.index_word.get(predicted_id, ''))  # .get() avoids a KeyError for the padding id
        # Slide the window: drop the oldest token and append the prediction
        input_seq = np.concatenate([input_seq[:, 1:], [[predicted_id]]], axis=1)
    return ''.join(generated)


# Generate text
print(generate_text(model, start_string="两千年来人们都知道"))
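As a follow-up, the trained model can be saved and reloaded for later generation. This is a minimal sketch; the file name transformer_textgen.h5 is illustrative and not from the original post:

# Hypothetical follow-up: persist the trained model and reuse it for generation later
model.save('transformer_textgen.h5')                          # illustrative file name
reloaded = tf.keras.models.load_model('transformer_textgen.h5')
print(generate_text(reloaded, start_string="两千年来人们都知道", num_generate=20))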