【Transforms-驯化】An Example of Text Generation with seq2seq and Transformers


🎯 1. Problem Introduction

  Text generation these days is mostly done with large models such as GPT. This post shows how to implement it with a seq2seq model and with a Transformer instead.

💡 2. seq2seq

  • The overall code layout is simple: a data/ directory holding the source novels, data_process.py for preprocessing, and seq2seq.py for the model and sampling.

  The data-processing code, data_process.py:

import re
import jieba
import numpy as np

DATA_PATH = '../data/'


def get_single_corpus(file_path):
    """
    Read the novel at file_path, strip noise characters, and return the
    jieba-segmented token list.
    """
    # filter out latin letters, digits and punctuation
    r1 = u'[a-zA-Z0-9’!"#$%&\'()*+,-./::;<=>?@★、…【】《》‘’[\\]^_`{|}~「」『』()]+'
    # the source novels are saved in a GBK-family encoding ('ANSI' on Chinese Windows)
    with open(file_path, 'r', encoding='gb18030') as f:
        corpus = f.read()
    corpus = re.sub(r1, '', corpus)
    corpus = corpus.replace('\n', '')
    corpus = corpus.replace('\u3000', '')
    # drop the download-site watermark sentence
    corpus = corpus.replace('本书来自免费小说下载站更多更新免费电子书请关注', '')
    words = list(jieba.cut(corpus))
    print("Corpus length: {}".format(len(words)))
    return words


def get_dataset(data):
    """
    Build sliding-window training samples from the segmented corpus.
    :param data: token list produced by get_single_corpus
    :return: index-encoded windows, one-hot next-token targets,
             the vocabulary list and the token-to-index mapping
    """
    max_len = 60   # length of each context window
    step = 3       # stride between consecutive windows
    sentences = []
    next_tokens = []

    tokens = list(set(data))
    tokens_indices = {token: idx for idx, token in enumerate(tokens)}
    print('Unique tokens:', len(tokens))

    # pair every 60-token window with the token that follows it
    for i in range(0, len(data) - max_len, step):
        sentences.append([tokens_indices[t] for t in data[i: i + max_len]])
        next_tokens.append(tokens_indices[data[i + max_len]])
    print('Number of sequences:', len(sentences))

    print('Vectorization...')
    next_tokens_one_hot = []
    for i in next_tokens:
        y = np.zeros((len(tokens),))
        y[i] = 1
        next_tokens_one_hot.append(y)
    return sentences, next_tokens_one_hot, tokens, tokens_indices


if __name__ == '__main__':
    file = DATA_PATH + '笑傲江湖.txt'
    d = get_single_corpus(file)
    _x, _y, _tokens, _tokens_indices = get_dataset(d)
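
  To check that the windows and their targets line up as intended, the first sample can be decoded back to text. A minimal sanity-check sketch, reusing the _x, _y and _tokens variables produced by the __main__ block above:

# sketch: decode the first 60-token window and its next-token target
first_window = ''.join(_tokens[idx] for idx in _x[0])
first_target = _tokens[int(np.argmax(_y[0]))]
print('window:', first_window)
print('target:', first_target)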


  • The model code, seq2seq.py:
import jieba
import numpy as np
import torch
import torch.nn as nn
from data_process import *
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow import keras

callbacks_list = [
    keras.callbacks.ModelCheckpoint(  # save the model weights after every epoch
        filepath='text_gen.h5',
        monitor='loss',
        save_best_only=True,
    ),
    keras.callbacks.ReduceLROnPlateau(  # reduce the learning rate when the loss stops improving
        monitor='loss',
        factor=0.5,
        patience=1,
    ),
    keras.callbacks.EarlyStopping(  # stop training when the loss stops improving
        monitor='loss',
        patience=3,
    ),
]


class SeqToSeq(nn.Module):
    """PyTorch version of the same embedding -> LSTM -> projection idea.
    Kept for reference; the training loop below uses the Keras model instead."""

    def __init__(self, len_token, embedding_size):
        super(SeqToSeq, self).__init__()
        self.encode = nn.Embedding(len_token, embedding_size)
        self.lstm = nn.LSTM(embedding_size, embedding_size, 2, batch_first=True)
        self.decode = nn.Sequential(
            nn.Linear(embedding_size, len_token),
            nn.Sigmoid()
        )

    def forward(self, x):
        # assumes x is a 1-D batch of token indices
        em = self.encode(x).unsqueeze(dim=1)   # (batch, 1, embedding_size)
        mid, _ = self.lstm(em)
        res = self.decode(mid[:, 0, :])        # (batch, len_token)
        return res


def sample(preds, temperature=1.0):
    """
    对模型得到的原始概率分布重新加权,并从中抽取一个 token 索引
    :param preds:预测的结果
    :param temperature:温度
    :return:重新加权后的最大值下标
    """
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def train(x, y, tokens, tokens_indices, epochs=200):
    x = np.asarray(x)
    y = np.asarray(y)
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    dataset = dataset.shuffle(buffer_size=4096)
    dataset = dataset.batch(128)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    model = models.Sequential([
        layers.Embedding(len(tokens), 256),
        layers.LSTM(256),
        layers.Dense(len(tokens), activation='softmax')
    ])

    optimizer = optimizers.RMSprop(learning_rate=0.1)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    for e in range(epochs):

        model.fit(dataset, epochs=1, callbacks=callbacks_list)

        # text = '令狐冲这时已退到殿口,与教主的座位相距已遥,灯光又暗,远远望见去,任我行的容貌已颇为朦胧,心下忽想:“坐在这位子上的,是任我行还是东方不败,却有什么分别?”'
        # seed passage used to prime the generator
        text = '青衣剑士连劈三剑,锦衫剑士一一格开。青衣剑士一声吒喝,长剑从左上角直划而下,势劲力急。锦衫剑士身手矫捷,向后跃开,避过了这剑。他左足刚着地,身子跟着弹起,刷刷两剑,向对手攻去。青衣剑士凝里不动,嘴角边微微冷笑,长剑轻摆,挡开来剑。'
        print(text, end='')
        if e % 20 == 0:
            for temperature in [0.2, 0.5, 1.0, 1.2]:
                text_cut = list(jieba.cut(text))[:60]
                print('\n temperature: ', temperature)
                print(''.join(text_cut), end='')
                for i in range(100):

                    sampled = np.zeros((1, 60))
                    for idx, token in enumerate(text_cut):
                        if token in tokens_indices:
                            sampled[0, idx] = tokens_indices[token]
                    preds = model.predict(sampled, verbose=0)[0]
                    next_index = sample(preds, temperature=temperature)
                    next_token = tokens[next_index]
                    print(next_token, end='')

                    text_cut = text_cut[1: 60] + [next_token]


if __name__ == '__main__':
    file = DATA_PATH + '越女剑.txt'
    d = get_single_corpus(file)
    _x, _y, _tokens, _tokens_indices = get_dataset(d)
    train(_x, _y, _tokens, _tokens_indices)
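
  A quick way to see what the temperature parameter in sample() does is to apply the same reweighting to a toy distribution. A small standalone sketch (the numbers are illustrative, not output from the model above):

import numpy as np

toy_preds = np.array([0.5, 0.3, 0.15, 0.05])
for t in [0.2, 0.5, 1.0, 1.2]:
    reweighted = np.exp(np.log(toy_preds) / t)
    reweighted = reweighted / reweighted.sum()
    print(t, np.round(reweighted, 3))

  Low temperatures sharpen the distribution toward the most likely token, so the generated text is more conservative and repetitive; higher temperatures flatten it and make the sampling more varied.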

💡 3. Transformer code

  Once the model below is trained, it is used directly for prediction; the full code is as follows:

  • In the code, start_string is the seed text that generation starts from, and num_generate is the number of characters to generate.
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# sample text data: read the whole corpus into a single string
text_data = ''.join([i.strip('\n').replace(' ', '') for i in open('总.txt', 'r').readlines()])

# character-level tokenization (the default word-level Tokenizer would treat the
# un-spaced Chinese text as a single token)
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text_data])
sequences = tokenizer.texts_to_sequences([text_data])[0]

# cut the corpus into non-overlapping windows of 10 characters; the input is a
# window and the target is the same window shifted one character to the left
maxlen = 10
windows = [sequences[i: i + maxlen + 1] for i in range(0, len(sequences) - maxlen, maxlen)]
windows = pad_sequences(windows, maxlen=maxlen + 1, padding='post', truncating='post')
input_sequences = windows[:, :-1]
target_sequences = windows[:, 1:]

# inspect the vocabulary
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)
print("Number of training windows:", len(input_sequences))

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

def create_transformer_model(vocab_size, embedding_dim, hidden_dim, num_heads, num_layers):
    # note: num_layers is accepted but this minimal model uses a single attention block
    inputs = Input(shape=(10,))
    embedding_layer = Embedding(vocab_size, embedding_dim)(inputs)
    transformer_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_dim)(embedding_layer, embedding_layer)
    lstm_layer = LSTM(hidden_dim, return_sequences=True)(transformer_layer)
    outputs = Dense(vocab_size, activation='softmax')(lstm_layer)
    model = Model(inputs=inputs, outputs=outputs)
    return model

model = create_transformer_model(vocab_size, embedding_dim=64, hidden_dim=64, num_heads=2, num_layers=1)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

# train the model to predict, at each position, the next character of the window
model.fit(input_sequences, target_sequences, epochs=10)

def generate_text(model, start_string, num_generate=10):
    # convert the seed string into a sequence of character indices
    input_seq = tokenizer.texts_to_sequences([start_string])[0]
    input_seq = pad_sequences([input_seq], maxlen=10, padding='pre')
    generated = []

    # the model is stateless, so every step is an independent prediction
    for _ in range(num_generate):
        # predict the distribution over the next character
        predictions = model.predict(input_seq, verbose=0)

        # take the most likely character at the last time step
        predicted_id = int(np.argmax(predictions[0, -1]))

        # append the predicted character (use .get() to avoid a KeyError for index 0)
        generated.append(tokenizer.index_word.get(predicted_id, ''))

        # slide the window: drop the oldest index and append the new prediction
        input_seq = np.concatenate([input_seq[:, 1:], [[predicted_id]]], axis=1)

    return ''.join(generated)

# generate text from a seed string
print(generate_text(model, start_string="两千年来人们都知道"))
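
  One caveat about the model above: MultiHeadAttention on its own has no notion of token order, so a Transformer block is normally fed token embeddings plus positional embeddings. A minimal, hypothetical extension following the standard Keras pattern (the layer name and the usage line below are illustrative, not part of the original code):

class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    # sums a learned token embedding and a learned position embedding
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        positions = tf.range(start=0, limit=tf.shape(x)[-1], delta=1)
        return self.token_emb(x) + self.pos_emb(positions)

# possible drop-in replacement for the plain Embedding call inside create_transformer_model:
#     embedding_layer = TokenAndPositionEmbedding(10, vocab_size, embedding_dim)(inputs)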