唐宇迪Seq2Seq代码+注释（tensorflow1.2版本）_唐宇迪 tensorflow自然语言处理代码-CSDN博客

本文链接：https://blog.csdn.net/zhylhy520/article/details/83088277

首先感谢唐宇迪课程：https://edu.csdn.net/course/detail/3921/68734?auto_start=1

序列生成首先要做好数据预处理：第一步是将文本数据转化为数值数据，可以使用word2vec训练词向量模型。与以往文本分类需要所有文本长度保持一致不同，seq2seq只需要同一个batch_size内的sequence_length保持一致，不同batch之间可以不一致。接着写encoder层和decoder层，encoder层的输出是中间向量。有的模型让这一中间向量参与decoder层每一步的训练，也有模型只是将该向量作为第一步的输入。decoder层则包括训练和预测两个功能，预测的时候需要加[start]和[eos]；此外还引进了注意力机制，模型的输入也采取倒序输入，这都有利于提高模型的效果。encoder_input为sequence+<EOS>，作为目标句的输入为<GO>+target_sequence。

import pandas as pd
import re
import numpy as np
import tensorflow as tf
import time

# Location of the reviews CSV. A raw string keeps the Windows backslashes
# literal; in a normal string '\D' and '\R' are invalid escape sequences.
filename = r'E:\DataSets\Reviews.csv\Reviews.csv'
reviews = pd.read_csv(filename)
# print(reviews.isnull().sum())
# Drop every row that contains a missing value.
reviews = reviews.dropna()
# Drop the columns that are not needed. `axis` is passed by keyword because
# newer pandas releases no longer accept it positionally.
reviews = reviews.drop(
    ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
     'HelpfulnessDenominator', 'Score', 'Time'],
    axis=1)
# Renumber the index so it is contiguous again after the rows were dropped.
reviews = reviews.reset_index(drop=True)
# print(reviews.head())  # preview the first rows of the remaining columns
# Contraction expansion table: maps a lowercase English contraction to the
# phrase that replaces it.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    # Fixed: was "could not have", which is the expansion of "couldn't've".
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what've": "what have",
    "what'd": "what did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}

#对文本内容进行清洗，全部转化为小写，最后形成'i want to rock you'形式
def clean_text(text, remove_stopwords=True):
    text = text.lower()
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        #形成新的句子，类型为str
        text = " ".join(new_text)
    #去除一些特殊符号
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text =