首先感谢唐宇迪课程https://edu.youkuaiyun.com/course/detail/3921/68734?auto_start=1
序列生成首先要做好数据预处理,第一步是要将文本数据转化为数值数据,可以使用word2vec训练词向量模型。与以往文本分类需要所有文本长度保持一致不同,seq2seq只需要一个batch_size内的sequence_length保持一致,不同batch之间可以不一致。接着写encoder层和decoder层,encoder层的输出是中间向量。有的模型让这一中间向量参与decoder层每一步的训练,也有模型只是将该向量作为第一步输入。decoder层则包括训练和预测两个功能,预测的时候需要加[start]和[eos],此外还引进了注意力机制,模型的输入也采取倒序输入,这都有利于提高模型的生成效果。encoder_input:sequence+<EOS>,作为目标句的输入为<GO>+target_sequence
import pandas as pd
import re
import numpy as np
import tensorflow as tf
import time
# Path of the reviews CSV file. A raw string keeps the Windows backslashes
# literal instead of relying on unrecognised escapes such as "\D" and "\R".
filename = r'E:\DataSets\Reviews.csv\Reviews.csv'
reviews = pd.read_csv(filename)
# print(reviews.isnull().sum())
# Drop every row that contains a missing value.
reviews = reviews.dropna()
# Drop the columns that are not needed. `axis` is passed by keyword because
# the positional form (`drop(labels, 1)`) is no longer accepted by pandas 2.x.
reviews = reviews.drop(
    ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
     'HelpfulnessDenominator', 'Score', 'Time'],
    axis=1,
)
# Rebuild a contiguous index after the row removal above.
reviews = reviews.reset_index(drop=True)
# print(reviews.head())  # shows the first five rows (text and summary) after the cleanup
# Contraction expansion table: maps a lowercase English contraction to its
# expanded form. Keys are matched against whole whitespace-separated words.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    # Fixed: previously expanded to "could not have", which is the expansion
    # of "couldn't've", not of "couldn't".
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what've": "what have",
    "what'd": "what did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
}
#对文本内容进行清洗,全部转化为小写,最后形成'i want to rock you'形式
def clean_text(text, remove_stopwords=True):
text = text.lower()
if True:
text = text.split()
new_text = []
for word in text:
if word in contractions:
new_text.append(contractions[word])
else:
new_text.append(word)
#形成新的句子,类型为str
text = " ".join(new_text)
#去除一些特殊符号
text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
text =