首先,感谢唐宇迪课程。
更多内容请加入学习。
接下来说说我对于seq2seq的理解。
seq2seq简单来说就是一个先编码、再解码的过程,TensorFlow官网也有关于seq2seq的相应解释和教程。
下面是主要的架构。首先导入一些必不可少的基础库。
import pandas as pd
import numpy as np
import tensorflow as tf
import re
把文件传入进来,进行预处理,比如去掉多余的项、英文的连写、一些特殊符号,还有停用词。(停用词网上有很多)
预处理中最关键的当然是word2vec的词向量转换,可以自己训练,也可以拿别人现成的,我还没有GPU环境,所以用的是别人的,目前最新的是17.06
把文本中所有的词依次转换为词向量之后,还需要加入开始符和停止符,同时记得计算下word2vec里,是否全包括了文本里的词。
# Load the raw reviews, drop rows with missing fields, and renumber the index.
reviews = pd.read_csv("")  # TODO(review): path to the reviews CSV is a placeholder
reviews = reviews.dropna().reset_index(drop=True)

# English contractions mapped to their expanded forms; consumed by
# clean_text() below to normalize review text before tokenization.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
}
def clean_text(text, remove_stopwords=True):
    """Normalize one review string for downstream word2vec lookup.

    Lowercases the text, expands contractions (via the module-level
    ``contractions`` dict), strips URLs / HTML remnants / punctuation,
    and optionally removes stopwords read from the stopword file.

    Args:
        text: Raw review text.
        remove_stopwords: When True, filter out words found in the
            stopword file (path below is still a placeholder).

    Returns:
        The cleaned text as a single space-separated string.
    """
    text = text.lower()

    # Expand contractions word by word (e.g. "can't" -> "cannot").
    # (The original wrapped this in a pointless `if True:`.)
    expanded = []
    for word in text.split():
        if word in contractions:
            expanded.append(contractions[word])
        else:
            expanded.append(word)
    text = " ".join(expanded)

    # Strip URLs, HTML leftovers, and punctuation.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    if remove_stopwords:
        # BUG FIX: the original opened the file unconditionally (even when
        # remove_stopwords was False), never closed it, read only the FIRST
        # line with readline(), and then built set() over that string —
        # producing a set of single characters rather than stopwords.
        with open("") as words:  # TODO(review): stopword file path is a placeholder
            stops = set(words.read().split())
        text = " ".join(w for w in text.split() if w not in stops)
    return text
# Clean every summary; stopwords are kept so the short summaries stay intact.
clean_summaries = [clean_text(summary, remove_stopwords=False)
                   for summary in reviews.Summary]
print("Summaries are complete.")

# Clean every full review text; stopwords are dropped here.
clean_texts = [clean_text(text) for text in reviews.Text]
print("Texts are complete.")
def count_words(count_dict, text):
    """Accumulate word frequencies from an iterable of sentences.

    Mutates ``count_dict`` in place: each whitespace-separated token in
    every sentence bumps its count by one.
    """
    for sentence in text:
        for token in sentence.split():
            count_dict[token] = count_dict.get(token, 0) + 1
# Build one vocabulary frequency table covering both corpora.
word_counts = {}
for corpus in (clean_summaries, clean_texts):
    count_words(word_counts, corpus)
print("Size of Vocabulary:", len(word_counts))
embeddings_index = {}
with open('') as f:
for line in f:
values = line.split(' ')
word = val