1:整体思路是先统计古诗词中每个字出现的频率,再按频率从高到低建立字到数字的映射。最终生成 poems_vector(每首诗对应的数字序列)、word_to_int(字到数字的映射关系)和 words(按频率排序的字表)。
预处理古诗词代码:
import collections
import numpy as np
def process_poems(file_path):
    """Load a poem corpus and encode every poem as a list of integer ids.

    Each line of the file is expected to have the form ``title:content``.
    A line is skipped when it does not split into exactly two fields on
    ``":"``, when its content contains any of ``_ ( ) { }``, or when its
    content is shorter than 5 or longer than 100 characters.  Every kept
    poem is wrapped as ``'G' + content + 'E'`` before being encoded.

    Args:
        file_path: Path of the UTF-8 encoded corpus file.

    Returns:
        A tuple ``(poems_vector, word_to_int, words)`` where

        * ``poems_vector`` is a list with one list of integer ids per poem,
        * ``word_to_int`` maps each character to its id (its rank by
          descending frequency) plus an extra entry for ``' '``,
        * ``words`` is the tuple of characters ordered by descending
          frequency.

        When no poem survives the filters the result is
        ``([], {' ': 1}, ())``.
    """
    poems = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            fields = line.strip().split(":")
            if len(fields) != 2:
                # Blank or malformed line: skip it rather than letting a
                # tuple-unpacking ValueError abort the whole preprocessing.
                continue
            content = fields[1]
            if any(ch in content for ch in "_(){}"):
                continue
            if not 5 <= len(content) <= 100:
                continue
            # 'G' and 'E' mark the start and the end of a poem.
            poems.append('G' + content + 'E')

    # Character frequencies over the whole corpus.
    counter = collections.Counter(ch for poem in poems for ch in poem)

    # Vocabulary, most frequent character first.  Built without
    # zip(*pairs) so that an empty corpus yields an empty tuple instead
    # of raising on unpacking.
    words = tuple(word for word, _ in counter.most_common())

    # Each character's id is its position in the vocabulary.
    word_to_int = {word: index for index, word in enumerate(words)}

    # Encode every poem; every character is guaranteed to be in the
    # mapping because the vocabulary was built from these same poems.
    poems_vector = [[word_to_int[ch] for ch in poem] for poem in poems]

    # The space entry is added after encoding and receives
    # len(word_to_int) + 1, so one id is left unused.
    # NOTE(review): if a poem itself contains a space, this overwrites the
    # id that poems_vector already uses for it - confirm with the callers
    # before changing the numbering.
    word_to_int[' '] = len(word_to_int) + 1
    return poems_vector, word_to_int, words
def generate_batch(batch_size,poems_vector,word_to_int):
n_chunk = len(po