# 从网上下载的小说 txt 文件往往含有较多的换行符、空格等字符,这些字符在生成训练样本时是不需要的,甚至会导致出错。因此需要对这些文本进行预处理:将所有行合并成一行,并删除无用的字符。
import random
# Characters deleted from every line: ideographic space (U+3000), ASCII space
# and NUL.  One translate() pass replaces the chained .replace() calls.
_DELETE_TABLE = str.maketrans('', '', '\u3000 \x00')

# Candidate separators used to glue the cleaned lines into a single line.
# ',' is listed twice, so it is drawn twice as often as the other entries.
SPLIT_CHARS = (',', ',', ':', '-', ' ', ';', '。')


def clean_line(line):
    """Return *line* with unwanted characters removed, or '' if it is dropped.

    The line is stripped of surrounding whitespace first.  A line whose
    stripped form is empty or a single character is dropped.  Otherwise every
    ideographic space, ASCII space and NUL character is deleted; the result
    may still be '' (e.g. a line made only of NULs), which also means "drop".
    """
    stripped = line.strip()
    if len(stripped) <= 1:
        return ''
    return stripped.translate(_DELETE_TABLE)


def merge_lines(raw_lines, separator=None):
    """Clean every line of *raw_lines* and join the survivors into one string.

    Args:
        raw_lines: iterable of text lines (an open text file works).
        separator: string placed between kept lines.  When None, one entry of
            SPLIT_CHARS is picked with random.choice.

    Returns:
        The joined single-line string.  It contains no newlines at the joins,
        but it does contain the separator, which may be a space.
    """
    kept = [cleaned for cleaned in map(clean_line, raw_lines) if cleaned]
    if separator is None:
        separator = random.choice(SPLIT_CHARS)
    return separator.join(kept)


def main(src_path='novels.txt', dst_path='novels_corpus.txt', separator=None):
    """Build the one-line corpus from *src_path* and write it to *dst_path*.

    Prints four numbers: the count of distinct characters, the merged length
    (twice, as the original script did), and the length after the final
    space removal.
    """
    with open(src_path, mode='r', encoding='utf-8') as src:
        merged = merge_lines(src, separator)

    print(len(set(merged)))
    print(len(merged))

    print(len(merged))
    # str.replace returns a new string; the original script discarded the
    # result, so this final space removal never took effect.  Note that when
    # the separator is ' ', the kept lines end up directly concatenated.
    corpus = merged.replace(' ', '')
    print(len(corpus))

    with open(dst_path, 'w', encoding='utf-8') as dst:
        dst.write(corpus)


if __name__ == '__main__':
    main()