数据文件:news_classify_data.txt
# Load the raw dataset: every line of the UTF-8 text file is kept (with its
# line ending) as one element of `lines`.
data_path = 'data/data6825/news_classify_data.txt'
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()
文件行数:len(lines) = 56821
需要从原文件中提取题目和类别,并划分训练集和测试集
# Sample record used to show the field layout: fields are separated by '_!_'.
str1 = '6660024717287621123_!_0_!_文化_!_老祖宗俗语:“万恶淫为首”,下一句更是精华,却没几个人能做到\n'
# Split once and reuse the pieces for all three printouts.
_fields = str1.split('_!_')
print(_fields)
# The title is the last field (it still carries the trailing newline).
print('题目:', _fields[-1])
# The category printed here is the field at index 1.
print('类别:', _fields[1])
1.创建数据字典dict_txt
# Demo: collect the distinct characters of a few sample strings.
dict_ = set()  # a set de-duplicates automatically
lines = ['老祖宗俗语', '老俗语']
for line in lines:
    # Iterating a string yields its characters, so update() adds each one.
    dict_.update(line)
# Bare expression: has no effect in a script (it would display the set when
# run as a notebook cell).
dict_
{'俗', '宗', '祖', '老', '语'}
# Demo: give every character in `dict_` a consecutive integer id, first as a
# list of [char, id] pairs, then as a dict built from those pairs.
dict_list = [[s, i] for i, s in enumerate(dict_)]
# The original counter loop left `i` holding the next free id; keep that.
i = len(dict_list)
print('列表格式dict_list:', dict_list)
print('字典格式dict_txt:', dict(dict_list))
# Rebind data_path: from here on it names the dataset under data/ rather than
# the data/data6825/ path assigned earlier in the file.
data_path = 'data/news_classify_data.txt'
# Read a file
def read_file(data_path):
    """Return every line of the UTF-8 text file at ``data_path``.

    Args:
        data_path: Path of the file to read.

    Returns:
        A list with one string per line, in file order. Each string keeps
        its line ending, as produced by ``readlines``. An empty file gives
        an empty list.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        return f.readlines()
# Build the character dictionary
def create_dict(data_path, dict_path='dict_txt.txt'):
    """Build a character-to-id dictionary from the titles in ``data_path``.

    Every line is split on ``'_!_'`` and its last field (with the newline
    removed) is treated as the title. Each distinct title character gets a
    consecutive integer id starting at 0, and a final ``"<unk>"`` entry gets
    the next id. The dictionary is written to ``dict_path`` as the text of
    ``str(dict)`` on a single line.

    Args:
        data_path: Path of the raw dataset file.
        dict_path: Path of the dictionary file to write. Defaults to
            ``'dict_txt.txt'``, the location the original code hard-coded.

    Returns:
        None. The result is the file written at ``dict_path``.
    """
    lines = read_file(data_path)

    # Gather the distinct characters that occur in any title.
    chars = set()
    for line in lines:
        title = line.split('_!_')[-1].replace('\n', '')
        chars.update(title)

    # Sort before numbering: set iteration order for strings can differ
    # between interpreter runs, which would make the ids irreproducible.
    dict_txt = {ch: idx for idx, ch in enumerate(sorted(chars))}

    # Extra entry for characters that are not in the dictionary.
    dict_txt["<unk>"] = len(dict_txt)

    with open(dict_path, 'w', encoding='utf-8') as f:
        f.write(str(dict_txt))
数据字典dict_txt
2.构建训练集测试集
def create_data_list(data_path, dict_path):
    """Encode each record's title as character ids and hand it to write_file.

    Every line of ``data_path`` is split on ``'_!_'``: the last field (with
    the newline removed) is the title and the field at index 1 is the label.
    The title becomes a comma-separated list of ids looked up in the
    dictionary stored on the first line of ``dict_path``. For each record
    the text "ids, a tab, the label, a newline" is passed to
    ``write_file('news_all.txt', ...)``.

    Args:
        data_path: Path of the raw dataset file.
        dict_path: Path of the dictionary file; its first line must be the
            literal text of a dict mapping characters to ids.

    Raises:
        KeyError: A title character is missing from the dictionary and the
            dictionary has no ``"<unk>"`` entry to fall back on.
        IndexError: A line has fewer than two ``'_!_'``-separated fields, or
            the dictionary file is empty.
    """
    import ast  # local import so the block does not rely on file-level imports

    lines = read_file(data_path)
    # The dictionary file holds a dict literal. ast.literal_eval parses it
    # without executing code; this deliberately replaces the eval() call
    # that was used here before.
    dict_txt = ast.literal_eval(read_file(dict_path)[0])
    # Id used for characters that are not in the dictionary (None if absent).
    unk_id = dict_txt.get("<unk>")

    for line in lines:
        fields = line.split('_!_')
        title = fields[-1].replace('\n', '')
        kind = fields[1]

        ids = []
        for s in title:
            code = dict_txt.get(s, unk_id)
            if code is None:
                raise KeyError(s)
            ids.append(str(code))

        # write_file is defined outside this section.
        # NOTE(review): presumably it appends to the named file - confirm.
        write_file('news_all.txt', ','.join(ids) + '\t' + kind + '\n')