Reading and saving data
import pandas as pd

# Read a CSV, treating the first column as the index
primary_math_data = pd.read_csv("primary_math_update_model_0731.csv", encoding="utf-8", index_col=0)
# Write results without the index column (avoids a stray "Unnamed: 0" column on the next read)
result_zhiliang_v2.to_csv("submission.csv", encoding="utf-8", index=False)
# Write with a tab separator
primary_data.to_csv("0901.csv", sep="\t", encoding="utf-8", index=False)
Splitting the dataset
# random_state sets the random seed so the shuffle is reproducible
sample_datas_class = primary_math_data.sample(frac=1, replace=False, random_state=514, axis=0)
sample_datas_class.info()
# 80/20 train/test split
len_data_class = len(sample_datas_class)
gap = round(len_data_class * 0.8)
data_train_class = sample_datas_class[:gap]
print(len(data_train_class))
data_test_class = sample_datas_class[gap:]
print(len(data_test_class))
# Reset the index so the test set is numbered from 0
data_test_class = data_test_class.reset_index(drop=True)
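The same 80/20 split can be done in one call with scikit-learn's train_test_split; a minimal sketch, assuming scikit-learn is available (the stratify argument additionally keeps the label proportions equal across the two splits):
from sklearn.model_selection import train_test_split

data_train_class, data_test_class = train_test_split(
    primary_math_data,
    test_size=0.2,                          # hold out 20% for testing
    random_state=514,                       # same seed as above
    stratify=primary_math_data["label"],    # preserve label proportions
)
data_test_class = data_test_class.reset_index(drop=True)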
Data cleaning
Remove non-Chinese characters with a regular expression, matching them by their Unicode range (CJK Unified Ideographs, \u4e00-\u9fa5).
import re

def clear_symbol(line):
    # Keep only characters in the CJK Unified Ideographs range
    cleared = re.sub('[^\u4e00-\u9fa5]', '', line)
    return cleared

# Remove non-Chinese characters and build the content column used as model input
df["content"] = "类别" + df['category'].apply(lambda x: str(x)) + "内容" + df["body"].apply(lambda x: clear_symbol(str(x)))
Concatenating two DataFrames vertically
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
df2 = pd.DataFrame({'a': [8, 8], 'b': [9, 9]})
pd.concat([df1, df2], axis=0, ignore_index=True)
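With the two frames above, the concatenated result is:
#    a  b
# 0  1  4
# 1  2  5
# 2  3  6
# 3  8  9
# 4  8  9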
char2id
# Build the char2id vocabulary
import pandas as pd
import numpy as np
import json
from keras.preprocessing.text import Tokenizer

def re_tokenize(texts):
    # num_words: None or an integer, the maximum number of tokens to keep;
    # tokens ranked below this cutoff by frequency are dropped
    tokenizer = Tokenizer(num_words=None, char_level=True)
    tokenizer.fit_on_texts(texts)
    char_list = tokenizer.word_index.keys()
    print("char_list={}".format(len(char_list)))
    # np.load("./char2id.npy", allow_pickle=True) is needed to read this back
    np.save("./char2id.npy", tokenizer.word_index)
    with open('./char2id.json', 'w', encoding='utf-8') as f:
        json.dump(tokenizer.word_index, f, ensure_ascii=False)
    return tokenizer
#df = pd.read_csv("./eda_class_data.csv",sep="\t")
re_tokenize(df["content"])
Aggregation queries
df.groupby(["category"])[["id"]].count().sort_values(by='id')
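The same per-category count can be read off with value_counts, which sorts by frequency (descending) by default:
df["category"].value_counts()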
Building a mapping between the elements of two columns
id2doctype=dict(zip(df['id'],df["doctype"]))
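The dict can then be used with Series.map to look up the doctype for any column of ids (new_df here is a hypothetical frame with an id column):
new_df["doctype"] = new_df["id"].map(id2doctype)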
Dropping a column
primary_data_class_cleared.drop(['Unnamed: 0'], axis=1, inplace=True)
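The "Unnamed: 0" column usually appears when a frame was saved with its index and read back without index_col; saving with index=False avoids creating it in the first place (the filename is just an example):
primary_data_class_cleared.to_csv("primary_data_class_cleared.csv", index=False)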