#1引入对应的软件包
import numpy as np
import pandas as pd
import sklearn.metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import _tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.feature_extraction import DictVectorizer
import re
# 2. Model building
# 2.1 Data loading
labeled_file = r"D:\金融集团\分析-体系报表\20190927-钢镚bc区分建模\决策树模型\train_data.csv"
unlabeled_file = r"D:\金融集团\分析-体系报表\20190927-钢镚bc区分建模\决策树模型\test_data.csv"
# Path of the .dot file that holds the decision-tree rules.
rule_dot = r'D:\金融集团\分析-体系报表\20190927-钢镚bc区分建模\决策树模型\iris_tree.dot'
# Path where the model is saved so it can be loaded later to predict other data sets.
treefile = r'D:\金融集团\分析-体系报表\20190927-钢镚bc区分建模\决策树模型\iris_tree.plk'

# Samples whose class ("result_1") is known.
labeled_data = pd.read_csv(labeled_file)
# The label is cast before fillna, so a missing label raises here instead of
# being silently turned into class 0.
labeled_data["result_1"] = labeled_data["result_1"].astype("int")
labeled_data = labeled_data.fillna(0)

# Samples whose class is unknown.
unlabeled_data = pd.read_csv(unlabeled_file)
# errors="ignore": an unlabeled file may legitimately have no "result_1" column.
# fillna(0): apply the same missing-value handling as the labeled features so
# both feature matrices are preprocessed identically.
unlabeled_data = (
    unlabeled_data
    .drop(["result_1"], axis=1, errors="ignore")
    .fillna(0)
    .values
)

# 2.2 Train/test split
y = labeled_data["result_1"].values
x = labeled_data.drop(["result_1"], axis=1).values
# 80/20 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=13
)

# Candidate values for the key hyper-parameters (edit these to tune the search).
# Per the original author's note the best combination is chosen by f1-score;
# the search itself is configured outside this section.
max_depth_ = range(1, 9, 1)
min_samples_leaf_ = range(1, 9, 1)
min_impurity_decrease_ = np.linspace(0, 1, 10)
search_grid = {
"max_depth":max_depth_,
"min_samples_leaf" :min_samples_leaf_,

最低0.47元/天 解锁文章
1101

被折叠的 条评论
为什么被折叠?



