# Decision tree example.
#
# Reads the AllElectronics CSV, one-hot encodes the categorical features,
# trains an entropy-based decision tree, predicts one hand-made sample and
# renders the fitted tree with Graphviz.

import csv

import graphviz
from sklearn import preprocessing
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

# Optional imports, only needed for the commented-out train/test split below.
# import numpy as np
# from sklearn.model_selection import train_test_split
# from six import StringIO

# Read the CSV file: the first row holds the column titles, every following
# row is one sample. The features go into a list of dicts, the class labels
# into a plain list.
featureList = []
labelList = []
# newline='' is what the csv module asks for; the `with` block guarantees the
# file is closed even if parsing fails.
with open(r'../file/AllElectronics.csv', 'rt', newline='') as allElectronicsData:
    reader = csv.reader(allElectronicsData)
    headers = next(reader)
    print(headers)

    # Alternative: convert the raw data to numeric form and split it by column.
    # axis=1 means "columns": split the data set between the 4th and 5th
    # column into an X set and a Y set.
    # x, y = np.split(data,(4,),axis=1)
    # Split into training and test data for cross-validation.
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state=2)
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline);
            # indexing such a row would raise IndexError.
            continue
        # The last column is the class label.
        labelList.append(row[-1])
        # Columns 1 .. n-2 are the features, keyed by their header name.
        # Column 0 is skipped (presumably a row id -- confirm against the CSV).
        rowDict = {headers[i]: row[i] for i in range(1, len(row) - 1)}
        featureList.append(rowDict)

print('labellist:', labelList)
print('featureList:', featureList )

# Convert the list of feature dicts into a feature matrix.
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
# inverse_transform can turn the feature matrix back into the original data:
# print(vec.inverse_transform(dummyX) == featureList)

# Rows of the matrix are samples, columns are features; a 0 means the sample
# does not have that feature.
print("dummyX: " + str(dummyX))
# Print the generated feature column names.
print(vec.get_feature_names_out())

print("labelList: " + str(labelList))

# Binarize the class labels.
lb = preprocessing.LabelBinarizer(sparse_output=False)
dummyY = lb.fit_transform(labelList)
print("dummyY: " + str(dummyY))

# Train a decision tree classifier, using information entropy as the split
# criterion.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print("clf: " + str(clf))

# Export the tree structure as Graphviz DOT source. out_file=None makes
# export_graphviz return the DOT text as a string instead of writing a file.
# with open("allElectronicInformationGainOri.dot", 'w') as f:
f = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=vec.get_feature_names_out(),
    filled=True,
    rounded=True,
)

# The importances reflect the influence of each feature: the larger the
# value, the bigger the role the feature plays in the classification.
print('feature_importances_:', clf.feature_importances_)

# Build one test sample from the first training row and predict its class.
oneRowX = dummyX[0, :]
print("oneRowX: " + str(oneRowX))

# dummyX[0, :] is a view into dummyX; copy it so that editing the new sample
# does not silently overwrite the first training row (and oneRowX).
newRowX = oneRowX.copy()
newRowX[0] = 1
newRowX[2] = 0
# predict() expects a 2-D input, so wrap the single sample in a list.
newRowX = [newRowX]
print("newRowX: " + str(newRowX))

predictedY = clf.predict(newRowX).reshape(-1, 1)
print("predictedY: " + str(predictedY))

# When test samples are available, evaluate with the statements below.
# score = clf.score(x_test, y_test)  # returns the prediction accuracy
# print(score)

graph = graphviz.Source(f)
# If the labels contain Chinese characters, use the statement below instead.
# graph = graphviz.Source(f.replace('helvetica', '"Microsoft YaHei"'), encoding='utf-8')
graph.render('AllElectronics')
# 决策树实例
# 最新推荐文章于 2023-12-15 21:48:15 发布