import operator
import csv
import numpy as np
def readDataset(filename):
    '''
    Read a CSV data file and split its rows into training and test sets.

    The first row of the file is treated as the header. The first column of
    the header and of every data row is dropped. The train/test split is
    hard-coded by 1-based row number over the first 17 data rows, so the file
    must contain at least 17 data rows (an IndexError is raised otherwise).

    :param filename: name of the data file, in CSV format
    :return: tuple ``(dataset, labels, trainDataset, testDataset)`` where
        ``dataset`` is the list of all data rows (first column removed),
        ``labels`` is the list of feature names taken from header columns 1-6,
        and ``trainDataset`` / ``testDataset`` are lists of rows from ``dataset``
    '''
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        header_row = next(reader)
        labels = header_row[1:7]
        # Drop the first column of every row.
        dataset = [line[1:] for line in reader]

    # 1-based numbers of the rows that go into the training set.
    trainIndex = {1, 2, 3, 6, 7, 10, 14, 15, 16, 17}
    trainDataset = []
    testDataset = []
    for i in range(1, 18):
        row = dataset[i - 1]
        if i in trainIndex:
            trainDataset.append(row)
        else:
            testDataset.append(row)

    # To reproduce the results in the book, the fourth data row is also added
    # to the training set (it stays in the test set as well).
    trainDataset.append(dataset[3])
    return dataset, labels, trainDataset, testDataset
def Gini(dataset):
    '''
    Compute the Gini value of a data set.

    The class label of each sample is read from its last element. The result
    is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of samples that
    carry label ``k``.

    :param dataset: input data set, a sequence of samples whose last element
        is the class label
    :return: the Gini value as a float; 0.0 for an empty data set
    '''
    numdata = len(dataset)
    if numdata == 0:
        # No samples: return 0.0 instead of dividing by zero below.
        return 0.0

    # Count how many samples carry each class label.
    counts = {}
    for featVec in dataset:
        label = featVec[-1]
        counts[label] = counts.get(label, 0) + 1

    gini = 1.0
    for count in counts.values():
        prop = count / numdata
        gini -= prop ** 2
    return gini
def splitDataset(dataset, axis, value):
'''
对某个特征进行划分后的数据集
:param dataset: 数据集
:param axis: 划分属性的下标
:param value: 划分属性值
:return: 返回剩余数据集
'''
restDataset = []
for featVec in dataset:
if featVec[axis] == value:
restFeatVec = featVec[:axis]
restFeatVec.extend(featVec[axis + 1:])
restDataset.append(re
西瓜书 课后习题4.4 基尼指数 未剪枝 预剪枝 后剪枝
最新推荐文章于 2023-07-29 21:53:08 发布