Tree-Based Machine Learning Models
Before deep learning came into wide use, tree-based machine learning models such as decision trees, random forests, GBDT, and XGBoost were widely applied to classification and other common tasks. Below is a summary of some common tree-based models.
1: Decision Trees
Common decision tree algorithms include ID3, C4.5, and CART (classification and regression tree). Many good articles already explain these algorithms in detail, e.g. 深入浅出理解决策树算法(二)-ID3算法与C4.5算法, so here I mainly record my own understanding.
The information gain used by ID3, for a dataset $D$ and an attribute $A$ whose values $v$ partition $D$ into subsets $D_v$:

$$IG(D, A) = H(D) - H(D \mid A) = H(D) - \sum_{v} \frac{|D_v|}{|D|} H(D_v)$$

where $H(D) = -\sum_k p_k \log_2 p_k$ is the Shannon entropy of the class labels.
The "information gain" spoken of in decision trees is simply the mutual information between an attribute and the class. Because attributes with more possible values tend to produce larger information gains under ID3, the algorithm is biased toward selecting them. The problem is that the nodes produced by splitting on such attributes usually contain few samples, which hurts the resulting tree. By analogy, this is a bit like graph-cut algorithms: the original MinCut objective tends to carve off isolated points, so the later NormalizedCut incorporated subgraph size into the objective and solved the problem.
The later C4.5 algorithm instead uses the information gain ratio as the splitting criterion:

$$GainRatio(D, A) = \frac{IG(D, A)}{IV(A)}, \qquad IV(A) = -\sum_{v} \frac{|D_v|}{|D|} \log_2 \frac{|D_v|}{|D|}$$

The normalizer $IV$ (split information) is the entropy of the distribution of samples over the attribute's values: the more values an attribute takes, the larger this entropy, so dividing by it acts as a regularizer against many-valued attributes.
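To make this bias and its correction concrete, here is a small self-contained sketch (a toy example of my own, not from the article above) that computes both criteria with numpy; note how an ID-like attribute with one value per sample gets the maximal information gain, while the gain ratio penalizes it:

import numpy as np

def entropy(y):
    # Shannon entropy of a label array
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return -(p * np.log2(p)).sum()

def info_gain_and_ratio(x, y):
    # information gain and gain ratio of attribute x w.r.t. labels y
    n = len(y)
    cond_entropy, iv = 0.0, 0.0
    for v in np.unique(x):
        mask = (x == v)
        p = mask.sum() / n
        cond_entropy += p * entropy(y[mask])
        iv -= p * np.log2(p)  # split information IV
    ig = entropy(y) - cond_entropy
    return ig, ig / iv

y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
binary_attr = np.array([0, 0, 0, 0, 1, 1, 1, 1])  # perfectly predictive, 2 values
id_attr = np.arange(8)                            # ID-like, 8 distinct values

print(info_gain_and_ratio(binary_attr, y))  # IG = 1.0, gain ratio = 1.0
print(info_gain_and_ratio(id_attr, y))      # IG = 1.0 too, but IV = 3 bits cuts the ratio to 1/3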
import numpy as np

class DecisionTree():
"""This is a decision tree classifier. """
def __init__(self, criteria='ID3'):
self._tree = None
if criteria == 'ID3' or criteria == 'C4.5':
self._criteria = criteria
else:
raise Exception("criterion should be ID3 or C4.5")
    def _calEntropy(self, y):
        '''
        Purpose: compute the Shannon entropy e = -sum(pi * log pi)
        Parameters: y is a numpy array of labels
        Output: the entropy of y
        '''
n = y.shape[0]
labelCounts = {}
for label in y:
if label not in labelCounts.keys():
labelCounts[label] = 1
else:
labelCounts[label] += 1
entropy = 0.0
for key in labelCounts:
prob = float(labelCounts[key])/n
entropy -= prob * np.log2(prob)
return entropy
    def _splitData(self, X, y, axis, cutoff):
        """
        Parameters: X is the feature matrix, y the labels, axis the index of a
                    feature, and cutoff a value taken by that feature
        Output: the subset whose feature at index axis equals cutoff,
                with that feature column removed
        """
ret = []
featVec = X[:,axis]
        n = X.shape[1]  # number of features
X = X[:,[i for i in range(n) if i!=axis]]
for i in range(len(featVec)):
if featVec[i] == cutoff:
ret.append(i)
return X[ret, :], y[ret]
    def _chooseBestSplit(self, X, y):
        """ID3 & C4.5
        Parameters: X is the feature matrix, y the labels
        Purpose: pick the best splitting feature according to information
                 gain (ID3) or information gain ratio (C4.5)
        Output: the index of the best splitting feature
        """
numFeat = X.shape[1]
baseEntropy = self._calEntropy(y)
bestSplit = 0.0
best_idx = -1
for i in range(numFeat):
            featlist = X[:, i]  # the column of the i-th feature
uniqueVals = set(featlist)
curEntropy = 0.0
splitInfo = 0.0
for value in uniqueVals:
sub_x, sub_y = self._splitData(X, y, i, value)
                prob = len(sub_y) / float(len(y))  # fraction of samples taking this value
                curEntropy += prob * self._calEntropy(sub_y)  # accumulate the conditional entropy
                splitInfo -= prob * np.log2(prob)  # split information, used for the gain ratio
IG = baseEntropy - curEntropy
if self._criteria=="ID3":
if IG > bestSplit:
bestSplit = IG
best_idx = i
if self._criteria=="C4.5":
if splitInfo == 0.0:
pass
IGR = IG/splitInfo
if IGR > bestSplit:
bestSplit = IGR
best_idx = i
return best_idx
    def _majorityCnt(self, labellist):
        """
        Parameters: labellist is a list of class labels
        Output: the most frequent label in labellist
        """
labelCount={}
for vote in labellist:
if vote not in labelCount.keys():
labelCount[vote] = 0
labelCount[vote] += 1
        sortedClassCount = sorted(labelCount.items(), key=lambda x: x[1], reverse=True)
return sortedClassCount[0][0]
    def _createTree(self, X, y, featureIndex):
        """
        Parameters: X is the feature matrix, y the labels; featureIndex is a tuple
                    recording the indices of X's features in the original data
        Output: a complete tree built recursively over the current featureIndex
        """
labelList = list(y)
if labelList.count(labelList[0]) == len(labelList):
return labelList[0]
if len(featureIndex) == 0:
return self._majorityCnt(labelList)
bestFeatIndex = self._chooseBestSplit(X,y)
bestFeatAxis = featureIndex[bestFeatIndex]
featureIndex = list(featureIndex)
featureIndex.remove(bestFeatAxis)
featureIndex = tuple(featureIndex)
myTree = {bestFeatAxis:{}}
featValues = X[:, bestFeatIndex]
uniqueVals = set(featValues)
for value in uniqueVals:
            # recursively build a subtree for each value
sub_X, sub_y = self._splitData(X,y, bestFeatIndex, value)
myTree[bestFeatAxis][value] = self._createTree(sub_X, sub_y, \
featureIndex)
return myTree
    def fit(self, X, y):
        """
        Parameters: X is the feature matrix, y the class labels
        Note: type-checks X and y to ensure they are numpy arrays
        Output: self
        """
if isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
pass
else:
try:
X = np.array(X)
y = np.array(y)
            except Exception:
raise TypeError("numpy.ndarray required for X,y")
featureIndex = tuple(['x'+str(i) for i in range(X.shape[1])])
self._tree = self._createTree(X,y,featureIndex)
return self #allow using: clf.fit().predict()
    def _classify(self, tree, sample):
        """
        Classify a single input sample with the trained model.
        Note: the tree is built recursively, and classification is recursive too;
        _classify() classifies one sample at a time.
        """
        featIndex = list(tree.keys())[0]  # the feature at the root of this subtree (e.g. 'x3')
        secondDict = tree[featIndex]  # the subtrees produced by splitting on featIndex
        axis = featIndex[1:]  # index of the root feature in the original data
        key = sample[int(axis)]  # the sample's value for that feature
        valueOfKey = secondDict[key]  # subtree (dict) or leaf label for this value
        if isinstance(valueOfKey, dict):  # inner node: keep classifying recursively
return self._classify(valueOfKey, sample)
else:
return valueOfKey
def predict(self, X):
        if self._tree is None:
            raise NotImplementedError("Estimator not fitted, call `fit` first")
        # check that X is a numpy array
if isinstance(X, np.ndarray):
pass
else:
try:
X = np.array(X)
            except Exception:
raise TypeError("numpy.ndarray required for X")
if len(X.shape) == 1:
return self._classify(self._tree, X)
else:
result = []
for i in range(X.shape[0]):
value = self._classify(self._tree, X[i])
                print(str(i + 1) + "-th sample is classified as: " + str(value))
result.append(value)
return np.array(result)
    def show(self, outpdf):
        if self._tree is None:
            raise NotImplementedError("Estimator not fitted, call `fit` first")
#plot the tree using matplotlib
import treePlotter
treePlotter.createPlot(self._tree, outpdf)
if __name__=="__main__":
trainfile=r"data\train.txt"
testfile=r"data\test.txt"
import sys
sys.path.append(r"F:\CSU\Github\MachineLearning\lib")
import dataload as dload
train_x, train_y = dload.loadData(trainfile)
test_x, test_y = dload.loadData(testfile)
    clf = DecisionTree(criteria="C4.5")
clf.fit(train_x, train_y)
result = clf.predict(test_x)
outpdf = r"tree.pdf"
clf.show(outpdf)
The code above comes from decisionTree and is a fairly simple ID3/C4.5 implementation. Note that ID3 and C4.5 are only used for classification, because both information gain and information gain ratio are computed from class labels. A CART tree, on the other hand, can do both classification and regression. Below is a simple implementation of a CART classification tree, from How To Implement The Decision Tree Algorithm From Scratch In Python - MachineLearningMastery.com.
# CART on the Bank Note dataset
from random import seed
from random import randrange
from csv import reader
# Load a CSV file
def load_csv(filename):
	file = open(filename, 'r')
	lines = reader(file)
	dataset = list(lines)
	return dataset
# Convert string column to float
def str_column_to_float(dataset, column):
for row in dataset:
row[column] = float(row[column].strip())
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
dataset_split = list()
dataset_copy = list(dataset)
fold_size = int(len(dataset) / n_folds)
for i in range(n_folds):
fold = list()
while len(fold) < fold_size:
index = randrange(len(dataset_copy))
fold.append(dataset_copy.pop(index))
dataset_split.append(fold)
return dataset_split
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
correct = 0
for i in range(len(actual)):
if actual[i] == predicted[i]:
correct += 1
return correct / float(len(actual)) * 100.0
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
folds = cross_validation_split(dataset, n_folds)
scores = list()
for fold in folds:
train_set = list(folds)
train_set.remove(fold)
train_set = sum(train_set, [])
test_set = list()
for row in fold:
row_copy = list(row)
test_set.append(row_copy)
row_copy[-1] = None
predicted = algorithm(train_set, test_set, *args)
actual = [row[-1] for row in fold]
accuracy = accuracy_metric(actual, predicted)
scores.append(accuracy)
return scores
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
left, right = list(), list()
for row in dataset:
if row[index] < value:
left.append(row)
else:
right.append(row)
return left, right
# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
# count all samples at split point
n_instances = float(sum([len(group) for group in groups]))
# sum weighted Gini index for each group
gini = 0.0
for group in groups:
size = float(len(group))
# avoid divide by zero
if size == 0:
continue
score = 0.0
# score the group based on the score for each class
for class_val in classes:
p = [row[-1] for row in group].count(class_val) / size
score += p * p
# weight the group score by its relative size
gini += (1.0 - score) * (size / n_instances)
return gini
# Select the best split point for a dataset
def get_split(dataset):
class_values = list(set(row[-1] for row in dataset))
b_index, b_value, b_score, b_groups = 999, 999, 999, None
for index in range(len(dataset[0])-1):
for row in dataset:
groups = test_split(index, row[index], dataset)
gini = gini_index(groups, class_values)
if gini < b_score:
b_index, b_value, b_score, b_groups = index, row[index], gini, groups
return {'index':b_index, 'value':b_value, 'groups':b_groups}
# Create a terminal node value
def to_terminal(group):
outcomes = [row[-1] for row in group]
return max(set(outcomes), key=outcomes.count)
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
left, right = node['groups']
del(node['groups'])
# check for a no split
if not left or not right:
node['left'] = node['right'] = to_terminal(left + right)
return
# check for max depth
if depth >= max_depth:
node['left'], node['right'] = to_terminal(left), to_terminal(right)
return
# process left child
if len(left) <= min_size:
node['left'] = to_terminal(left)
else:
node['left'] = get_split(left)
split(node['left'], max_depth, min_size, depth+1)
# process right child
if len(right) <= min_size:
node['right'] = to_terminal(right)
else:
node['right'] = get_split(right)
split(node['right'], max_depth, min_size, depth+1)
# Build a decision tree
def build_tree(train, max_depth, min_size):
root = get_split(train)
split(root, max_depth, min_size, 1)
return root
# Make a prediction with a decision tree
def predict(node, row):
if row[node['index']] < node['value']:
if isinstance(node['left'], dict):
return predict(node['left'], row)
else:
return node['left']
else:
if isinstance(node['right'], dict):
return predict(node['right'], row)
else:
return node['right']
# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
tree = build_tree(train, max_depth, min_size)
predictions = list()
for row in test:
prediction = predict(tree, row)
predictions.append(prediction)
	return predictions
# Test CART on Bank Note dataset
seed(1)
# load and prepare data
filename = 'data_banknote_authentication.csv'
dataset = load_csv(filename)
# convert string attributes to integers
for i in range(len(dataset[0])):
str_column_to_float(dataset, i)
# evaluate algorithm
n_folds = 5
max_depth = 5
min_size = 10
scores = evaluate_algorithm(dataset, decision_tree, n_folds, max_depth, min_size)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
The CART tree above is a classification tree, using the Gini index as the attribute-selection criterion; a regression tree simply uses MSE as the criterion instead, as sketched below. ML-From-Scratch has a better implementation of both decision trees and regression trees.
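For reference, here is a minimal sketch (my own, written in the same style as gini_index above) of the MSE-based split score a regression tree would use; to_terminal would then return the group mean instead of the majority class:

# MSE-based split score for a regression tree, analogous to gini_index above
# (lower is better); assumes each row's last element is a continuous target.
def mse_index(groups):
    n_instances = float(sum(len(group) for group in groups))
    total = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            continue  # avoid divide by zero
        targets = [row[-1] for row in group]
        mean = sum(targets) / size
        sse = sum((t - mean) ** 2 for t in targets)  # within-group squared error
        total += (sse / size) * (size / n_instances)  # weight group MSE by relative size
    return total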
2: GBDT
The basic idea of gradient boosting trees is to build many trees, feed an input x to be classified through each of them, and sum the results (an additive model, $\hat{y}(x) = \sum_m f_m(x)$ where each $f_m$ is a tree); this is unlike random forests (bagging), which aggregate by voting.
So how is each $f_m$ chosen? Gradient boosting uses the value of the negative gradient of the loss function as the value of each leaf node (non-leaf nodes keep splitting, so they carry no value); note the gradient is taken with respect to the previous (current) prediction. For regression, GBDT generally uses MSE as the loss function (the loss function is what measures, when choosing a split, how good the chosen attribute and value are); with MSE the negative gradient is simply the residual $y - \hat{y}$. For classification it uses the cross-entropy loss, and in addition discrete labels (e.g. categories) are converted to one-hot vectors so the problem is treated as regression. Below is GBDT code from https://github.com/eriklindernoren/ML-From-Scratch.
import numpy as np
import progressbar

# Helper classes from the ML-From-Scratch repo (exact module paths may differ by version)
from mlfromscratch.utils import to_categorical
from mlfromscratch.utils.misc import bar_widgets
from mlfromscratch.deep_learning.loss_functions import SquareLoss, CrossEntropy
from mlfromscratch.supervised_learning import RegressionTree

class GradientBoosting(object):
    """Super class of GradientBoostingClassifier and GradientBoostingRegressor.
Uses a collection of regression trees that trains on predicting the gradient
of the loss function.
Parameters:
-----------
n_estimators: int
The number of classification trees that are used.
learning_rate: float
The step length that will be taken when following the negative gradient during
training.
min_samples_split: int
The minimum number of samples needed to make a split when building a tree.
min_impurity: float
The minimum impurity required to split the tree further.
max_depth: int
The maximum depth of a tree.
regression: boolean
True or false depending on if we're doing regression or classification.
"""
def __init__(self, n_estimators, learning_rate, min_samples_split,
min_impurity, max_depth, regression):
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.min_samples_split = min_samples_split
self.min_impurity = min_impurity
self.max_depth = max_depth
self.regression = regression
self.bar = progressbar.ProgressBar(widgets=bar_widgets)
# Square loss for regression
# Log loss for classification
self.loss = SquareLoss()
if not self.regression:
self.loss = CrossEntropy()
# Initialize regression trees
self.trees = []
for _ in range(n_estimators):
tree = RegressionTree(
min_samples_split=self.min_samples_split,
min_impurity=min_impurity,
max_depth=self.max_depth)
self.trees.append(tree)
def fit(self, X, y):
y_pred = np.full(np.shape(y), np.mean(y, axis=0))
for i in self.bar(range(self.n_estimators)):
gradient = self.loss.gradient(y, y_pred)
self.trees[i].fit(X, gradient)
update = self.trees[i].predict(X)
# Update y prediction
y_pred -= np.multiply(self.learning_rate, update)
def predict(self, X):
y_pred = np.array([])
# Make predictions
for tree in self.trees:
update = tree.predict(X)
update = np.multiply(self.learning_rate, update)
y_pred = -update if not y_pred.any() else y_pred - update
if not self.regression:
# Turn into probability distribution
y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)
# Set label to the value that maximizes probability
y_pred = np.argmax(y_pred, axis=1)
return y_pred
class GradientBoostingRegressor(GradientBoosting):
def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2,
min_var_red=1e-7, max_depth=4, debug=False):
super(GradientBoostingRegressor, self).__init__(n_estimators=n_estimators,
learning_rate=learning_rate,
min_samples_split=min_samples_split,
min_impurity=min_var_red,
max_depth=max_depth,
regression=True)
class GradientBoostingClassifier(GradientBoosting):
def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,
min_info_gain=1e-7, max_depth=2, debug=False):
super(GradientBoostingClassifier, self).__init__(n_estimators=n_estimators,
learning_rate=learning_rate,
min_samples_split=min_samples_split,
min_impurity=min_info_gain,
max_depth=max_depth,
regression=False)
def fit(self, X, y):
y = to_categorical(y)
super(GradientBoostingClassifier, self).fit(X, y)
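A usage sketch for the classes above (my own example; it assumes the ML-From-Scratch package is importable and uses scikit-learn purely to get a toy dataset):

# Fit the GBDT classifier above on a toy dataset; sklearn is used only for data.
from sklearn import datasets

data = datasets.load_iris()
X, y = data.data, data.target

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5)
clf.fit(X, y)
y_hat = clf.predict(X)
print("train accuracy:", np.mean(y_hat == y))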
3: XGBoost
XGBoost is also a gradient boosting tree at heart; the authors simply added many engineering features to the implementation, such as distributed training, which I don't yet understand deeply. Here I mainly walk through XGBoost's core algorithm.
Recall the second-order Taylor expansion of a function:

$$f(x + \Delta x) \approx f(x) + f'(x)\,\Delta x + \frac{1}{2} f''(x)\,\Delta x^2$$

The loss function of a gradient boosting tree at step $t$ can be written as:

$$\mathcal{L}^{(t)} = \sum_{i=1}^{n} l\big(y_i,\ \hat{y}_i^{(t-1)} + f_t(x_i)\big) + \Omega(f_t)$$

where the first term is the bias (data-fit) term and the second is the regularizer, $\Omega(f) = \gamma T + \frac{1}{2}\lambda \sum_{j=1}^{T} w_j^2$ for a tree with $T$ leaves and leaf weights $w_j$. Taylor-expanding the first term around $\hat{y}_i^{(t-1)}$ gives:

$$\mathcal{L}^{(t)} \approx \sum_{i=1}^{n} \Big[ l\big(y_i, \hat{y}_i^{(t-1)}\big) + g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i) \Big] + \Omega(f_t)$$

with $g_i = \partial_{\hat{y}^{(t-1)}} l\big(y_i, \hat{y}^{(t-1)}\big)$ and $h_i = \partial^2_{\hat{y}^{(t-1)}} l\big(y_i, \hat{y}^{(t-1)}\big)$. Because $l\big(y_i, \hat{y}_i^{(t-1)}\big)$ is a constant, this can be further simplified to:

$$\tilde{\mathcal{L}}^{(t)} = \sum_{i=1}^{n} \Big[ g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i) \Big] + \Omega(f_t)$$

Grouping samples by the leaf they fall into, $I_j = \{ i \mid q(x_i) = j \}$ with $f_t(x_i) = w_{q(x_i)}$, transforms this into:

$$\tilde{\mathcal{L}}^{(t)} = \sum_{j=1}^{T} \Big[ \Big(\sum_{i \in I_j} g_i\Big) w_j + \frac{1}{2}\Big(\sum_{i \in I_j} h_i + \lambda\Big) w_j^2 \Big] + \gamma T$$

This has a closed-form solution:

$$w_j^{*} = -\frac{\sum_{i \in I_j} g_i}{\sum_{i \in I_j} h_i + \lambda}, \qquad \tilde{\mathcal{L}}^{(t)}(q) = -\frac{1}{2} \sum_{j=1}^{T} \frac{\big(\sum_{i \in I_j} g_i\big)^2}{\sum_{i \in I_j} h_i + \lambda} + \gamma T$$

The left formula replaces GBDT's negative gradient as the leaf value, and the right one serves as the objective for scoring how good a candidate node split is.
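As a concrete illustration of these two formulas (toy numbers of my own; the $\gamma T$ term is omitted), the optimal leaf weights and the gain of a candidate split can be computed directly from the per-sample $g_i$ and $h_i$:

import numpy as np

lam = 1.0  # L2 regularization strength lambda on leaf weights

def leaf_weight(g, h):
    # w* = -G / (H + lambda), the closed-form optimal leaf value
    return -g.sum() / (h.sum() + lam)

def structure_score(g, h):
    # -1/2 * G^2 / (H + lambda); more negative = better fit
    return -0.5 * g.sum() ** 2 / (h.sum() + lam)

g = np.array([-0.8, -0.6, 0.5, 0.7])    # first-order gradients per sample
h = np.array([0.16, 0.24, 0.25, 0.21])  # second-order gradients per sample

left, right = slice(0, 2), slice(2, 4)  # candidate split: samples {0,1} vs {2,3}
gain = structure_score(g, h) - (structure_score(g[left], h[left]) +
                                structure_score(g[right], h[right]))
print("w_left  =", leaf_weight(g[left], h[left]))    # 1.0
print("w_right =", leaf_weight(g[right], h[right]))  # about -0.82
print("split gain =", gain)  # positive, so the split improves the objective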
The implementation below, also from ML-From-Scratch, builds on this logistic loss:

import numpy as np
import progressbar

# Helpers from the ML-From-Scratch repo (exact module paths may differ by version)
from mlfromscratch.utils import to_categorical
from mlfromscratch.utils.misc import bar_widgets
from mlfromscratch.supervised_learning import XGBoostRegressionTree
from mlfromscratch.deep_learning.activation_functions import Sigmoid

class LogisticLoss():
    def __init__(self):
        sigmoid = Sigmoid()
        self.log_func = sigmoid
        self.log_grad = sigmoid.gradient
    def loss(self, y, y_pred):
        y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        p = self.log_func(y_pred)
        return -(y * np.log(p) + (1 - y) * np.log(1 - p))  # negative log-likelihood
# gradient w.r.t y_pred
def gradient(self, y, y_pred):
p = self.log_func(y_pred)
return -(y - p)
# w.r.t y_pred
def hess(self, y, y_pred):
p = self.log_func(y_pred)
return p * (1 - p)
class XGBoost(object):
"""The XGBoost classifier.
Reference: http://xgboost.readthedocs.io/en/latest/model.html
Parameters:
-----------
n_estimators: int
The number of classification trees that are used.
learning_rate: float
The step length that will be taken when following the negative gradient during
training.
min_samples_split: int
The minimum number of samples needed to make a split when building a tree.
min_impurity: float
The minimum impurity required to split the tree further.
max_depth: int
The maximum depth of a tree.
"""
def __init__(self, n_estimators=200, learning_rate=0.001, min_samples_split=2,
min_impurity=1e-7, max_depth=2):
self.n_estimators = n_estimators # Number of trees
self.learning_rate = learning_rate # Step size for weight update
self.min_samples_split = min_samples_split # The minimum n of sampels to justify split
self.min_impurity = min_impurity # Minimum variance reduction to continue
self.max_depth = max_depth # Maximum depth for tree
self.bar = progressbar.ProgressBar(widgets=bar_widgets)
# Log loss for classification
self.loss = LogisticLoss()
# Initialize regression trees
self.trees = []
for _ in range(n_estimators):
tree = XGBoostRegressionTree(
min_samples_split=self.min_samples_split,
min_impurity=min_impurity,
max_depth=self.max_depth,
loss=self.loss)
self.trees.append(tree)
def fit(self, X, y):
y = to_categorical(y)
y_pred = np.zeros(np.shape(y))
for i in self.bar(range(self.n_estimators)):
tree = self.trees[i]
y_and_pred = np.concatenate((y, y_pred), axis=1)
tree.fit(X, y_and_pred)
update_pred = tree.predict(X)
y_pred -= np.multiply(self.learning_rate, update_pred)
def predict(self, X):
y_pred = None
# Make predictions
for tree in self.trees:
# Estimate gradient and update prediction
update_pred = tree.predict(X)
if y_pred is None:
y_pred = np.zeros_like(update_pred)
y_pred -= np.multiply(self.learning_rate, update_pred)
# Turn into probability distribution (Softmax)
y_pred = np.exp(y_pred) / np.sum(np.exp(y_pred), axis=1, keepdims=True)
# Set label to the value that maximizes probability
y_pred = np.argmax(y_pred, axis=1)
return y_pred
Note:
Compared with neural networks, tree-based models have the advantage of needing no extra feature preprocessing such as scaling (standardization, normalization): differences in scale across feature dimensions do not affect the result, as the quick check below illustrates. Moreover, tree models are easier to interpret, whereas neural networks offer poor interpretability.
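A quick sanity check of the scaling claim (my own example, using scikit-learn rather than the code above): rescaling a feature changes the learned thresholds but not the resulting partition or predictions.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.rand(200, 2)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

X_scaled = X.copy()
X_scaled[:, 0] *= 1000.0  # blow up the scale of the first feature

pred_raw = DecisionTreeClassifier(random_state=0).fit(X, y).predict(X)
pred_scaled = DecisionTreeClassifier(random_state=0).fit(X_scaled, y).predict(X_scaled)
print((pred_raw == pred_scaled).all())  # True: the tree is insensitive to the rescaling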