机器学习手撕代码(2)决策树及随机森林
- 本篇分享决策树及随机森林的手写实现:DTandRF.py 包含决策树和随机森林两个模型的代码,utils.py 提供结果可视化与评估指标的工具。
- dataset见本系列第0篇。
DTandRF.py
import numpy as np
from datasets.dataset import DataSet
from sklearn.model_selection import train_test_split
from utils import Visualization,Metrics
def Gini(targets):
    """Return the Gini impurity of a collection of integer class labels.

    Computes ``1 - sum(count_k ** 2) / N ** 2`` where ``count_k`` is the
    number of occurrences of label ``k``.  ``N`` is the number of labels plus
    a small epsilon (1e-5), which keeps the division defined for an empty
    input (the result is then 1.0) and makes the value for non-empty input
    very slightly larger than the exact impurity.

    Args:
        targets: Sequence of non-negative integer labels, as accepted by
            ``np.bincount``.

    Returns:
        The epsilon-smoothed Gini impurity.
    """
    class_counts = np.bincount(targets)
    # Epsilon guards against a zero denominator when no samples are given.
    smoothed_total = len(targets) + 1e-5
    sum_of_squares = np.sum(class_counts ** 2)
    return 1 - sum_of_squares / smoothed_total ** 2
class Node:
def __init__(self,max_sections=10):
self.type = None
self.threshold = None
self.feat = None
self.left = None
self.right = None
self.label = None
self.max_sections = max_sections # 寻找最优阈值的时候最多尝试max_sections次
def fit(self,data,targets,stop_n):
count = np.bincount(targets)
condition = len(data)<=stop_n or np.max(count)/np.sum(count)>0.99 # 是否终止分裂
if condition: # 叶节点,选择流入当前节点的数据中类别最多的类作为本叶子节点的标签
self.type = 'leaf'
self.label = np.argmax(count)
else:
self.type = 'root'
best_feat_i = 0
best_hold = 0
best_l_i = None
best_r_i = None
G = None
for i in range(len(data[0])):
feat_i = data[:,i]
# 如果流入当前节点的数据不超过max_sections条,则直接以这些数据的特征值作为候选阈值;否则在该特征的最小值与最大值之间等距取max_sections个数作为候选阈值,并从中寻找最优阈值
if len(targets)<=self.max_sections:
holds = feat_i
else:
holds = np.linspace(np.min(feat_i),np.max(feat_i),self.max_sections)
for hold in holds:
l_i = (feat_i<=hold)
r_i = (feat_i>hold)
new_g = (len(l_i)*Gini(targets[l_i])+len(r_i)*Gini(targets[r_i]))/len(targets)
if G is None or new_g<G:
G = new_g
best_feat_i = i
best_hold = hold
best_l_i = l_i
best_r_i = r_i
self.feat = best_feat_i
self.threshold = best_hold
self.left = Node(self.max_sections) # 递归建立左子树
self.left.fit(data[best_l_i],targets[best_l_i],stop_n)
self.</

本文详细介绍了如何使用Python实现决策树和随机森林模型,包括基尼指数(Gini impurity)的计算、节点划分过程,以及模型的训练、预测与评估。通过实例演示了如何在winequalityN.csv数据集上运行决策树和随机森林,并展示了可视化结果。
最低0.47元/天 解锁文章
9571

被折叠的 条评论
为什么被折叠?



