最近在整理风控建模相关的代码,昨天在知乎上看到某大神写的一个连续型变量分箱代码,这里我对其进行了分析和汇总,整理出来分享给大家,仅供参考:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
def _optimal_binning_boundary(x: pd.Series, y: pd.Series, nan: float = -999.) -> list:
'''
利用决策树获得最优分箱的边界值列表
'''
boundary = [] # 待return的分箱边界值列表
x = x.fillna(nan).values # 填充缺失值
y = y.values
clf = DecisionTreeClassifier(criterion='gini', #“基尼系数”最小化准则划分
max_leaf_nodes=6, # 最大叶子节点数
min_samples_leaf=0.05) # 叶子节点样本数量最小占比
clf.fit(x.reshape(-1, 1), y) # 训练决策树
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
threshold = clf.tree_.threshold
for i in range(n_nodes):
if children_left[i] != children_right[i]: # 获得决策树节点上的划分边界值
boundary.appe