def mc_chiMerge_final(df,var,target,max_groups=None,threshold=None):
    """Chi-square based automatic binning (ChiMerge) for one variable.

    Adjacent groups with the smallest chi-square statistic are merged
    iteratively until either the maximum group count is reached or the
    minimum chi-square value is no longer below the threshold.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset. NOTE: a helper column ``var_bin_count`` is added to it
        as a side effect (kept for backward compatibility).
    var : str
        Name of the column to bin.
    target : str
        Name of the binary label column; values are expected to be
        0 (counted as positive/good) and 1 (counted as negative/bad).
    max_groups : int, optional
        Maximum number of bins; when given it drives the merging.
    threshold : float, optional
        Chi-square stopping threshold. When both ``max_groups`` and
        ``threshold`` are None, the threshold defaults to the 95%
        chi-square critical value with (class count - 1) degrees of
        freedom.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        ``dt_bin``: per-bin statistics (counts, rates, WoE, IV) plus a
        total row; ``dt_summary``: variable-level coverage and total IV.
    """
    import numpy as np
    import pandas as pd
    from scipy.stats import chi2 as chi2_dist

    def mc_chi2(arr):
        # arr: 2-D frequency table (groups x classes); returns the
        # chi-square statistic of the table.
        assert(arr.ndim==2)
        R_N = arr.sum(axis=1)   # row totals
        C_N = arr.sum(axis=0)   # column totals
        N = arr.sum()           # grand total
        # expected frequencies E_ij = R_i * C_j / N
        E = np.ones(arr.shape) * C_N / N
        E = (E.T * R_N).T
        square = (arr - E) ** 2 / E
        # a zero expected frequency cannot serve as a divisor; those
        # cells contribute nothing to the statistic
        square[E == 0] = 0
        return square.sum()

    freq_tab = pd.crosstab(df[var], df[target])
    # work on a private numpy copy so freq_tab itself is never mutated
    freq = freq_tab.values.copy()
    # initial cut points: every distinct value starts its own group;
    # intervals are left-closed/right-open, e.g. cutoffs=[1,2,3] means
    # [1,2), [2,3), [3,+inf)
    cutoffs = freq_tab.index.values
    if max_groups is None:
        # no group cap: fall back to a chi-square threshold at the 95%
        # confidence level (degrees of freedom = class count - 1)
        if threshold is None:
            cls_num = freq.shape[-1]
            # BUGFIX: the original called .isf on the local helper
            # mc_chi2 (AttributeError); the critical value comes from
            # scipy.stats.chi2.
            threshold = chi2_dist.isf(0.05, df=cls_num - 1)
    while True:
        minvalue = None
        minidx = None
        # scan adjacent pairs for the smallest chi-square value
        for i in range(len(freq) - 1):
            v = mc_chi2(freq[i:i+2])
            if minvalue is None or minvalue > v:
                minvalue = v
                minidx = i
        # BUGFIX: with a single group left there is no pair to compare;
        # the original then evaluated `None < threshold` (TypeError).
        if minvalue is None:
            break
        # merge the pair while the group cap is exceeded or the minimum
        # chi-square is still below the threshold
        if (max_groups is not None and max_groups < len(freq)) or (threshold is not None and minvalue < threshold):
            # fold row minidx+1 into row minidx, then drop it along
            # with its cut point
            freq[minidx] = freq[minidx] + freq[minidx+1]
            freq = np.delete(freq, minidx+1, 0)
            cutoffs = np.delete(cutoffs, minidx+1, 0)
        else:
            # minimum chi-square no longer below threshold: stop merging
            break

    def mc_valuegroup_1(x, cutoffs):
        # map a value to its bin index; NaN goes to the sentinel -999
        num_groups = len(cutoffs)
        if pd.isna(x):
            return -999
        for i in range(1, num_groups):
            if cutoffs[i-1] <= x < cutoffs[i]:
                return i - 1
        # anything >= the last cut point falls into the last bin
        return len(cutoffs) - 1

    # assign each sample to its bin
    df["var_bin_count"] = df[var].apply(mc_valuegroup_1, args=(cutoffs,))
    # per-bin sample counts, indexed by bin id
    w1 = pd.DataFrame(df.groupby("var_bin_count").var_bin_count.count())
    w1["group_count"] = w1.index.values
    # human-readable interval labels
    for i in range(0, len(cutoffs)):
        if i < len(cutoffs) - 1:
            w1.loc[i,"var_bin"] = "[" + str(cutoffs[w1.loc[i,"group_count"]]) + "," + str(cutoffs[w1.loc[i+1,"group_count"]]) + ")"
        else:
            w1.loc[i,"var_bin"] = "[" + str(cutoffs[w1.loc[i,"group_count"]]) + "," + "+" + ")"
    try:
        # the NaN bin (if any) keeps its -999 sentinel as its label
        w1.loc[-999,"var_bin"] = w1.loc[-999,"group_count"]
    except KeyError:
        print('the feature nullvalue num is %d ' % (df[var].isnull().sum()))
    w1 = w1.sort_values(by='group_count')[["var_bin","var_bin_count"]]
    w2 = pd.crosstab(df["var_bin_count"], df[target]).sort_index()
    w3 = pd.merge(w1, w2, left_index=True, right_index=True, how='left')
    # label 0 is counted as positive/good, label 1 as negative/bad
    w4 = w3.rename(columns={"var_bin":"分箱区间","var_bin_count":"总样本数",0:"正样本数",1:"负样本数"})
    # distribution statistics
    dt = w4
    dt["行占比"] = dt["总样本数"] / df.shape[0]
    dt["逾期率"] = dt["负样本数"] / dt["总样本数"]
    # WoE & IV per bin
    good_rate = dt.正样本数 / sum(dt.正样本数)
    bad_rate = dt.负样本数 / sum(dt.负样本数)
    rate = good_rate / bad_rate
    dt['woe'] = np.log(rate.astype('float'))
    dt['iv'] = (good_rate - bad_rate) * np.log(rate.astype('float'))
    # bins with a zero class count yield +/-inf; treat those as 0
    dt = dt.replace({'woe': {np.inf: 0, -np.inf: 0}, 'iv': {np.inf: 0, -np.inf: 0}})
    # total row
    dt1 = pd.DataFrame()
    dt1.loc[0,"分箱区间"] = "合计"
    dt1.loc[0,"总样本数"] = df[target].count()
    dt1.loc[0,"正样本数"] = df[df[target]==0][target].count()
    dt1.loc[0,"负样本数"] = df[df[target]==1][target].count()
    dt1.loc[0,"行占比"] = df[target].count() / df.shape[0]
    dt1.loc[0,"逾期率"] = dt1.loc[0,"负样本数"] / dt1.loc[0,"总样本数"]
    dt1.loc[0,"woe"] = "-"
    dt1.loc[0,"iv"] = "-"
    # fix the column order
    order = ['分箱区间', '总样本数', '正样本数', '负样本数', '行占比', '逾期率', 'woe', 'iv']
    dt = dt[order]
    dt1 = dt1[order]
    # BUGFIX: DataFrame.append was removed in pandas 2.0; use concat
    dt_bin = pd.concat([dt, dt1])
    # renumber rows 1..n (row n is the total row)
    dt_bin.index = range(1, len(dt_bin) + 1)
    dt_bin["变量名"] = var
    order = ['变量名','分箱区间', '总样本数', '正样本数', '负样本数', '行占比', '逾期率', 'woe', 'iv']
    dt_bin = dt_bin[order]
    # total IV over the data bins (index labels 1..n-1).
    # BUGFIX: the original sliced loc[1:shape[0]-2]; .loc slicing is
    # label-inclusive, so that silently dropped the last bin's IV.
    total_iv = dt_bin.loc[1:dt_bin.shape[0]-1, "iv"].sum()
    dt_bin.iloc[-1, -1] = total_iv
    # variable-level validity summary: coverage and total IV
    dt_summary = pd.DataFrame()
    dt_summary.loc[0,"变量名"] = var
    dt_summary.loc[0,"覆盖率"] = (df.shape[0] - pd.isna(df[var]).sum()) / df.shape[0]
    dt_summary.loc[0,"IV"] = total_iv
    return dt_bin, dt_summary
# 自动分箱的代码实现(基于卡方) — automatic binning implementation (chi-square / ChiMerge based).
# (Blog footer: latest recommended article published 2024-08-28 15:57:39.)