import numpy as np
import pandas as pd

# Default file locations used when this module is executed as a script.
INPUT_CSV = "e://yunying/testcsv123.csv"
OUTPUT_CSV = "e://yunying/kafangcsv123.csv"


def _merge_with_next(bins, pos):
    """Fold row ``pos + 1`` of ``bins`` into row ``pos`` and return the result.

    Each row is ``[upper_value, good_count, bad_count]``.  The counts are
    summed and the surviving row takes the upper value of the absorbed row,
    so it keeps describing the right edge of the merged interval.
    """
    bins[pos, 1:] += bins[pos + 1, 1:]
    bins[pos, 0] = bins[pos + 1, 0]
    return np.delete(bins, pos + 1, axis=0)


def _adjacent_chi_square(bins):
    """Return the 2x2 chi-square statistic for every pair of adjacent rows."""
    # Cast to float first: with an integer-typed array the squared cross
    # product and the four-factor denominator can overflow int64.
    good_a = bins[:-1, 1].astype(float)
    bad_a = bins[:-1, 2].astype(float)
    good_b = bins[1:, 1].astype(float)
    bad_b = bins[1:, 2].astype(float)
    numerator = (good_a * bad_b - bad_a * good_b) ** 2 * (
        good_a + bad_a + good_b + bad_b
    )
    denominator = (
        (good_a + bad_a) * (good_b + bad_b) * (good_a + good_b) * (bad_a + bad_b)
    )
    return numerator / denominator


def ceshi(df, col, target, max_interval=5):
    """Bin ``df[col]`` with chi-square merging (ChiMerge) against ``target``.

    Every distinct value of ``col`` starts as its own bin.  Bins that contain
    no "good" samples (``target`` sums to 0) or no "bad" samples (count minus
    sum is 0) are merged with a neighbour, then the adjacent pair with the
    smallest chi-square statistic is merged repeatedly until at most
    ``max_interval`` bins remain.

    Args:
        df: DataFrame holding both columns.
        col: Name of the feature column to bin.
        target: Name of the label column; its per-bin sum is treated as the
            number of good samples (so it is expected to be 0/1 valued).
        max_interval: Maximum number of bins to keep.

    Returns:
        A list with the largest ``col`` value of each final bin, in ascending
        order (i.e. the right edges of the bins).  The list is also printed.
    """
    per_value = df.groupby(col)[target]
    good_counts = per_value.sum()
    bad_counts = per_value.count() - good_counts
    stats = pd.DataFrame({'good': good_counts, 'bad': bad_counts})
    stats.reset_index(inplace=True)
    # np.array (not to_numpy) guarantees a writable copy we can edit in place.
    bins = np.array(stats)  # rows: [value, good, bad], sorted by value

    # Merge forward every bin that lacks good or bad samples.
    i = 0
    while i <= bins.shape[0] - 2:
        if bins[i, 1] == 0 or bins[i, 2] == 0:
            # Stay on the same row: the merged bin may still be one-sided.
            bins = _merge_with_next(bins, i)
        else:
            i += 1

    # The forward pass never inspects the final row; if it is still one-sided
    # it has no successor, so fold it into its predecessor instead.
    if bins.shape[0] >= 2 and (bins[-1, 1] == 0 or bins[-1, 2] == 0):
        bins = _merge_with_next(bins, bins.shape[0] - 2)

    # ChiMerge proper: keep merging the most similar adjacent pair.  The
    # ``> 1`` guard stops cleanly when max_interval is smaller than 1.
    while bins.shape[0] > max_interval and bins.shape[0] > 1:
        # argmin returns the first minimum, matching list.index(min(...)).
        min_position = int(np.argmin(_adjacent_chi_square(bins)))
        bins = _merge_with_next(bins, min_position)

    b = list(bins[:, 0])
    print(b)
    return b


def BadRateMonotone(df, sortByVar, target):
    """Check whether the per-bin bad rate moves in a single direction.

    The bad rate of a bin is ``(count - sum(target)) / count`` for the rows
    whose ``sortByVar`` equals that bin; bins are visited in ascending order.

    Args:
        df: DataFrame whose ``sortByVar`` column has already been binned.
        sortByVar: Name of the binned column.
        target: Name of the label column (its sum counts good samples).

    Returns:
        True when every adjacent comparison ``rate[i] > rate[i + 1]`` has the
        same outcome, or when there are fewer than two non-empty bins;
        False otherwise.
    """
    per_bin = df.groupby([sortByVar])[target]
    total = per_bin.count()
    print(pd.DataFrame({'total': total}))
    bad = total - per_bin.sum()

    # Skip empty bins (e.g. unobserved categories): their rate would be 0/0.
    occupied = total > 0
    bad_rate = (bad[occupied] / total[occupied]).tolist()

    # Fewer than two bins cannot violate monotonicity.
    if len(bad_rate) < 2:
        return True

    # NOTE(review): the comparison is strict, so a tie counts as "not
    # decreasing"; ties are therefore accepted in increasing sequences only.
    directions = {
        current > following
        for current, following in zip(bad_rate, bad_rate[1:])
    }
    return len(directions) == 1


def main(input_path=INPUT_CSV, output_path=OUTPUT_CSV, max_interval=5):
    """Bin every feature column of a CSV and save the monotone ones.

    The first column of the CSV is used as the target; every other column is
    binned with ``ceshi``.  A column is kept only if ``BadRateMonotone``
    accepts its binned form; otherwise a message is printed and the column is
    skipped.  The kept bin labels are written to ``output_path``.
    """
    data = pd.read_csv(input_path)
    cols = data.columns.values.tolist()
    # The original script read the target as data['id'] for the monotonicity
    # check but as cols[0] for binning; use the first column for both.
    target_col = cols[0]

    result = []
    for col in cols[1:]:
        cut_points = ceshi(data, col, target_col, max_interval)
        if not cut_points:
            # Nothing to cut on (e.g. a column that is entirely NaN).
            continue
        # Open the first bin downwards so no value falls below the lowest edge.
        edges = [-np.inf] + cut_points
        # One label per bin; ceshi may return fewer than max_interval bins.
        binned = pd.cut(
            data[col], edges, right=True, labels=range(len(cut_points))
        )

        check_frame = pd.DataFrame({'target': data[target_col], 'bin': binned})
        if not BadRateMonotone(check_frame, 'bin', 'target'):
            print(col + ' band error, reason is not monotone')
            continue
        result.append(binned)

    dt = pd.DataFrame(data=result)
    t = dt.transpose()
    t.to_csv(output_path, encoding='utf-8', index=False)


if __name__ == "__main__":
    main()

# Note on regroup[col]: col is a column label (e.g. cols[2]), which is why it
# can index the DataFrame directly; positional access would normally be
# written as regroup.iloc[:, 1].