前言
接上篇:评分卡模型案例(GiveMeSomeCredit,kaggle数据)(自己练习版本)-CSDN博客
通过评分卡,可以对数据记录进行评分。模型投产时,需要设定一个评分阈值,将低于评分阈值的客户拒绝。
一、计算信用评分
全量样本进行打分
# 本段代码的目的是把输入数据映射到分箱,并且选取相应的分值来计算最后的信用评分
def str_to_int(s):
    """Convert a bin-boundary token to a float.

    Despite the name, the result is always a float. The tokens ``'inf'`` and
    ``'-inf'`` are replaced by large finite sentinels (+/-999999999.0);
    anything else is passed to ``float()``.

    Args:
        s: Boundary token, e.g. ``'-inf'``, ``'inf'`` or ``'3.5'``.

    Returns:
        The boundary as a float.
    """
    # Unbounded edges are represented by finite sentinels rather than real infinities.
    if s == 'inf':
        return 999999999.0
    if s == '-inf':
        return -999999999.0
    return float(s)
def map_value_to_bin(feature_value, feature_to_bin):
    """Return the ``Binning`` label of the first row whose interval holds a value.

    Each row's ``Binning`` entry is read as interval text such as
    ``(-inf,1.5]``: the first character says whether the lower bound is open
    (``(``), the last character says whether the upper bound is open (``)``),
    and the text in between is split on ``,`` into the two bounds, which are
    parsed with ``str_to_int``.

    Args:
        feature_value: The value to locate; it is compared against the bounds.
        feature_to_bin: DataFrame with a ``Binning`` column, scanned row by row.

    Returns:
        The ``Binning`` value of the first matching row, or ``None`` when no
        row's interval contains ``feature_value``.
    """
    for _, row in feature_to_bin.iterrows():
        interval = str(row['Binning'])
        bounds = interval[1:-1].split(',')
        lower = str_to_int(bounds[0])
        upper = str_to_int(bounds[1])

        # An open bound excludes the boundary value itself; a closed one includes it.
        if interval[0] == "(":
            below_lower = feature_value <= lower
        else:
            below_lower = feature_value < lower

        if interval[-1] == ")":
            above_upper = feature_value >= upper
        else:
            above_upper = feature_value > upper

        # NOTE: the checks are phrased as "outside the bound", so a value for
        # which every comparison is False (e.g. NaN) is treated as inside the
        # first row's interval.
        if not (below_lower or above_upper):
            return row['Binning']

    # No interval matched. (Previously `return null`, which is not a Python
    # name and raised NameError instead of returning.)
    return None
def map_to_score(df, score_card):
    """Add up the scorecard points earned by one record.

    For every distinct ``Variable`` in the scorecard, the record's value for
    that variable is mapped to a bin with ``map_value_to_bin`` and the
    ``Score`` of the scorecard row with that ``Binning`` is added to the total.

    Args:
        df: A single record indexable by variable name
            (``calculate_score_with_card`` passes each row via ``apply(axis=1)``).
        score_card: DataFrame with ``Variable``, ``Binning`` and ``Score`` columns.

    Returns:
        The sum of the selected ``Score`` values (without any base offset).
    """
    total = 0
    for variable in score_card['Variable'].unique():
        # Scorecard rows that describe this variable only.
        card_rows = score_card[score_card['Variable'] == variable]
        bin_label = map_value_to_bin(df[variable], card_rows)
        is_selected = card_rows['Binning'] == bin_label
        # Take the first matching row's score.
        total += card_rows.loc[is_selected, 'Score'].iloc[0]
    return total
def calculate_score_with_card(df, score_card, A):
    """Score every row of ``df`` with the scorecard and store it in ``df['score']``.

    Each row's points come from ``map_to_score``; the offset ``A`` is then
    added and the result is cast to ``int``. The ``score`` column is written
    into the passed-in ``df`` itself.

    Args:
        df: DataFrame holding the scorecard variables as columns.
        score_card: Scorecard DataFrame forwarded to ``map_to_score``.
        A: Offset added to every row's summed points.

    Returns:
        The same ``df`` object, now carrying the integer ``score`` column.
    """
    card_points = df.apply(map_to_score, axis=1, args=(score_card,))
    df['score'] = (card_points + A).astype(int)
    return df
# Score the full sample. df_train, score_card and A are not defined in this
# section (presumably they come from the previous article's code -- confirm).
# Note: calculate_score_with_card writes the 'score' column into df_train itself
# and returns that same object.
df_score001 = calculate_score_with_card(df_train,score_card,A)
二、计算阈值表
#----阈值表计算函数-----
def cal_score_threshold_tb(score_df, bin_step=10, is_bad_col_name='SeriousDlqin2yrs', score_col_name='score'):
    """Build a score threshold table from scored records.

    Scores are grouped into half-open bins ``[start, start + bin_step)``. For
    each bin the table holds the customer counts, and for the threshold
    "reject everyone scoring below the bin's upper bound" it holds the
    cumulative counts and ratios of customers lost and bad customers removed.

    Args:
        score_df: DataFrame containing the score column and the bad-flag column.
        bin_step: Width of each score bin; used as the ``range`` step.
        is_bad_col_name: Name of the bad-customer flag column. Its per-bin sum
            is used as the number of bad customers, so it is assumed to hold
            1 for bad and 0 for good -- confirm against the data.
        score_col_name: Name of the score column.

    Returns:
        DataFrame with one row per bin, in ascending score order.
    """
    scores = score_df[score_col_name]

    # ----- first and last bin edges -----
    # floor, not trunc: trunc rounds toward zero, which would start the first
    # bin above a negative minimum score and drop those records.
    bin_start = math.floor(scores.min() / bin_step) * bin_step
    bin_end = (math.floor(scores.max() / bin_step) + 1) * bin_step

    # ----- count good / bad customers per bin -----
    rows = []
    thresholds = []
    for cur_bin in range(bin_start, bin_end, bin_step):
        upper = cur_bin + bin_step
        in_bin = (scores >= cur_bin) & (scores < upper)
        flags = score_df.loc[in_bin, is_bad_col_name]
        bad_cn = flags.sum()
        cn = flags.shape[0]
        rows.append(['[' + str(cur_bin) + '-' + str(upper) + ')', cn, cn - bad_cn, bad_cn])
        # Build the label from the numeric bound; re-parsing the bin name by
        # splitting on '-' breaks once a bound is negative.
        thresholds.append('<' + str(upper))

    # Build the frame once instead of appending row by row.
    score_thd = pd.DataFrame(rows, columns=['分组名称', '本组客户', '本组好客户', '本组坏客户'])

    # ----- totals and cumulative columns -----
    score_thd['总客户'] = score_thd['本组客户'].sum()
    score_thd['总好客户'] = score_thd['本组好客户'].sum()
    score_thd['总坏客户'] = score_thd['本组坏客户'].sum()
    score_thd['阈值'] = thresholds
    score_thd['损失客户'] = score_thd['本组客户'].cumsum()
    score_thd['损失客户%'] = score_thd['损失客户'] / score_thd['总客户']
    score_thd['损失好客户'] = score_thd['本组好客户'].cumsum()
    score_thd['损失好客户%'] = score_thd['损失好客户'] / score_thd['总好客户']
    score_thd['剔除坏客户'] = score_thd['本组坏客户'].cumsum()
    score_thd['剔除坏客户%'] = score_thd['剔除坏客户'] / score_thd['总坏客户']

    # Empty bins get a denominator of 1 so their bad ratio is 0 rather than 0/0.
    group_size = score_thd['本组客户'].copy()
    group_size[group_size == 0] = 1
    score_thd['本组坏客户占比'] = score_thd['本组坏客户'] / group_size
    score_thd['损失客户中坏客户占比'] = score_thd['剔除坏客户'] / score_thd['损失客户']
    return score_thd
# -------- Build the score threshold table ---------------
# Keep only the two columns cal_score_threshold_tb reads: the score and the bad flag.
score_df = df_score001[['score','SeriousDlqin2yrs']]
# Use 20-point score bins instead of the function's default bin_step of 10.
score_thd = cal_score_threshold_tb(score_df,bin_step=20)
输出如下:
| 分组名称 | 本组客户 | 本组好客户 | 本组坏客户 | 阈值 | 损失客户 | 损失客户% | 损失好客户 | 损失好客户% | 剔除坏客户 | 剔除坏客户% | 本组坏客户占比 | 损失客户中坏客户占比 |
0 | [300-320) | 15 | 4 | 11 | <320 | 15 | 0.01% | 4 | 0.00% | 11 | 0.11% | 73.33% | 73.33% |
1 | [320-340) | 163 | 40 | 123 | <340 | 178 | 0.12% | 44 | 0.03% | 134 | 1.36% | 75.46% | 75.28% |
2 | [340-360) | 257 |