#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022/3/4 19:53
# @Author : Feng Zhanpeng
# @File : step11_calculate_fun.py
# @Software: PyCharm
"""
单维度策略测算部分指标计算相关函数,直接加载即可
"""
import re
'''
1. 变量描述性统计分析函数
'''
def describe_stat_ana(describe_data,sample_range,seq,sample_type,ana_people):
    """
    Descriptive-statistics analysis of every variable, per sample type and
    per bad-customer definition.

    Depends on module-level globals loaded elsewhere: ``pd``, ``np``,
    ``datetime``, ``sample_type_col``, ``sample_type_target``,
    ``target_ripe``, ``target_del_col``, ``var_dict`` — TODO confirm they are
    defined before this function is called.

    :param describe_data: DataFrame to analyse
    :param sample_range: sample-period label, copied verbatim into the output
    :param seq: running sequence number; counting starts from ``seq``
    :param sample_type: list of sample types to analyse
    :param ana_people: analyst name, used as prefix of the sequence id
    :return: one statistics row per (sample type, target, variable)
    """
    # Output column headers; the Chinese labels are consumed downstream as-is.
    col_seq = ["序号", "分析时间", "样本类型", "坏客户定义", "变量英文名", "变量中文名", "样本区间","%Bad_Rate(不含缺失值)", "%Bad_Rate(包含缺失值)",
               "总样本量","坏样本量", "缺失量", "缺失率","变量取值数(包含缺失值)", "变量取值数(不含缺失值)", "单一值最大占比的变量值", "单一值最大占比的样本量",
               "单一值最大占比", "单一值第二大占比的变量值", "单一值第二大占比的样本量","单一值第二大占比", "单一值第三大占比的变量值", "单一值第三大占比的样本量",
               "单一值第三大占比","单一值前二大占比的总样本量", "单一值前二大占比总和", "单一值前三大占比的总样本量", "单一值前三大占比总和",
               "最大值", "最大值数量", "最大值占比", "最小值", "最小值数量", "最小值占比","平均值", "下四分位数", "中位数", "上四分位数",
               "标准差", "离散系数", "标签1"]
    var_detail = pd.DataFrame(columns=col_seq)
    for sample_type_sub in sample_type:
        print('正在分析的样本类型为:', sample_type_sub)
        # Select the sub-population for this sample type using the global
        # mapping sample_type_col: {type: (column, list-of-values)}.
        # '其他' ("other") keeps rows whose value is NOT in the listed set;
        # 'Total' keeps the whole population.
        if '其他' in sample_type_sub:
            describedata = describe_data[describe_data[sample_type_col[sample_type_sub][0]].map(
                lambda x: str(x) not in sample_type_col[sample_type_sub][1])]
        elif 'Total' in sample_type_sub:
            describedata = describe_data
        else:
            describedata = describe_data[describe_data[sample_type_col[sample_type_sub][0]].map(
                lambda x: str(x) in sample_type_col[sample_type_sub][1])]
        # Every sample type can be analysed against several target definitions.
        target = sample_type_target[sample_type_sub]
        for target_sub in target:
            print("分析的目标字段为:", target_sub)
            mydata1 = describedata[describedata[target_ripe[target_sub][0]]==1]  # keep mature samples only
            mydata1 = mydata1.drop(labels=target_del_col[target_sub], axis=1)
            # NOTE(review): assumes the target column is the LAST column after
            # dropping target_del_col — confirm with the data preparation step.
            for var in mydata1.columns[:-1]:
                print('正在分析的变量为:', var)
                seq1 = ana_people + str(seq)
                ana_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
                sample_des = sample_type_sub
                var_english_name = var
                # Chinese description looked up in the global var_dict; falls
                # back to the English name when the variable is not listed.
                var_chinese_name = var_dict.变量描述[var_dict.变量名 == var].values[0] if sum(var_dict.变量名 == var) else var
                sample_range = sample_range  # no-op self-assignment, kept as-is
                bad_define = target_sub
                # Bad rate with and without missing values.
                data_nona = mydata1[[var, target_sub]].dropna()
                data_withna = mydata1[[var, target_sub]]
                bad_rate_nona = data_nona[target_sub].value_counts(normalize=True)[1] if \
                    len(data_nona[target_sub].value_counts(normalize=True)) > 1 else 0
                # NOTE(review): raises KeyError when the sample holds no bads
                # (no guard like the line above) — confirm inputs always have both classes.
                bad_rate_withna = data_withna[target_sub].value_counts(normalize=True)[1]
                total_cnt = len(data_withna)
                bad_cnt =sum(mydata1[target_sub]==1)
                na_cnt = sum(mydata1[var].isnull())
                na_rate = na_cnt * 1.0 / total_cnt
                unique_cnt_withna = len(data_withna[var].unique())
                unique_cnt_nona = len(data_nona[var].unique())
                # Top-3 most frequent values (missing counted as a value).
                first_info_rate = data_withna[var].value_counts(dropna=False, normalize=True).sort_values(
                    ascending=False)
                first_info_cnt = data_withna[var].value_counts(dropna=False).sort_values(ascending=False)
                max_cnt_value = first_info_rate.index[0]
                max_cnt_value_num = first_info_cnt.tolist()[0]
                max_cnt_value_rate = first_info_rate.max()
                second_cnt_value = first_info_rate.index[1] if len(first_info_rate) > 1 else np.nan
                second_cnt_value_num = first_info_cnt.tolist()[1] if len(first_info_cnt) > 1 else np.nan
                second_cnt_value_rate = first_info_rate.tolist()[1] if len(first_info_rate) > 1 else np.nan
                second_cnt_value_rate_01 = first_info_rate.tolist()[1] if len(first_info_rate) > 1 else np.nan
                third_cnt_value = first_info_rate.index[2] if len(first_info_rate) > 2 else np.nan
                third_cnt_value_num = first_info_cnt.tolist()[2] if len(first_info_cnt) > 2 else np.nan
                third_cnt_value_rate = first_info_rate.tolist()[2] if len(first_info_rate) > 2 else np.nan
                third_cnt_value_rate_01 = first_info_rate.tolist()[2] if len(first_info_rate) > 2 else np.nan
                # nansum treats a missing 2nd/3rd value as 0 in the totals.
                var_max_cnt_value_12num = np.nansum([max_cnt_value_num, second_cnt_value_num])
                var_max_cnt_value_12rate = np.nansum([first_info_rate.tolist()[0], second_cnt_value_rate_01])
                var_max_cnt_value_123num = np.nansum([max_cnt_value_num, second_cnt_value_num, third_cnt_value_num])
                var_max_cnt_value_123rate = np.nansum(
                    [first_info_rate.tolist()[0], second_cnt_value_rate_01, third_cnt_value_rate_01])
                max_value = data_nona[var].max()
                max_value_num = sum(data_nona[var] == max_value)
                # NOTE(review): counts come from the non-missing frame but the
                # denominator total_cnt includes missing rows — confirm intended.
                max_value_rate = sum(data_nona[var] == max_value) / total_cnt
                min_value = data_nona[var].min()
                min_value_num = sum(data_nona[var] == min_value)
                min_value_rate = sum(data_nona[var] == min_value) / total_cnt
                mean_value = data_nona[var].mean()
                q1_value = np.percentile(data_nona[var], 25) if len(data_nona) > 0 else np.nan
                median_value = data_nona[var].median()
                q3_value = np.percentile(data_nona[var], 75) if len(data_nona) > 0 else np.nan
                std_value = np.std(data_nona[var])
                # Coefficient of variation; guarded against a zero mean.
                cv = std_value / mean_value if mean_value != 0 else 0
                # Label 1 ('标签1'): variable passes the first screen when it is
                # not (near-)constant — top-2 values below 99.9% and >= 2 values.
                top_1_2_rate = np.nansum([first_info_rate.tolist()[0], second_cnt_value_rate_01])
                if (top_1_2_rate < 0.999) & (unique_cnt_withna >= 2):
                    if_trigger_choose = 'Y'
                else:
                    if_trigger_choose = 'N'
                # Assemble one output row in col_seq order.
                sum_info = [seq1, ana_time, sample_des, bad_define, var_english_name, var_chinese_name, sample_range,
                            bad_rate_nona, bad_rate_withna, total_cnt, bad_cnt, na_cnt, na_rate, unique_cnt_withna,
                            unique_cnt_nona,max_cnt_value, max_cnt_value_num,max_cnt_value_rate, second_cnt_value,
                            second_cnt_value_num, second_cnt_value_rate,third_cnt_value, third_cnt_value_num,
                            third_cnt_value_rate, var_max_cnt_value_12num,var_max_cnt_value_12rate,var_max_cnt_value_123num,
                            var_max_cnt_value_123rate, max_value, max_value_num, max_value_rate, min_value,
                            min_value_num, min_value_rate, mean_value, q1_value, median_value, q3_value,
                            std_value, cv, if_trigger_choose]
                sum_info01 = pd.DataFrame(sum_info).T
                sum_info01.columns = col_seq
                # NOTE: DataFrame.append was removed in pandas 2.x; this needs
                # pd.concat on newer pandas versions.
                var_detail = var_detail.append(sum_info01, ignore_index=True)
                seq += 1
    return (var_detail)
'''
2. 变量分箱函数
'''
# 2.1 变量分箱函数
def get_bin_lift(data,flag_name,factor_name,min_rate=0.003,sub_div_bin=0.05,min_num=30,method='best',numOfSplit=10):
    """
    Bin a single variable against a target flag and compute per-bin lift.

    :param data: DataFrame to analyse
    :param flag_name: target column to analyse
    :param factor_name: variable column to bin
    :param min_rate: minimum share of samples per bin
    :param sub_div_bin: share of samples reserved for the head / tail bins
    :param min_num: minimum number of samples per bin
    :param method: binning method: 'best' (business driven) or 'equalfreq'
    :param numOfSplit: number of bins when method == 'equalfreq'
    :return: per-bin statistics DataFrame (empty if the variable has no value)
    """
    # Per-value good/bad counts plus the dedicated missing-value bin.
    k=group_by_var_value(data,flag_name,factor_name,'#Bad','#Good')
    k1 = get_na_bin(data, flag_name, factor_name, '#Bad', '#Good')
    if len(k)==0:
        print(flag_name,factor_name,' have no value')
        return pd.DataFrame()
    if method == 'best':
        # Effective minimum bin share: at least min_num samples per bin.
        total = len(data[data[factor_name].notnull()])
        min_rate_act=max(min_num/total,min_rate)
        knot_start=[]
        obs_start=[]
        knot_end = []
        obs_end=[]
        # Number of candidate cut points scanned from each end.
        # NOTE(review): the ratio is inverted when sub_div_bin < min_rate_act,
        # presumably to keep end_cnt >= 1 — confirm intent.
        if sub_div_bin>=min_rate_act:
            end_cnt=int(sub_div_bin/min_rate_act)
        else:
            end_cnt=int(min_rate_act/sub_div_bin)
        # Walk from the head of the distribution: each new knot adds at least
        # min_rate_act of cumulative observation share.
        for i in range(end_cnt):
            if len(knot_start) == 0:
                tmp=k[k['%Cum_Obs']>=min_rate_act].index.tolist()
            else:
                tmp=k[k['%Cum_Obs']>=min_rate_act+obs_start[-1]].index.tolist()
            if len(tmp) > 0:
                knot_start.append(tmp[0])
                obs_start.append(k['%Cum_Obs'][tmp[0]])
        # Same walk from the tail, using the reverse-cumulative share.
        for i in range(end_cnt):
            if len(knot_end) == 0:
                tmp=k[k['%Opps_Cum_Obs']>=min_rate_act].index.tolist()
            else:
                tmp=k[k['%Opps_Cum_Obs']>=min_rate_act+obs_end[-1]].index.tolist()
            if len(tmp) > 0:
                if tmp[-1]>0:
                    knot_end.append(tmp[-1]-1)
                    obs_end.append(k['%Opps_Cum_Obs'][tmp[-1]-1])
        knot = sorted(list(set(knot_start + knot_end)))
        # The last row index is already an implicit boundary; drop it here.
        if len(k) - 1 in knot:
            knot.remove(len(k) - 1)
        # Cap the number of interior cut points at 14 (7 from each end).
        if len(knot) > 15:
            knot = knot[0:7] + knot[-7:]
    elif method == 'equalfreq':
        # Map the equal-frequency split values back to row indices of k.
        knot_value = unsupervise_splitbin(data,factor_name,numOfSplit,method)
        knot=[k[k[factor_name]==i].index[0] for i in knot_value ]
    # NOTE(review): any other `method` value leaves `knot` unbound and raises
    # NameError on the next line — confirm only 'best'/'equalfreq' are passed.
    res1 = important_bin_calculate(k, k1, '#Good', '#Bad', factor_name, [0] + knot + [len(k) - 1])
    return res1
# Count good and bad samples for every distinct value of a variable.
def group_by_var_value(data, flag_name, factor_name, bad_name, good_name, discrete_list=[]):
    """
    Tabulate, for each distinct value of ``factor_name``, the number of good
    and bad samples plus bad-rate and distribution statistics.

    :param data: DataFrame to analyse
    :param flag_name: target column (1 = bad, 0 = good)
    :param factor_name: variable column to group by
    :param bad_name: output column name holding the bad-sample count
    :param good_name: output column name holding the good-sample count
    :param discrete_list: names of categorical (non-numeric) variables
    :return: per-value statistics DataFrame (empty input -> empty frame)
    """
    if len(data) == 0:
        return pd.DataFrame()
    grouped_flag = data.groupby([factor_name])[flag_name]
    cnt_per_value = grouped_flag.count()
    bad_per_value = grouped_flag.sum()
    stats = pd.DataFrame({good_name: cnt_per_value - bad_per_value,
                          bad_name: bad_per_value}).reset_index()
    total_good = float(sum(stats[good_name]))
    total_bad = float(sum(stats[bad_name]))
    grand_total = total_good + total_bad
    obs_per_value = stats[good_name] + stats[bad_name]
    stats['%Bad_Rate'] = stats[bad_name] / obs_per_value
    stats['#Obs'] = obs_per_value
    stats['%Obs'] = obs_per_value / grand_total
    stats['%Cum_Obs'] = np.cumsum(stats['%Obs'])
    # Reverse cumulative share: fraction of samples at this value or above.
    stats['%Opps_Cum_Obs'] = (1 - np.cumsum(stats['%Obs'])) + stats['%Obs']
    if factor_name in discrete_list:
        # Categorical variables are ordered by their bad rate.
        stats = stats.sort_values(by=['%Bad_Rate'], ascending=True)
        stats['Char_Type'] = 'non-numeric'
    else:
        # Numeric variables keep their natural value order.
        stats = stats.sort_values(by=[factor_name], ascending=True)
        stats['Char_Type'] = 'numeric'
    return stats.reset_index(drop=True)
def get_str(x):
    """
    Render a number as a string.

    Floats are formatted with 17 significant digits so that the value can be
    round-tripped exactly; integers and everything else fall back to str().
    """
    float_types = (float, np.float64, np.float16, np.float32)
    int_types = (int, np.int8, np.int16, np.int32, np.int64)
    # Exact type membership (not isinstance) mirrors the original contract.
    if type(x) in float_types:
        return '{0:.17}'.format(x)
    if type(x) in int_types:
        return str(x)
    try:
        return str(x)
    except:
        # Unstringable objects are returned unchanged.
        return x
# Build the grouped (binned) result of an indicator from its cut points.
def important_bin_calculate(data_df,na_df, good_name, bad_name, factor_name, knots_list,if_sort=False):
    """
    Build the per-bin result table of an indicator from its cut-point indices.

    :param data_df: per-value statistics frame (output of group_by_var_value)
    :param na_df: one-row frame for the missing-value bin (from get_na_bin)
    :param good_name: good-sample count column name
    :param bad_name: bad-sample count column name
    :param factor_name: indicator column name
    :param knots_list: sorted row indices of the chosen cut points
    :param if_sort: whether to reverse-sort the bins before cumulating
    :return: per-bin table with WOE / IV / odds / lift statistics
    """
    # 'numeric' vs 'non-numeric' drives how bin labels are rendered.
    flag = data_df['Char_Type'].max()
    temp_df_list = []
    bin_list = []
    # Slice data_df into bins between consecutive knots and build labels.
    for i in range(1, len(knots_list)):
        if i == 1:
            # First bin is left-open: (-inf, first knot value].
            temp_df_list.append(data_df.loc[knots_list[i - 1]:knots_list[i]])
            if flag == 'numeric':
                bin_list.append('(-inf, ' + get_str(data_df[factor_name][knots_list[i]]) + ']')
            else:
                bin_list.append(list(data_df[factor_name])[knots_list[i - 1]:knots_list[i] + 1])
        else:
            temp_df_list.append(data_df.loc[knots_list[i - 1] + 1:knots_list[i]])
            if flag == 'numeric':
                if knots_list[i - 1] + 1 == knots_list[i]:
                    # Single-value bin: [x].
                    bin_list.append('[' + get_str(data_df[factor_name][knots_list[i]]) + ']')
                elif i == len(knots_list) - 1:
                    # Last bin is right-open: (last knot value, inf).
                    bin_list.append('(' + get_str(data_df[factor_name][knots_list[i - 1]]) + ', inf)')
                else:
                    # Interior bin: (a, b].
                    bin_list.append(
                        '(' + get_str(
                            data_df[factor_name][knots_list[i - 1]]) + ', ' + get_str(
                            data_df[factor_name][knots_list[i]]) + ']')
            else:
                bin_list.append(list(data_df[factor_name])[knots_list[i - 1] + 1:knots_list[i] + 1])
    # Only the two boundary knots -> the variable is a single bin.
    if len(knots_list) == 2:
        bin_list = ['(-inf, inf)']
    # Append the missing-value bin (if any) and fold it into the totals.
    if len(na_df) != 0:
        na_good = sum(na_df[good_name])
        na_bad = sum(na_df[bad_name])
        total_good = sum(data_df[good_name]) + na_good
        total_bad = sum(data_df[bad_name]) + na_bad
        temp_df_list.append(na_df)
        bin_list.append("缺失值")
    else:
        na_good = 0
        na_bad = 0
        total_good = sum(data_df[good_name])
        total_bad = sum(data_df[bad_name])
    # Per-bin counts, shares, WOE and IV.
    good_list = list(map(lambda x: sum(x[good_name]), temp_df_list))
    bad_list = list(map(lambda x: sum(x[bad_name]), temp_df_list))
    good_percent_series = pd.Series(list(map(lambda x: float(sum(x[good_name])) / total_good, temp_df_list)))
    bad_percent_series = pd.Series(list(map(lambda x: float(sum(x[bad_name])) / total_bad, temp_df_list)))
    woe_list = list(np.log(good_percent_series / bad_percent_series))
    IV_list = list((good_percent_series - bad_percent_series) * np.log(good_percent_series / bad_percent_series))
    total_list = list(map(lambda x: sum(x[good_name]) + sum(x[bad_name]), temp_df_list))
    bin_rate_list = list(
        map(lambda x: float(sum(x[good_name]) + sum(x[bad_name])) / (total_good + total_bad), temp_df_list))
    non_na_indicator = pd.DataFrame({'Bin': bin_list,
                                     '#Obs': total_list,
                                     '#Good': good_list,
                                     '#Bad': bad_list,
                                     'IV(bin)': IV_list,
                                     'WOE': woe_list,
                                     '%Obs': bin_rate_list})
    # Final column layout of the returned table.
    l = ['Bin', '#Obs', '%Obs', '#Cum_Obs', '%Cum_Obs', '#Good', '%Good', '#Cum_Good', '%Cum_Good',
         '#Bad', '%Bad', '#Cum_Bad', '%Cum_Bad', '%Bad_Rate', 'WOE', 'IV(bin)', 'IV(total)', 'Odds1', 'Odds2',
         'Lift']
    result_indicator = non_na_indicator.reset_index(drop=True)
    result_indicator = result_indicator[result_indicator['Bin'] != 'NA']
    if if_sort:
        result_indicator = result_indicator.sort_index(ascending=False).reset_index()
        result_indicator['%Cumulative_Bad_Rate'] = np.cumsum(result_indicator['#Bad']) / np.cumsum(
            result_indicator['#Obs'])
    # Neutralise infinite WOE/IV caused by empty good or bad cells.
    result_indicator['WOE'] = result_indicator['WOE'].map(lambda x: 0 if x in [np.inf, -np.inf] else x)
    result_indicator['IV(bin)'] = result_indicator['IV(bin)'].map(lambda x: 0 if x in [np.inf, -np.inf] else x)
    result_indicator['#Cum_Obs'] = np.cumsum(result_indicator['#Obs'])
    result_indicator['%Cum_Obs'] = np.cumsum(result_indicator['%Obs'])
    result_indicator['%Good'] = result_indicator['#Good'] / sum(result_indicator['#Good'])
    result_indicator['#Cum_Good'] = np.cumsum(result_indicator['#Good'])
    result_indicator['%Cum_Good'] = result_indicator['#Cum_Good'] / sum(result_indicator['#Good'])
    result_indicator['%Bad'] = result_indicator['#Bad'] / sum(result_indicator['#Bad'])
    result_indicator['#Cum_Bad'] = np.cumsum(result_indicator['#Bad'])
    result_indicator['%Cum_Bad'] = result_indicator['#Cum_Bad'] / sum(result_indicator['#Bad'])
    result_indicator['%Bad_Rate'] = result_indicator['#Bad'] / result_indicator['#Obs']
    result_indicator['IV(total)'] = np.cumsum(result_indicator['IV(bin)'])
    # Odds1: odds ratio of "<= this bin" vs "> this bin" on non-missing bins;
    # Odds2: reciprocal shifted one bin down (">" rule from the tail side).
    if len(na_df) != 0:
        result_indicator_1=result_indicator[result_indicator['Bin']!='缺失值']
        result_indicator['Odds1']=((result_indicator_1['#Cum_Bad'] / result_indicator_1['#Cum_Good']) / (
            (total_bad -na_bad- result_indicator_1['#Cum_Bad']) / (total_good -na_good- result_indicator_1['#Cum_Good']))).tolist()+[np.nan]
        # NOTE(review): the [0:-2] slice drops one extra element so that two NaN
        # paddings fit around the missing-value row — confirm alignment.
        result_indicator['Odds2']=[np.nan] +list(1/result_indicator['Odds1'])[0:-2]+[np.nan]
    else:
        result_indicator['Odds1'] = (result_indicator['#Cum_Bad'] / result_indicator['#Cum_Good']) / (
            (total_bad - result_indicator['#Cum_Bad']) / (total_good - result_indicator['#Cum_Good']))
        result_indicator['Odds2'] = [np.nan] + list(1 / result_indicator['Odds1'])[0:-1]
    result_indicator['Lift'] = result_indicator['%Bad_Rate'] / (sum(result_indicator['#Bad']) / (sum(result_indicator['#Obs'])))
    result_indicator = result_indicator.replace(np.inf, 0)
    return result_indicator[l]
# Build the single-row "missing value" bin for a variable.
def get_na_bin(data_total, flag_name, factor_name, good_name, bad_name):
    """
    Build the one-row missing-value bin for ``factor_name``.

    Note on semantics: the first count column receives the SUM of the flag
    (i.e. the number of flagged/bad samples among rows where the factor is
    missing) and the second column the remainder — the caller chooses the
    meaning through the column-name arguments (see get_bin_lift, which passes
    '#Bad' as ``good_name``).

    :param data_total: full DataFrame
    :param flag_name: target column name
    :param factor_name: variable column name
    :param good_name: name given to the first count column
    :param bad_name: name given to the second count column
    :return: one-row DataFrame with columns [factor_name, good_name, bad_name]
    """
    missing_rows = data_total[(data_total[factor_name].isnull())]
    flagged_cnt = missing_rows[flag_name].sum()
    valid_cnt = len(missing_rows[missing_rows[flag_name].notnull()])
    row = ["缺失值", flagged_cnt, valid_cnt - flagged_cnt]
    return pd.DataFrame([row], columns=[factor_name, good_name, bad_name])
# Unsupervised binning.
def unsupervise_splitbin(df,var,numOfSplit, method = 'equalfreq'):
    '''
    Unsupervised binning of a numeric variable.

    :param df: DataFrame to bin
    :param var: column to bin (numeric only); NaN rows are dropped first
    :param numOfSplit: desired number of bins
    :param method: 'equalfreq' for equal-frequency, 'equallen' for equal-width
    :return: sorted list of split points (deduplicated for 'equalfreq');
             implicitly None when ``method`` matches neither option
    '''
    df=df[~df[var].isnull()]
    if method == 'equalfreq':
        N = df.shape[0]
        # Fixed: np.int was removed in NumPy 1.24; plain int() truncates the
        # same way for the non-negative values seen here.
        n = int(N / numOfSplit)
        # Every n-th position in the sorted values becomes a split point.
        splitPointIndex = [i * n for i in range(1, numOfSplit)]
        rawValues = sorted(list(df[var]))
        splitPoint = [rawValues[i] for i in splitPointIndex]
        # Deduplicate: repeated values can yield identical split points.
        splitPoint = sorted(list(set(splitPoint)))
        return splitPoint
    if method =='equallen':
        var_max, var_min = max(df[var]), min(df[var])
        interval_len = (var_max - var_min)*1.0/numOfSplit
        splitPoint = [var_min + i*interval_len for i in range(1,numOfSplit)]
        return splitPoint
'''
3. 变量效果分析和筛选
'''
# Determine the variable's best cut point and evaluate the resulting rule:
# pick the odds-optimal threshold from the binning result.
def select_best_lift(data,flag_name,factor_name,min_rate=0.003,sub_div_bin=0.05,min_num=30,hit_num=30,
                     min_lift=1.5,method='best',numOfSplit=10):
    """
    Determine the variable's best cut-off and evaluate the resulting rule.

    :param data: DataFrame to analyse
    :param flag_name: target column to analyse
    :param factor_name: indicator column name
    :param min_rate: minimum share for the head/tail bins
    :param sub_div_bin: sample share reserved for head and tail bins
    :param min_num: minimum bin size
    :param hit_num: minimum number of samples the rule must hit
    :param min_lift: minimum lift value for the rule to be selected
    :param method: binning method
    :param numOfSplit: number of bins for equal-frequency binning
    :return: dict with threshold, rule direction, hit counts, bad rate, lift
             and the 'Selected' flag; only {'var_name': ...} on failure
    """
    bin_odd = get_bin_lift(data, flag_name, factor_name, min_rate, sub_div_bin, min_num,method,numOfSplit)
    # Head bins (cumulative share below sub_div_bin) are '<=' candidates.
    odd1 = bin_odd[bin_odd['%Cum_Obs'] < sub_div_bin]
    # NOTE(review): assumes a '缺失值' bin is always present; otherwise this
    # raises IndexError — confirm callers rely on their surrounding try/except.
    sp = list(bin_odd[bin_odd['Bin'] == '缺失值']['%Obs'].to_dict().values())[0]
    # Tail bins (excluding the missing bin's share) are '>' candidates.
    odd2 = bin_odd.loc[bin_odd[bin_odd['%Cum_Obs'] >= 1 - sp - sub_div_bin].index[1:]]
    dic={'var_name':factor_name}
    max1 = odd1['Odds1'].max()
    max2 = odd2['Odds2'].max()
    if (np.isnan(max1)) and (np.isnan(max2)):
        print(factor_name+' 未分箱成功或者非缺失值分箱中无坏样本,请关注')
        return dic
    elif (np.isnan(max1)) or (max1<=max2):
        # Tail rule: flag samples ABOVE the threshold ('>').
        max2=odd2['Odds2'].max()
        max_index=odd2[odd2['Odds2'] == max2].index[0]
        dic['Odds'] = max2
        bin_detail=bin_odd['Bin'][max_index]
        # A label without ',' is a single-value bin '[x]'; fall back one bin.
        if ',' not in bin_detail:
            max_index2 = max_index-1
            bin_detail = bin_odd['Bin'][max_index2]
            print(bin_detail)
            if ',' not in bin_detail:
                dic['Threshold'] = float(bin_detail.split(' ')[-1][1:-1])
            else:
                dic['Threshold']=float(bin_detail.split(' ')[-1][0:-1])
                print(dic['Threshold'])
            # NOTE(review): len(str(float)) is never 0, so this branch looks
            # dead; it also reuses max_index2 from above — confirm intent.
            if len(str(dic['Threshold']))==0:
                dic['Threshold'] = float(bin_odd['Bin'][max_index2].split(' ')[-1][1:-1])
        else:
            # Interval label '(a, b]' -> threshold is the lower bound a.
            dic['Threshold'] = float(bin_detail.split(',')[0][1:])
        print(dic)
        dic['type'] = '>'
        # Aggregate all non-missing bins from the chosen one to the tail.
        bin_odd_1=bin_odd[bin_odd['Bin'] != '缺失值']
        dic['#Obs'] = bin_odd_1['#Obs'][max_index:].sum()
        dic['%Obs'] = bin_odd_1['%Obs'][max_index:].sum()
        dic['#Bad'] = bin_odd_1['#Bad'][max_index:].sum()
        dic['%Bad_Rate'] = dic['#Bad'] / dic['#Obs']
        dic['Lift'] = dic['%Bad_Rate']/(bin_odd['#Bad'].sum()/bin_odd['#Obs'].sum()) if (bin_odd['#Bad'].sum()/bin_odd['#Obs'].sum())>0 else 0
        dic['Selected'] = 'Y' if (dic['Lift'] >= min_lift and dic['#Obs']>=hit_num) else 'N'
    else:
        # Head rule: flag samples AT OR BELOW the threshold ('<=').
        max1 = odd1['Odds1'].max()
        max_index = odd1[odd1['Odds1'] == max1].index[0]
        dic['Odds'] = max1
        bin_detail = bin_odd['Bin'][max_index]
        if ',' not in bin_detail:
            # Single-value bin '[x]' -> threshold is x itself.
            dic['Threshold'] = float(bin_detail.split(' ')[-1][1:-1])
        else:
            # Interval label '(a, b]' -> threshold is the upper bound b.
            dic['Threshold'] = float(bin_detail.split(',')[1][:-1])
        dic['type'] = '<='
        dic['#Obs'] = bin_odd['#Obs'][0:(max_index + 1)].sum()
        dic['%Obs'] = bin_odd['%Obs'][0:(max_index + 1)].sum()
        dic['#Bad'] = bin_odd['#Bad'][0:(max_index + 1)].sum()
        dic['%Bad_Rate'] = dic['#Bad'] / dic['#Obs']
        dic['Lift'] = dic['%Bad_Rate']/(bin_odd['#Bad'].sum()/bin_odd['#Obs'].sum()) if (bin_odd['#Bad'].sum()/bin_odd['#Obs'].sum())>0 else 0
        dic['Selected'] = 'Y' if (dic['Lift'] >= min_lift and dic['#Obs']>=hit_num) else 'N'
    return dic
'''
4. 分析结果汇总、整合函数
'''
# Rule-effect analysis and statistics.
def bin_result_summary(bindata,bin_summary,sample_type,sub_div_bin,min_num,hit_num,method,numOfSplit):
    '''
    Run select_best_lift for every (sample type, target, variable) and append
    one rule-summary row per variable to ``bin_summary``.

    Depends on module-level globals: sample_type_col, sample_type_target,
    target_ripe, target_del_col, target_min_rate, sample_type_lift.

    :param bindata: DataFrame to bin
    :param bin_summary: accumulator frame (see bin_result_summary_final)
    :param sample_type: list of sample types to analyse
    :param sub_div_bin: head/tail sample share
    :param min_num: minimum bin size
    :param hit_num: minimum rule hit count
    :param method: binning method
    :param numOfSplit: number of bins
    :return: ``bin_summary`` with the new rows appended
    '''
    for sample_type_sub in sample_type:
        print('正在分析的样本类型为:', sample_type_sub)
        # Sub-population selection, same convention as describe_stat_ana.
        if '其他' in sample_type_sub:
            describedata = bindata[bindata[sample_type_col[sample_type_sub][0]].map(
                lambda x: str(x) not in sample_type_col[sample_type_sub][1])]
        elif 'Total' in sample_type_sub:
            describedata = bindata
        else:
            describedata = bindata[bindata[sample_type_col[sample_type_sub][0]].map(
                lambda x: str(x) in sample_type_col[sample_type_sub][1])]
        target = sample_type_target[sample_type_sub]
        for target_sub in target:
            print("分析的目标字段为:", target_sub)
            mydata1 = describedata[describedata[target_ripe[target_sub][0]] == 1]  # keep mature samples only
            mydata1 = mydata1.drop(labels=target_del_col[target_sub], axis=1)
            min_rate=target_min_rate[target_sub][0]
            print(sample_type_sub, target_sub, '每箱最小占比:', min_rate, '数据量:', len(mydata1))
            # NOTE(review): assumes the target column is last — confirm.
            for var in mydata1.columns[:-1]:
                print('正在分析变量:', var)
                try:
                    sample_bin = select_best_lift(data=mydata1, flag_name=target_sub, factor_name=var,
                                                  min_rate=min_rate, sub_div_bin=sub_div_bin, min_num=min_num,hit_num=hit_num,
                                                  min_lift=sample_type_lift[sample_type_sub][target_sub],method=method,numOfSplit=numOfSplit)
                    # A dict with <= 2 keys means binning did not produce a
                    # rule; fall back to a degenerate whole-variable summary.
                    if len(sample_bin) <= 2:
                        cnt = mydata1[var].count()
                        data1 = mydata1[[var, target_sub]].dropna()
                        bad = sum(data1[target_sub] == 1)
                        badrate = bad / cnt
                        sample_bin = {'#Bad': bad, '#Obs': cnt, '%Bad_Rate': badrate, '%Obs': 1, 'Odds': 1,
                                      'Lift': 1, 'Threshold': 'NaN', 'type': 'NaN', 'var_name': var,'Selected':'N'}
                # NOTE(review): bare except silently converts ANY failure into
                # the same degenerate summary — deliberate best-effort, kept.
                except:
                    cnt = mydata1[var].count()
                    data1 = mydata1[[var, target_sub]].dropna()
                    bad = sum(data1[target_sub] == 1)
                    badrate = bad / cnt
                    sample_bin = {'#Bad': bad, '#Obs': cnt, '%Bad_Rate': badrate, '%Obs': 1, 'Odds': 1,
                                  'Lift': 1, 'Threshold': 'NaN', 'type': 'NaN', 'var_name': var,'Selected':'N'}
                sample_bin = pd.DataFrame.from_dict(sample_bin, orient='index').T
                sample_bin['样本类型'] = sample_type_sub
                sample_bin['坏客户定义'] = target_sub
                sample_bin = sample_bin[
                    ['var_name', '样本类型', '坏客户定义', 'Threshold', 'type', '#Obs', '%Obs', '#Bad', '%Bad_Rate', 'Odds',
                     'Lift','Selected']]
                sample_bin.rename(columns={'%Bad_Rate': '%Bin_Bad_Rate', 'var_name': '变量英文名','Selected':'标签2'}, inplace=True)
                # NOTE: DataFrame.append was removed in pandas 2.x.
                bin_summary = bin_summary.append(sample_bin)
    return bin_summary
# Final result of the rule-effect analysis.
def bin_result_summary_final(hit_num,bindata,var_select01,sub_div_bin,min_num,sample_type,method,numOfSplit):
    '''
    Merge the per-variable rule summary with the descriptive statistics and
    add label 3 (per Chinese-name group, keep the variant with the best odds).

    :param hit_num: minimum rule hit count
    :param bindata: DataFrame to bin
    :param var_select01: descriptive-statistics result (describe_stat_ana)
    :param sub_div_bin: head/tail sample share
    :param min_num: minimum bin size
    :param sample_type: list of sample types
    :param method: binning method
    :param numOfSplit: number of bins
    :return: merged summary with labels 1-3
    '''
    bin_summary=pd.DataFrame(columns=['变量英文名', '样本类型', '坏客户定义', 'Threshold', 'type', '#Obs', '%Obs',
                                      '#Bad', '%Bin_Bad_Rate', 'Odds','Lift','标签2'])
    bin_summary_01=bin_result_summary(bindata=bindata,bin_summary=bin_summary,sample_type=sample_type,
                                      sub_div_bin=sub_div_bin,min_num=min_num,hit_num=hit_num,method=method,numOfSplit=numOfSplit)
    bin_summary_select_var=['序号','分析时间','样本类型','坏客户定义','变量英文名','变量中文名',
                            '样本区间','%Bad_Rate(不含缺失值)','%Bad_Rate(包含缺失值)','标签1']
    bin_summary_02=pd.merge(var_select01[bin_summary_select_var],bin_summary_01,on=['变量英文名','样本类型','坏客户定义'],how='left')
    # Add label 3; among same-named variants that passed labels 1 and 2, keep
    # the one with the highest odds.
    filter2 = bin_summary_02
    filter2['标签3'] = 'N'
    # NOTE(review): this compiled pattern is never used below — confirm leftover.
    pattern = re.compile(r'\d{1,2}')
    for sampletype in filter2.样本类型.unique():
        for target_ls in filter2.坏客户定义.unique():
            filter2_ls = filter2[(filter2.样本类型 == sampletype) & (filter2.坏客户定义 == target_ls)]
            for varname in set(filter2_ls.变量中文名[(filter2_ls.标签1 == 'Y') & (filter2_ls.标签2 == 'Y')]):
                data1 = filter2_ls[(filter2_ls.标签1 == 'Y') & (filter2_ls.标签2 == 'Y') & (filter2_ls.变量中文名 == varname)]
                # NOTE(review): both branches below are identical; also the
                # chained assignment filter2['标签3'][...] triggers pandas
                # SettingWithCopy warnings and fails on pandas >= 3 — confirm.
                if len(data1) == 1:
                    var = data1.变量英文名[data1.Odds == data1.Odds.max()].values[0]
                    filter2['标签3'][(filter2.标签1 == 'Y') & (filter2.标签2 == 'Y') & (filter2.变量英文名 == var) & (
                        filter2.坏客户定义 == target_ls)] = 'Y'
                if len(data1) > 1:
                    var1 = data1.变量英文名[data1.Odds == data1.Odds.max()].values[0]
                    filter2['标签3'][(filter2.标签1 == 'Y') & (filter2.标签2 == 'Y') & (filter2.变量英文名 == var1) & (
                        filter2.坏客户定义 == target_ls)] = 'Y'
    filter2 = filter2[['序号', '分析时间', '样本类型', '坏客户定义', '变量英文名', '变量中文名',
                       '样本区间', '%Bad_Rate(不含缺失值)', '%Bad_Rate(包含缺失值)', 'Threshold', 'type', '#Obs', '%Obs',
                       '#Bad', '%Bin_Bad_Rate', 'Odds', 'Lift', '标签1', '标签2', '标签3']]
    return filter2
# Per-bin result detail.
def bin_result_detail(bindata,var_select02,sample_type,sub_div_bin,min_num,method,numOfSplit):
    '''
    Produce the full per-bin detail table for every variable, merged with the
    screening labels from the summary step.

    Depends on module-level globals: sample_type_col, sample_type_target,
    target_ripe, target_del_col, target_min_rate.

    :param bindata: DataFrame to analyse
    :param var_select02: filter2 frame from bin_result_summary_final
    :param sample_type: list of sample types
    :param sub_div_bin: head/tail bin share
    :param min_num: minimum bin size
    :param method: 'best' business-driven, 'equalfreq' equal-frequency,
                   'equallen' equal-width binning
    :param numOfSplit: number of equal-width/equal-frequency bins
    :return: per-bin detail DataFrame
    '''
    bin_summary_details=pd.DataFrame(columns=['序号','分析时间','样本类型','坏客户定义','变量英文名','变量中文名','样本区间','标签1', '标签2', '标签3', 'Bin', '#Obs', '%Obs','#Cum_Obs',
                                              '%Cum_Obs', '#Good', '%Good', '#Cum_Good', '%Cum_Good', '#Bad', '%Bad', '#Cum_Bad',
                                              '%Cum_Bad','%Bad_Rate', 'WOE', 'IV(bin)', 'IV(total)', 'Odds1', 'Odds2','Lift'])
    for sample_type_sub in sample_type:
        print('正在分析的样本类型为:', sample_type_sub)
        # Sub-population selection, same convention as describe_stat_ana.
        if '其他' in sample_type_sub:
            describedata = bindata[bindata[sample_type_col[sample_type_sub][0]].map(
                lambda x: str(x) not in sample_type_col[sample_type_sub][1])]
        elif 'Total' in sample_type_sub:
            describedata = bindata
        else:
            describedata = bindata[bindata[sample_type_col[sample_type_sub][0]].map(
                lambda x: str(x) in sample_type_col[sample_type_sub][1])]
        target = sample_type_target[sample_type_sub]
        for target_sub in target:
            mydata1 = describedata[describedata[target_ripe[target_sub][0]] == 1]  # keep mature samples only
            mydata1 = mydata1.drop(labels=target_del_col[target_sub], axis=1)
            min_rate = target_min_rate[target_sub][0]
            print(sample_type_sub, target_sub, '每箱最小占比:', min_rate, '数据量:', len(mydata1))
            for var in mydata1.columns[:-1]:
                print('正在分析变量:', var)
                try:
                    sample_bin = get_bin_lift(data=mydata1, flag_name=target_sub, factor_name=var,
                                              min_rate=min_rate, sub_div_bin=sub_div_bin, min_num=min_num,
                                              method=method,numOfSplit=numOfSplit)
                # Any binning failure falls back to a single (-inf, inf) bin.
                except:
                    value = ['(-inf,inf)', len(mydata1), 1, len(mydata1), 1, sum(mydata1[target_sub] == 0), 1,
                             sum(mydata1[target_sub] == 0), 1,
                             sum(mydata1[target_sub] == 1), 1, sum(mydata1[target_sub] == 1), 1,
                             sum(mydata1[target_sub] == 1) / len(mydata1),
                             np.NaN, np.NaN, np.NaN, 1, 1, 1]
                    sample_bin = pd.DataFrame([value],
                                              columns=['Bin', '#Obs', '%Obs', '#Cum_Obs', '%Cum_Obs', '#Good', '%Good',
                                                       '#Cum_Good', '%Cum_Good', '#Bad', '%Bad', '#Cum_Bad', '%Cum_Bad',
                                                       '%Bad_Rate', 'WOE', 'IV(bin)', 'IV(total)', 'Odds1', 'Odds2',
                                                       'Lift'])
                # A <= 1-row result is degenerate: replace with the same
                # single whole-range bin.
                if len(sample_bin) <= 1:
                    value = ['(-inf,inf)', len(mydata1), 1, len(mydata1), 1, sum(mydata1[target_sub] == 0), 1,
                             sum(mydata1[target_sub] == 0), 1,
                             sum(mydata1[target_sub] == 1), 1, sum(mydata1[target_sub] == 1), 1,
                             sum(mydata1[target_sub] == 1) / len(mydata1),np.NaN, np.NaN, np.NaN, 1, 1, 1]
                    sample_bin = pd.DataFrame([value],
                                              columns=['Bin', '#Obs', '%Obs', '#Cum_Obs', '%Cum_Obs', '#Good', '%Good',
                                                       '#Cum_Good', '%Cum_Good', '#Bad', '%Bad', '#Cum_Bad', '%Cum_Bad',
                                                       '%Bad_Rate', 'WOE', 'IV(bin)', 'IV(total)', 'Odds1', 'Odds2',
                                                       'Lift'])
                sample_bin['变量英文名'] = var
                sample_bin['样本类型'] = sample_type_sub
                sample_bin['坏客户定义'] = target_sub
                # Attach the screening labels for this variable from filter2.
                var_msg = var_select02.loc[(var_select02.样本类型 == sample_type_sub) & (var_select02.变量英文名 == var) & (
                    var_select02.坏客户定义 == target_sub),['序号', '分析时间', '样本类型', '坏客户定义', '变量英文名', '变量中文名',
                                                  '样本区间', '标签1', '标签2', '标签3']]
                merge = pd.merge(var_msg, sample_bin, on=['变量英文名', '样本类型', '坏客户定义'], how='left')
                # NOTE: DataFrame.append was removed in pandas 2.x.
                bin_summary_details = bin_summary_details.append(merge)
    return bin_summary_details
'''
5. 变量分析步骤说明及分析结果统计
'''
def get_summary(filter2):
    '''
    Summarise, per (sample type, bad-customer definition), how many variables
    survive each of the three screening labels, and append a grand-total row.

    :param filter2: variable-screening result; must contain the columns
                    样本类型, 坏客户定义, 标签1, 标签2, 标签3
    :return: summary DataFrame with one row per combination plus a '总计' row
    '''
    aa = filter2
    # One summary row per (sample type, bad definition) combination.
    # This assumes the variable type is unique per combination; otherwise the
    # grouping key would need the extra descriptor columns.
    summary_info = aa[['样本类型', '坏客户定义']].drop_duplicates().reset_index(drop=True)
    var_num = []
    var_num_label1 = []
    var_num_label2 = []
    var_num_label3 = []
    for _, row in summary_info.iterrows():
        # Label-based access instead of the original positional row[1][0],
        # which relied on deprecated integer-fallback Series indexing.
        stype = row['样本类型']
        bad_def = row['坏客户定义']
        subset = aa[(aa.样本类型 == stype) & (aa.坏客户定义 == bad_def)]
        var_num.append(len(subset))
        # Labels are cumulative: each later screen is applied on top of the
        # earlier ones.
        var_num_label1.append(len(subset[subset.标签1 == 'Y']))
        var_num_label2.append(len(subset[(subset.标签1 == 'Y') & (subset.标签2 == 'Y')]))
        var_num_label3.append(len(subset[(subset.标签1 == 'Y') & (subset.标签2 == 'Y') & (subset.标签3 == 'Y')]))
    summary_info['变量总数'] = var_num
    summary_info['标签1筛选变量数'] = var_num_label1
    summary_info['标签1剔除变量数'] = summary_info['变量总数'] - summary_info['标签1筛选变量数']
    summary_info['标签2筛选变量数'] = var_num_label2
    summary_info['标签2剔除变量数'] = summary_info['标签1筛选变量数'] - summary_info['标签2筛选变量数']
    summary_info['标签3筛选变量数'] = var_num_label3
    summary_info['标签3剔除变量数'] = summary_info['标签2筛选变量数'] - summary_info['标签3筛选变量数']
    summary_info['剩余变量占比'] = summary_info['标签3筛选变量数'] / summary_info['变量总数']
    # Final column order of the report.
    summary_info = summary_info[['样本类型', '坏客户定义', '变量总数',
                                 '标签1剔除变量数', '标签1筛选变量数', '标签2剔除变量数', '标签2筛选变量数',
                                 '标签3剔除变量数', '标签3筛选变量数', '剩余变量占比']]
    summary_info = summary_info.reset_index(drop=True)
    # Grand-total row: sum the count columns (skip the two key columns and the
    # trailing ratio column).
    sp = summary_info.sum().tolist()[2:-1]
    # Fixed: the original 'else' branch produced a nested LIST instead of a
    # number when the total variable count was zero.
    k1 = ['总计', ''] + sp + [sp[-1] / sp[0] if sp[0] > 0 else 0]
    summary_info.loc[len(summary_info)] = k1
    return summary_info
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022/3/4 20:21
# @Author : Feng Zhanpeng
# @File : step12_output_fun.py
# @Software: PyCharm
"""
单维度策略测算部分计算结果输出相关函数,直接加载即可
"""
import copy
'''
xlsxwriter格式设置和配色设置
'''
# Colour palette
biaotou='#366092'
text='#F4F4F4'
title='#44546A'
title_sub1='#00868B'
title_sub2='#00E5EE'
# General pairing
split_color='#FFFFFF' # white
xunhuan1='#D1D1D1' # silver (alternating row colour 1)
xunhuan2='#E3E3E3' # silver (alternating row colour 2)
title_size=12
biaotou_size=10
text_size=8
# Conditional format: solid data bar without explicit min/max — green
condition_format_green_no = {'type': 'data_bar','bar_solid': True,'data_bar_2010': True,
                             'bar_color': '#65d97d'}
# Conditional format: solid data bar without explicit min/max — orange
condition_format_yellow_no = {'type': 'data_bar','bar_solid': True,'data_bar_2010': True,
                              'bar_color': '#f2572d'}
# Conditional format: solid data bar without explicit min/max — pink
condition_format_pink_no = {'type': 'data_bar','bar_solid': True,'data_bar_2010': True,
                            'bar_color': '#FF69B4'}
# Conditional format: solid data bar without explicit min/max — blue
condition_format_blue_no = {'type': 'data_bar','bar_solid': True,'data_bar_2010': True,
                            'bar_color': '#1E90FF'}
# Main title format
title_format = {'bold': True,'font_name': 'Arial','font_size': title_size,'font_color': 'white',
                'top_color': biaotou,'bottom_color': biaotou,'left_color': biaotou,
                'right_color': biaotou,'bg_color': biaotou}
# Subtitle (column header) format
subtitle_format={'border': True,'font_size': biaotou_size,'font_name': 'Arial',
                 'top_color':title,'font_color': 'white',
                 'bottom_color': title,'bold': True,
                 'left_color': split_color,
                 'right_color': split_color,
                 'bg_color': title,
                 'align': 'left',
                 'valign': 'vcenter'}
# Table body (alternating row colour 1)
body_text_format_01={'border': True,'font_size': text_size,'font_name': 'Arial',
                     'top_color':xunhuan1,
                     'bottom_color': xunhuan1,
                     'left_color': split_color,
                     'right_color': split_color,
                     'bg_color': xunhuan1,
                     'align': 'left',
                     'valign': 'vcenter'}
# Body cells rendered as percentages
body_text_per_format_01=copy.deepcopy(body_text_format_01)
body_text_per_format_01['num_format']='0.00%'
# Table body (alternating row colour 2)
body_text_format_02={'border': True,'font_size': text_size,'font_name': 'Arial', # border: cell border
                     'top_color':xunhuan2,
                     'bottom_color': xunhuan2,
                     'left_color': split_color,
                     'right_color':split_color,
                     'bg_color': xunhuan2,
                     'align': 'left',
                     'valign': 'vcenter'}
# Body cells rendered as percentages
body_text_per_format_02=copy.deepcopy(body_text_format_02)
body_text_per_format_02['num_format']='0.00%'
# Bold black body text
body_text_format_03 = {'bold': True,'font_name': 'Arial','font_size':text_size,'font_color': 'black'}
'''
标准化结果输出函数
'''
# Display width of a cell's content: CJK characters count as 2, others as 1.
def get_same_len(x):
    """
    Return the display width of ``x``: each Chinese character (U+4E00-U+9FA5)
    counts as 2 columns, every other character as 1. Non-string input is
    converted with str() first.
    """
    import re
    if type(x) != str:
        x = str(x)
    width = 0
    for ch in x:
        width += 2 if re.match("[\u4e00-\u9fa5]+", ch) else 1
    return width
'''
变量分箱结果明细输出函数
'''
def details_result_output(wb,sheetname,data,suoyin,ana_people):
    '''
    Write the per-bin detail table into a new worksheet with alternating row
    colours (keyed by the sequence number) and data-bar conditional formats.

    :param wb: xlsxwriter Workbook
    :param sheetname: worksheet name to create
    :param data: DataFrame to write
    :param suoyin: positional index of the sequence-number column
    :param ana_people: analyst-name prefix stripped from the sequence number
    :return: None (writes into ``wb``)
    '''
    nrows, ncols = data.shape
    body_text_xunhuan1 = wb.add_format(body_text_format_01)
    body_text_xunhuan1_per = wb.add_format(body_text_per_format_01)
    body_text_xunhuan2 = wb.add_format(body_text_format_02)
    body_text_xunhuan2_per = wb.add_format(body_text_per_format_02)
    body_text_title = wb.add_format(subtitle_format)
    ws = wb.add_worksheet(sheetname)
    ws.freeze_panes(1, 4)  # freeze header row and first four columns
    ws.autofilter(0,0,nrows,ncols-1)
    ws.hide_gridlines({'option': 1})
    column = data.columns
    # Column widths follow the header's display width (min 8).
    for i in range(len(column)):
        x = column[[i]][0]
        ll = get_same_len(x)
        lll = max(8, ll)
        ws.set_column(i , i , lll)
    # Excel cannot store inf/NaN; map them to strings / blanks.
    data = data.replace(np.inf, 'inf')
    data = data.fillna('')
    data = data.replace(-np.inf, '-inf')
    for j in range(ncols):
        ws.write(0, j, column[j], body_text_title)
    for i in range(nrows):
        for j in range(ncols):
            # Column-level conditional data bars (re-applied per row; the
            # calls are idempotent but redundant — kept as-is).
            if 'pass1' in column[j]:
                ws.conditional_format(1, j, nrows, j, condition_format_green_no)
            elif '%Bad_Rate' in column[j] or 'Lift' in column[j]:
                ws.conditional_format(1, j, nrows, j, condition_format_pink_no)
            elif 'pass2' in column[j]:
                # NOTE(review): condition_format_red_no is not defined in this
                # module; a 'pass2' column would raise NameError — confirm.
                ws.conditional_format(1, j, nrows, j, condition_format_red_no)
            elif 'pass3' in column[j]:
                ws.conditional_format(1, j, nrows, j, condition_format_blue_no)
            value = data.iloc[i][j]
            # Alternate row colours by the numeric part of the sequence id.
            key = int(data.iloc[i, suoyin].replace(ana_people, ''))
            if key % 2 == 1:
                # Percentage-style columns get the percent number format.
                if ('%' in column[j] or '率' in column[j] or 'Rate' in column[j] or column[j] in ['单一值最大占比', '单一值第二大占比',
                                                                                                '单一值第三大占比',
                                                                                                '单一值前二大占比总和',
                                                                                                '单一值前三大占比总和']):
                    ws.write(i + 1, j, value, body_text_xunhuan1_per)
                else:
                    ws.write(i + 1, j, value, body_text_xunhuan1)
            else:
                if ('%' in column[j] or '率' in column[j] or 'Rate' in column[j] or column[j] in ['单一值最大占比', '单一值第二大占比',
                                                                                                '单一值第三大占比',
                                                                                                '单一值前二大占比总和',
                                                                                                '单一值前三大占比总和']):
                    ws.write(i + 1, j, value, body_text_xunhuan2_per)
                else:
                    ws.write(i + 1, j, value, body_text_xunhuan2)
'''
分析结果汇总输出函数
'''
def summary_result_output(wb, sheetname, data):
    """
    Write an analysis-summary table to a new worksheet with alternating
    row colours (zebra striping by row index).

    :param wb: xlsxwriter Workbook object
    :param sheetname: name of the worksheet to create
    :param data: DataFrame to output
    :return: None (writes into wb as a side effect)
    """
    nrows, ncols = data.shape
    body_text_xunhuan1 = wb.add_format(body_text_format_01)
    body_text_xunhuan1_per = wb.add_format(body_text_per_format_01)
    body_text_xunhuan2 = wb.add_format(body_text_format_02)
    body_text_xunhuan2_per = wb.add_format(body_text_per_format_02)
    body_text_title = wb.add_format(subtitle_format)
    ws = wb.add_worksheet(sheetname)
    ws.freeze_panes(1, 4)  # keep header row and first 4 columns visible
    ws.autofilter(0, 0, nrows, ncols - 1)
    ws.hide_gridlines({'option': 1})
    column = data.columns
    # Column widths sized to the header text (minimum 8).
    for i in range(len(column)):
        ws.set_column(i, i, max(8, get_same_len(column[i])))
    data = data.replace(np.nan, '')
    data = data.replace(np.inf, 'Inf')
    for j in range(ncols):
        ws.write(0, j, column[j], body_text_title)
    # Percent-style columns take the percentage number format; the
    # classification is invariant per column, so compute it once instead
    # of re-evaluating it for every cell as the original did.
    per_cols = ['单一值最大占比', '单一值第二大占比', '单一值第三大占比',
                '单一值前二大占比总和', '单一值前三大占比总和']
    is_per = [('%' in c or '率' in c or c in per_cols) for c in column]
    for i in range(nrows):
        odd = i % 2 == 1
        for j in range(ncols):
            # iloc[i, j] avoids the deprecated chained positional lookup
            # data.iloc[i][j] used by the original.
            value = data.iloc[i, j]
            if is_per[j]:
                fmt = body_text_xunhuan1_per if odd else body_text_xunhuan2_per
            else:
                fmt = body_text_xunhuan1 if odd else body_text_xunhuan2
            ws.write(i + 1, j, value, fmt)
'''
第4步:策略测算过程及变量筛选情况
'''
def var_summary_result_output(wb, sheetname, data, start=0):
    """
    Step 4: write the strategy-calculation process and variable-selection
    summary sheet: a title, alternating remark blocks that describe the
    three selection stages, then the summary table itself.

    :param wb: xlsxwriter Workbook object
    :param sheetname: name of the worksheet to create (also the title text)
    :param data: DataFrame to output; its LAST row must hold the totals
        used in the final remark (变量总数, 标签1/2/3筛选变量数, 剩余变量占比)
    :param start: row offset at which output begins
    :return: None (writes into wb as a side effect)
    """
    ws = wb.add_worksheet(sheetname)
    column = data.columns
    nrows, ncols = data.shape
    for i in range(ncols):
        ws.set_column(i + 1, i + 1, 10)
    body_text_title = wb.add_format(subtitle_format)
    # Sheet title format
    body_title = wb.add_format(title_format)
    # Table body: white border, font 12, light grey background, centred
    body_text_xunhuan2 = wb.add_format(body_text_format_02)
    body_text_xunhuan2_per = wb.add_format(body_text_per_format_02)
    # Remark-block formats (pink/blue alternating bands).
    # NOTE: the original also built two red subtitle formats
    # (body_text_red, body_text_red_xifen) that were never used; removed.
    body_text_pink = wb.add_format(body_text_format_03)
    body_text_pink.set_bg_color('#F2DCE5')
    body_text_pink.set_text_wrap(True)  # original passed the string 'True'
    body_text_pink.set_align('vcenter')
    body_text_blue = wb.add_format(body_text_format_03)
    body_text_blue.set_bg_color('#DAEEF3')
    ws.hide_gridlines({'option': 1})
    ws.merge_range(1 + start, 1, 1 + start, ncols, '一、' + sheetname, body_title)
    ws.set_column(0, 0, 2)
    # Row heights for the multi-line remark blocks written below.
    ws.set_row(start + 3, 130)
    ws.set_row(start + 5, 100)
    ws.set_row(start + 7, 67)
    ws.set_row(start + 9, 26)
    ws.set_row(start + 10, 7)
    # The remark texts below are written into the workbook and must stay
    # exactly as authored (Chinese business wording).
    remark1 = '1.变量基础分析和筛选'
    remark2 = '''
(1) 分析维度
样本量、缺失量、缺失率、Badrate、单一值最大占比的变量值、单一值最大占比的样本量、单一值最大占比、单一值第二大占比的变量值、
单一值第二大占比的样本量、单一值第二大占比、单一值第三大占比的变量值、单一值第三大占比的样本量、单一值第三大占比、
单一值前二大占比的总样本量、单一值前二大占比总和、单一值前三大占比的总样本量、单一值前三大占比总和、变量取值数(包含缺失值)、
变量取值数(不含缺失值)、最小值、最大值、平均值、分位数、标准差、离散系数
(2) 筛选标准
a.单一值最大占比 < 99%
b.变量取值数(不含缺失值) >=2
筛选结果详见标签1
'''
    remark3 = '2.变量效果分析和筛选'
    remark4 = '''
(1) 对变量进行分箱,计算不同分箱的触碰量、触碰率、Odds、Lift等指标
(2) 基于头部和尾部分箱结果对变量进行筛选
a.最小触碰量 >= 30 (备注:大于等于某一阈值,默认30)
b.触碰率 <= 5% (备注:小于等于某一阈值,默认5%)
c.Lift >= 3 (备注:大于等于某一阈值,默认3)
筛选结果详见标签2
'''
    remark5 = '3.变量相关性分析和筛选'
    remark6 = '''
筛选标准
a.对标签2筛选的变量进行两两线性相关分析,若相关性较强,选取Lift值大的变量
b.选取有明确业务含义的变量
筛选结果详见标签3
'''
    remark7 = '4.变量分析结果汇总'
    # Totals are taken from the last row of the summary table.
    sk0 = data['变量总数'][nrows - 1]
    sk1 = data['标签1筛选变量数'][nrows - 1]
    sk2 = data['标签2筛选变量数'][nrows - 1]
    sk3 = data['标签3筛选变量数'][nrows - 1]
    sk5 = float(data['剩余变量占比'][nrows - 1])
    remark8 = ''' 分析的变量数总计为%s个,标签1筛选剩余%s个,标签2筛选剩余%s个,标签3筛选剩余%s个,最终筛选剩余变量占比为%.2f''' % (
        sk0, sk1, sk2, sk3, sk5 * 100) + '%'
    ws.merge_range(start + 2, 1, start + 2, ncols, remark1, body_text_blue)
    ws.merge_range(start + 3, 1, start + 3, ncols, remark2, body_text_pink)
    ws.merge_range(start + 4, 1, start + 4, ncols, remark3, body_text_blue)
    ws.merge_range(start + 5, 1, start + 5, ncols, remark4, body_text_pink)
    ws.merge_range(start + 6, 1, start + 6, ncols, remark5, body_text_blue)
    ws.merge_range(start + 7, 1, start + 7, ncols, remark6, body_text_pink)
    ws.merge_range(start + 8, 1, start + 8, ncols, remark7, body_text_blue)
    ws.merge_range(start + 9, 1, start + 9, ncols, remark8, body_text_pink)
    ws.merge_range(start + 10, 1, start + 10, ncols, '', body_text_pink)
    # The summary table starts 8 rows below the remark area.
    add = 8
    data = data.replace(np.inf, 'inf')
    data = data.fillna('')
    data = data.replace(-np.inf, '-inf')
    body_text_title.set_text_wrap(True)  # original passed the string 'True'
    for j in range(ncols):
        ws.write(start + 3 + add, j + 1, column[j], body_text_title)
    ws.autofilter(start + 3 + add, 1, start + 3 + add, ncols)
    # Percent-style columns take the percentage number format; computed
    # once per column instead of per cell.
    is_per = [('%' in c or '占比' in c or 'rate' in c) for c in column]
    for i in range(nrows):
        for j in range(ncols):
            value = data.iloc[i, j]
            fmt = body_text_xunhuan2_per if is_per[j] else body_text_xunhuan2
            ws.write(i + start + 4 + add, j + 1, value, fmt)
'''
第6步:最终筛选变量结果输出
'''
def var_summary_result_output_01(wb, sheetname, data):
    """
    Step 6: write the final selected-variable table to a new worksheet.

    :param wb: xlsxwriter Workbook object
    :param sheetname: name of the worksheet to create
    :param data: DataFrame to output
    :return: None (writes into wb as a side effect)
    """
    n_rows, n_cols = data.shape
    fmt_body = wb.add_format(body_text_format_02)
    fmt_body_per = wb.add_format(body_text_per_format_02)
    fmt_header = wb.add_format(subtitle_format)
    ws = wb.add_worksheet(sheetname)
    ws.freeze_panes(1, 4)  # keep header row and first four columns visible
    ws.autofilter(0, 0, n_rows, n_cols - 1)
    ws.hide_gridlines({'option': 1})
    headers = data.columns
    # Width of each column follows its header text, never below 8.
    for idx, name in enumerate(headers):
        ws.set_column(idx, idx, max(8, get_same_len(name)))
    data = data.replace(np.nan, '').replace(np.inf, 'Inf')
    for idx, name in enumerate(headers):
        ws.write(0, idx, name, fmt_header)
    # Percentage-looking columns ('%', '占比', 'rate') use the percent
    # number format; everything else uses the plain body format.
    for r in range(n_rows):
        for c in range(n_cols):
            name = headers[c]
            cell_fmt = fmt_body_per if ('%' in name or '占比' in name or 'rate' in name) else fmt_body
            ws.write(r + 1, c, data.iloc[r][c], cell_fmt)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022/3/4 20:28
# @Author : Feng Zhanpeng
# @File : step13_rule_calculate.py
# @Software: PyCharm
"""
单维度策略测算代码执行顺序:
1. 加载策略测算过程中需要使用的功能函数,因函数内容较多,具体详见本书附件step11_calculate_fun.py;
2. 加载基于xlsxwriter包的分析结果自动化输出函数,因函数内容较多,具体详见本书附件step12_output_fun.py;
3. 加载Pyhon包
4. 配置相关分析文档路径及分析结果输出文档名称
5. 读入数据并进行数据预处理;
6. 配置函数所需的各种参数;
7. 基于加载的函数进行描述性统计分析、变量分箱、规则效果分析和筛选、变量分析和筛选结果汇总;
8. 基于xlsxwriter包自动化输出策略测算结果;
9. 基于xlsxwriter包自动化输出待泛化的策略;
"""
# 3. 加载Python包
import pandas as pd
import numpy as np
import datetime
import re
import os
import xlsxwriter
# 4. Configure the analysis document paths and the output workbook name
# Data directory; replace with your own local path before running.
path = 'F:\\DataAna\\策略\\Chapter2\\2.2.7授信审批场景单维度策略开发\\'
# Output directory for the analysis results.
# FIX: `path` already ends with a backslash, so the original
# path + '\\rule_result\\' produced a doubled separator.
path_result = path + 'rule_result\\'
# exist_ok avoids the check-then-create race of the original
# `if not os.path.exists(...): os.makedirs(...)` pair.
os.makedirs(path_result, exist_ok=True)
# Name of the Excel file generated by the strategy calculation,
# stamped with the start time (YYYYMMDDHH).
starttime = datetime.datetime.now()
print('程序开始运行时间为:' + str(starttime))
excel_name = 'Var_ana_result_' + starttime.strftime('%Y%m%d%H') + '.xlsx'
# 5. Load the data and preprocess it
# Read the data dictionary of the variables to analyse; normalise the
# variable names (lower-case, tabs stripped) to match the data columns.
var_dict = pd.read_excel(path + "待分析变量的数据字典.xlsx")
var_dict.变量名 = var_dict.变量名.map(lambda x: str(x).lower().replace('\t', ''))
# Load the sample data.  read_csv opens and closes the file itself; the
# original passed an explicitly opened handle that was never closed.
my_data = pd.read_csv(path + 'rule_data.csv', encoding='utf-8')
# Keep applications from 2021-08..2021-10 that drew a loan within 30 days.
my_data = my_data[(my_data['apply_mth'].map(lambda x: (x >= '2021-08' and x <= '2021-10'))) & (my_data['if_loan_in_30'] == 1)]
my_data.columns = [x.lower() for x in my_data.columns]
# Drop fields that are not analysed.
my_data.drop(labels=['apply_day', 'apply_week', 'apply_mth', 'apply_refuse_flag',
                     'apply_id', 'product_name', 'agr_fpd_15', 'fpd_15_act', 'if_loan_flag',
                     'if_loan_in_30'], axis=1, inplace=True)
# Missing values: -999 / -9999 / -999999 are sentinel codes for "missing".
# .loc assignment replaces the original chained-indexing write
# (my_data[i][mask] = ...), which triggers SettingWithCopy behaviour and
# may silently fail to update the frame.
for i in my_data.columns[my_data.dtypes != 'object']:
    my_data.loc[my_data[i].isin([-999, -9999, -999999]), i] = np.nan
# Object-typed variables: strip whitespace, map sentinel strings to NaN,
# then try to cast to float; columns that cannot be cast are dropped.
for i in my_data.columns[my_data.dtypes == 'object']:
    my_data[i] = my_data[i].map(lambda x: str(x).strip())
    my_data.loc[my_data[i].isin(['-999', '-9999', '-999999']), i] = np.nan
    try:
        my_data[i] = my_data[i].astype('float64')
    except (ValueError, TypeError):  # genuinely non-numeric text column
        del my_data[i]
# Grey samples: any mob3_dpd_30_act value other than 1 is treated as good (0).
my_data['mob3_dpd_30_act'] = my_data['mob3_dpd_30_act'].map(lambda x: 1 if x == 1 else 0)
my_data1 = my_data
'''
6.配置函数所需的各种参数
seq :变量计数开始序号,若有10个变量,计算结果为seq,seq+1,...,seq+9;
sample_type : 测算样本类型,本节中的代码支持同时测算多个不同类型的样本,如可同时对不同产品样本进行策略挖掘;
sample_type_col : 若测算样本是全量样本(Total),则不需要配置该参数,否则需配置该参数,指名样本类型从哪个字段获取如何获取;
sample_type_target : 测算不同样本类型时对应要分析的目标字段;
target_ripe : 获取目标字段对应的是否成熟标签;
target_del_col : 筛选完样本后,最终应只剩下待测算变量和目标字段,目标字段在最后一列。该参数表示删除不需要用到的字段;
sub_div_bin :头部和尾部需要精细化分析的样本占比(比如5%,表示首尾5%的样本需要精细化分箱分析);
min_num : 每箱最小样本数;
target_min_rate : 不同目标字段对应的分箱中,每箱最小占比;
sample_type_lift : 规则阈值确定后,筛选的样本要满足的最小lift,不同目标字段,可设置不同的lift,基于lift衡量规则效果;
hit_num : 虽然设置了min_num参数,但是因为数据分布不均匀,筛选出来的分箱样本量可能会小于min_num。虽然最终的分箱满足其他预设的各种条件,如Lift表现较好,但是若样本量太少不满足大数定律也是不建议作为规则使用的,hit_num参数就是在min_num参数的基础上,对筛选规则触碰样本进行强制限制,小于该值不筛选该规则
'''
# Parameter settings for the calculation functions
# Sample window label written into the output tables.
sample_range = '202108-202110'
# Starting sequence number for analysed variables.
seq = 1
# sample_type only contains 'Total' here (measure the full sample), so the
# sample_type_col parameter is not needed; several sample types could be
# measured at the same time.
sample_type = ['Total']
# If sample_type had to measure type-1 and type-2 samples, stored in the
# field 'ordertype', sample_type_col would look like:
# sample_type_col = {
#     '类型1': ['ordertype', ['类型1']],
#     '类型2': ['ordertype', ['类型2']]
# }
# Target fields to analyse for each sample type.
sample_type_target = {'Total': ['fpd_30_act','mob3_dpd_30_act']}
# Maturity flag column for each target field.
target_ripe = {'fpd_30_act':['agr_fpd_30'],'mob3_dpd_30_act':['agr_mob3_dpd_30']}
# Columns to drop per target so only candidate variables + target remain.
target_del_col = {'fpd_30_act': ['agr_fpd_30','agr_mob3_dpd_30','mob3_dpd_30_act'],
                  'mob3_dpd_30_act':['agr_mob3_dpd_30','agr_fpd_30','fpd_30_act']}
# Share of head/tail samples given fine-grained binning (10% each end).
sub_div_bin = 0.1
# Minimum per-bin sample share for each target field.
target_min_rate = {'fpd_30_act':[0.015],'mob3_dpd_30_act':[0.02]}
# Minimum sample count per bin.
min_num = 40
# Hard floor on rule-hit sample count; rules hitting fewer samples are dropped.
hit_num=30
# Minimum Lift a candidate rule must reach, per sample type and target.
sample_type_lift = { 'Total': {'fpd_30_act':1.8,'mob3_dpd_30_act':3}}
# 7. Run the loaded functions: descriptive statistics, variable binning,
#    rule-effect analysis/filtering, and the selection summary
# Descriptive statistical analysis; ana_people identifies the analyst.
var_select_01 = describe_stat_ana(describe_data=my_data1, sample_range=sample_range, seq=seq, sample_type=sample_type, ana_people='fzp')
# Strategy effect analysis and filtering.
filter2 = bin_result_summary_final(hit_num=hit_num, bindata=my_data1, var_select01=var_select_01, sub_div_bin=sub_div_bin, min_num=min_num, sample_type=sample_type, method='best', numOfSplit=25)
# Detailed binning results per variable.
bins_result_detail = bin_result_detail(bindata=my_data1, var_select02=filter2, sample_type=sample_type, sub_div_bin=sub_div_bin, min_num=min_num, method='best', numOfSplit=25)
# Summary of variable analysis and filtering.
summary_info = get_summary(filter2)
# 8. Automatically export the calculation results with xlsxwriter
wb = xlsxwriter.Workbook(path_result + excel_name)
var_summary_result_output(wb=wb, sheetname='变量筛选汇总', data=summary_info, start=0)
summary_result_output(wb=wb, sheetname='1.变量基础分析和筛选', data=var_select_01)
details_result_output(wb=wb, sheetname='2.变量分箱', data=bins_result_detail, suoyin=0, ana_people='fzp')
# Correlation analysis/filtering is skipped here because few variables remain.
summary_result_output(wb=wb, sheetname='3.变量效果分析和筛选', data=filter2)
wb.close()
# 9. Automatically export the rules to be generalised
var_summary = filter2
# Only rules whose label-3 flag is 'Y' are generalised.
var_summary01 = var_summary[var_summary.标签3 == 'Y']
# .copy() so the column assignments below do not write into a slice view
# (SettingWithCopy); the original assigned into the raw slice.
var_summary02 = var_summary01[['序号','分析时间','样本类型','样本区间','坏客户定义', '变量英文名', '变量中文名','type','Threshold','%Bad_Rate(包含缺失值)', '#Obs','%Obs', '#Bad', '%Bin_Bad_Rate','Odds','Lift']].copy()
var_summary02.rename(columns={'序号':'Seq','分析时间':'Ana_Date','样本类型':'Rule_Limit','样本区间':'Sample_Range','坏客户定义':'Target', '变量英文名':'Var', '变量中文名':'Description','type':'Direction','%Bad_Rate(包含缺失值)':'%Bad_Rate'},inplace=True)
# Substitute the time-window token ('XX时间') in each Chinese description
# with the window suffix parsed from the variable's English name.
varChinese = []
for i, j in zip(var_summary02.loc[:, 'Var'], var_summary02.loc[:, 'Description']):
    print(i, j)
    try:
        # Last "digits + up to 3 chars" token of the name, e.g. '30d'.
        pipei = re.findall(r'[0-9]+.{1,3}', i)[-1]
        # Locate the token literally.  The original used
        # re.search(pipei, i), which treats the matched text as a regex
        # and misbehaves when it contains metacharacters.
        suoyin = i.find(pipei)
        pipei = i[suoyin:]
        varChinese.append(j.replace('XX时间', pipei))
    except Exception:  # no token found / non-string description: keep as-is
        varChinese.append(j)
var_summary02['Description'] = varChinese
var_summary02['Rule_Name'] = ['single_var_' + i for i in var_summary02.loc[:, 'Seq']]
var_summary02['Rule_Category'] = '单变量规则'
# FR = first-payment-default rule, CR = credit-risk rule (by target).
var_summary02['Rule_Type'] = var_summary02['Target'].map(lambda x: 'FR' if x == 'fpd_30_act' else 'CR')
var_summary02 = var_summary02[['Ana_Date','Seq','Sample_Range','Rule_Name','Rule_Category','Rule_Type','Rule_Limit','Target','Var','Description','Direction','Threshold','%Bad_Rate', '#Obs', '%Obs','#Bad', '%Bin_Bad_Rate', 'Odds','Lift']]
# Sort rules by the numeric part of the sequence number.
var_summary02['id'] = var_summary02.Seq.map(lambda x: int(x.replace('fzp', '')))
var_summary02.sort_values(by='id', inplace=True)
del var_summary02['id']
# Automatically export the rules awaiting generalisation.
wb = xlsxwriter.Workbook(path_result + '规则字典.xlsx')
var_summary_result_output_01(wb=wb, sheetname='规则字典', data=var_summary02)
wb.close()
# (Leftover non-code text from the source of this paste; commented out so the
# file remains valid Python.)
# 路径为E:\Programs\新建文件夹\
# 最新发布