# Reference: https://blog.youkuaiyun.com/weixin_41761857/article/details/82026170
# Numerical variables
def NumVarTracker(df, col, target, filepath, truncation):
    """
    Plot the distribution of a numerical variable split by a binary target.

    Rows where ``col`` is null are dropped. The remaining values are drawn as
    two overlaid histograms (``target == 1`` labelled 'bad', ``target == 0``
    labelled 'good'), each weighted so that its bars sum to 100%. The figure
    is saved as ``<col>.png`` inside ``filepath``.

    :param df: the dataset containing the numerical independent variable and
        the dependent variable
    :param col: name of the numerical independent variable
    :param target: name of the dependent variable, class of 0-1
    :param filepath: directory in which the histogram image is saved
    :param truncation: if truthy, values above the 95th percentile of the
        non-null values are capped at that percentile before plotting
    :return: the descriptive statistics (``describe()``) of the non-null values
    """
    import os  # local import: only needed to build a portable save path

    # NaN is never equal to itself, so the self-comparison keeps non-null rows.
    valid_df = df.loc[df[col] == df[col]][[col, target]]

    # Share of non-null records; an empty dataset is reported as 0%.
    total_rows = df.shape[0]
    valid_ratio = float(valid_df.shape[0]) / total_rows if total_rows else 0.0
    valid_ratio_fmt = "%.2f%%" % (valid_ratio * 100)

    # Descriptive statistics shown in the plot title.
    desc_stats = valid_df[col].describe()
    mu = "%.2e" % desc_stats['mean']
    std = "%.2e" % desc_stats['std']

    # Values of the variable for each class of the target.
    bad = valid_df.loc[valid_df[target] == 1][col]
    good = valid_df.loc[valid_df[target] == 0][col]

    # Optionally cap outliers at the 95th percentile of all valid values.
    # Skipped when there is nothing to compute a percentile from.
    if truncation and not valid_df.empty:
        cap = np.percentile(valid_df[col], 95)
        bad = bad.clip(upper=cap)
        good = good.clip(upper=cap)

    # Each observation contributes 100/n so every histogram sums to 100%.
    # max(..., 1) avoids a division by zero when a class has no rows.
    bad_weights = np.full(bad.size, 100.0 / max(bad.size, 1))
    good_weights = np.full(good.size, 100.0 / max(good.size, 1))

    fig, ax = plt.subplots()
    ax.hist(bad, weights=bad_weights, alpha=0.5, label='bad', color='red')
    ax.hist(good, weights=good_weights, alpha=0.5, label='good')
    title_text = 'Histogram of %s\nvalid_pcnt=%s,Mean =%s,Std=%s' % (
        str(col), valid_ratio_fmt, mu, std)
    ax.set(title=title_text, ylabel='% of Dataset in Bin')
    ax.margins(0.05)
    ax.set_ylim(bottom=0)
    ax.legend(loc='upper right')

    # Save through the figure object so the result does not depend on which
    # figure pyplot currently considers active. The figure is left open, as
    # before; callers producing many plots may want to call plt.close('all').
    fig.savefig(os.path.join(filepath, str(col) + '.png'))
    return desc_stats
# Categorical (string) variables
def CharVarPerf(df, col, target, filepath):
    """
    Plot, for a categorical variable, each category's share and bad rate.

    Rows where ``col`` is null are dropped. For every remaining category the
    function computes its share of the non-null records ('percent') and the
    mean of ``target`` within it ('badrate'), draws the bad rate as a red line
    and the share as blue bars on a twin y-axis, and saves the figure as
    ``<col>.png`` inside ``filepath``.

    :param df: the dataset containing the categorical independent variable
        and the dependent variable
    :param col: name of the categorical independent variable
    :param target: name of the dependent variable, class of 0-1
    :param filepath: directory in which the chart image is saved
    :return: DataFrame indexed by category with columns 'percent' and
        'badrate', in order of first appearance of each category
    """
    import os  # local import: only needed to build a portable save path

    # NaN is never equal to itself, so the self-comparison keeps non-null rows.
    valid_df = df.loc[df[col] == df[col]][[col, target]]

    # Share of non-null records; an empty dataset is reported as 0%.
    total_rows = df.shape[0]
    record_count = valid_df.shape[0]
    valid_ratio = float(record_count) / total_rows if total_rows else 0.0
    valid_ratio_fmt = "%.2f%%" % (valid_ratio * 100)

    # unique() yields categories in order of first appearance, which keeps the
    # x-axis order stable between runs (iterating a set of strings does not).
    categories = list(valid_df[col].unique())
    percents = []
    bad_rates = []
    for value in categories:
        group_target = valid_df.loc[valid_df[col] == value, target]
        group_size = group_target.shape[0]
        percents.append(group_size * 1.0 / record_count)
        bad_rates.append(sum(group_target) * 1.0 / group_size)
    desc_stats = pd.DataFrame(
        {'percent': percents, 'badrate': bad_rates},
        index=categories,
        columns=['percent', 'badrate'],
    )

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax2 = ax.twinx()  # second y-axis sharing the same x-axis as ax
    ax2.set_title('The percentage and badRate for ' + str(col)
                  + '\n valid pcnt =' + valid_ratio_fmt)
    # With no valid rows there is nothing to draw; an empty, titled chart is
    # still saved so batch runs over many columns do not abort.
    if not desc_stats.empty:
        desc_stats['badrate'].plot(kind='line', color='red', ax=ax)
        desc_stats['percent'].plot(kind='bar', color='blue', ax=ax2,
                                   width=0.2, position=1)
    ax.set_ylabel('badrate')
    ax2.set_ylabel('percentage')

    # Save through the figure object so the result does not depend on which
    # figure pyplot currently considers active.
    fig.savefig(os.path.join(filepath, str(col) + '.png'))
    return desc_stats