[bigdata-127] 评分卡模型和ipython notebook

1. 官网
  https://ipython.org/
  ipython,在交互计算能提高效率。交互计算,需要导入不同的包,每一步可能需要看结果,然后决定下一步做什么。有时候需要检查不同函数效果,不同参数效果,输出中间值。这些在pycharm里做会比较麻烦,需要写print输出,或者debug,但无论print还是debug都不如ipython方便。
  当调整好ipython上的程序后,可以再写成正式代码运行。

2. 安装
  2.1 如果安装anaconda,会自带ipython和jupyter
  2.2 pip3 install jupyter

3.运行
  3.1 在命令行执行"ipython",启动了命令行的交互环境
  3.2 ipython在4.0版本之后,只聚焦于交互式python内核,其他部分包括notebook,通信协议,qtconsole、notebook web应用等等使用新项目名jupyter
    3.2.1 "jupyter console" 启动命令行,效果等同于3.1
    3.2.2 "jupyter notebook",会在浏览器启动页面,打开一个新的web界面。
  3.3 建议优先使用jupyter notebook

4.文档
  4.1 jupyter文档
    https://jupyter.readthedocs.io/en/latest/tryjupyter.html
  4.2 有用介绍
    https://blog.youkuaiyun.com/qq_37423198/article/details/76180905
    https://www.jianshu.com/p/63175f02749b

5. 第一个例子
  在ipython notebook新建一个目录,在目录下创建新文件,注意,选择anaconda notebook。
  

6. 信用卡评分

6.1 参考文档
  https://www.jianshu.com/p/f931a4df202c
  https://www.jianshu.com/p/159f381c661d

6.2 py代码

{截至到目前为止,本代码是中文世界里的最佳评分卡模型入门实践,完整,简单,注释充分!:)}

#!/usr/bin/env python

import bisect
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats.stats as stats
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# on a modern install this must be `from sklearn.model_selection import train_test_split`.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, auc

def calc_woe_iv(d2, Y, X):
    """Compute per-bin WOE values and the overall IV for one binned attribute.

    Args:
        d2: pandas GroupBy of a frame with columns 'X' and 'Y', grouped by bin
            ('Bucket').
        Y: full target Series for the training set; 1 = good sample, 0 = bad.
        X: raw attribute Series (currently unused here; kept for interface
           symmetry with the binning callers — TODO consider removing).

    Returns:
        (d4, iv, woe): d4 is the per-bin stats DataFrame sorted by each bin's
        minimum value, iv is the information value (float), and woe is the
        per-bin WOE list (rounded to 3 decimals, in d4's order).
    """
    # Per-bin statistics.
    # Minimum attribute value in each bin; can later be used to assign an
    # unseen sample to a bin.
    d3 = pd.DataFrame({'min': d2.min().X})
    # Good samples per bin (Y is 1 for good, so the sum counts the goods).
    d3['sum'] = d2.sum().Y
    # Total samples per bin.
    d3['total'] = d2.count().Y
    ## WOE: good samples have Y == 1 and bad samples Y == 0, so Y.sum() is the
    ## number of good samples and Y.count() the total — only two classes exist.
    # Total good samples in the training set.
    n_good = Y.sum()
    # Total bad samples in the training set.
    n_bad = Y.count() - n_good
    # WOE per bin: log of (bin good/bad odds) over (global good/bad odds).
    d3['woe'] = np.log(
        ((d3['sum'] / d3['total']) / (1 - d3['sum'] / d3['total'])) /
        (n_good / n_bad)
    )
    # Re-order rows by bin minimum for output.
    d4 = (d3.sort_values(by='min')).reset_index(drop=True)

    # For IV: each bin's share of all good samples.
    d3['goodattribute'] = d3['sum'] / n_good
    # For IV: each bin's share of all bad samples.
    d3['badattribute'] = (d3['total'] - d3['sum']) / n_bad
    # IV: sum over bins of WOE weighted by the good-share/bad-share gap.
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()

    # WOE values as a plain list, rounded to 3 decimals.
    woe = list(d4['woe'].round(3))

    return d4, iv, woe

# Automatic (monotonicity-driven) binning.
def mono_bin(Y, X, n = 20):
    """Quantile-bin X, shrinking the bin count until the per-bin means of X
    and Y are perfectly Spearman-correlated, then return WOE/IV statistics.

    Args:
        Y: target Series (1 = good, 0 = bad).
        X: continuous attribute Series to bin.
        n: initial (maximum) number of quantile bins to try.

    Returns:
        (d4, iv, cut, woe): per-bin stats DataFrame, information value,
        cut points (with -inf/+inf sentinels), and per-bin WOE list.
    """
    # Spearman correlation, initialised so the loop runs at least once.
    r = 0
    # Pick the bin count: decrease n until |r| reaches 1 between per-bin means
    # of X and Y. This is a heuristic; other binning strategies would work too.
    # NOTE(review): pd.qcut can raise on duplicate bin edges for very skewed
    # data — confirm the inputs never trigger that here.
    while np.abs(r) < 1:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        d2 = d1.groupby('Bucket', as_index = True)
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
        n = n - 1
    # After the loop, d2 holds the grouping for the accepted bin count.

    # Compute IV and WOE for the chosen binning.
    d4, iv, woe = calc_woe_iv(d2, Y, X)

    # Build cut points. If the loop ends with n == 3 the accepted binning has
    # n + 1 == 4 bins, so we need 4 boundaries:
    # [-inf, q25], (q25, q50], (q50, q75], (q75, +inf].
    # At predict time these cut points discretise unseen samples.
    cut = []
    cut.append(float('-inf'))
    for i in range(1, n + 1):
        qua = X.quantile(i / (n + 1))
        cut.append(round(qua, 4))
    cut.append(float('inf'))

    return d4, iv, cut, woe

# Manual binning with user-supplied cut points.
def self_bin(Y, X, cat):
    """Bin X by the hand-picked cut points in `cat` and return its
    (per-bin stats, IV, WOE) triple via calc_woe_iv."""
    frame = pd.DataFrame({'X': X, 'Y': Y, 'Bucket': pd.cut(X, cat)})
    grouped = frame.groupby('Bucket', as_index=True)
    return calc_woe_iv(grouped, Y, X)

# WOE substitution: map every raw value to the WOE of the bin it falls into.
def replace_woe(series, cut, woe):
    """Replace each raw value with the WOE of its bin.

    For a value v the bin index is the largest m (capped at the last bin)
    with cut[m] <= v; cut[0] == -inf guarantees a match. NaN values fall
    through to the last bin, matching the original scan's behaviour.

    Args:
        series: iterable of raw attribute values (list or pandas Series).
        cut: ascending bin edges, cut[0] == -inf and cut[-1] == +inf.
        woe: per-bin WOE values, len(woe) == len(cut) - 1.

    Returns:
        list of WOE values, one per input value.
    """
    last_bin = len(cut) - 2  # index of the highest bin
    result = []
    for value in series:
        # bisect_right - 1 is the rightmost edge <= value; clamp to the last
        # bin because cut[-1] == +inf only marks the open upper boundary.
        m = min(bisect.bisect_right(cut, value) - 1, last_bin)
        result.append(woe[m])
    return result

print("读数据")
data = pd.read_csv("./data-set/give-me-some-credit/cs-training.csv")

#删除第一列:读数据的时候,会把csv的第一列序号读进来,这是不需要的。
#axis=1表示删除列(axis=0表示删除行),inplace表示对data进行删除,执行后,data存储内容改变了。否则需要创建一个新变量保存删除后的结果。
data.drop(["Unnamed: 0"], axis=1, inplace=True)

#SeriousDlqin2yrs是第一列,值为0,表示不逾期,值为1表示逾期。为便于理解,做一次变换:将1设为不逾期,也就是好样本,0设为逾期,也就是坏样本。
data['SeriousDlqin2yrs'] = 1-data['SeriousDlqin2yrs']

#检查哪些属性存在缺失值
print("获取数据的统计信息")
data.describe().to_csv('DataDescribe.csv')

"""
#从DataDescribe.csv能看到,MonthlyIncome和NumberOfDependents的count,都不够15万,都存在数据缺失,因此需要处理。
#在这里,为简单计,删除属性缺失的样本。
#axis=0表示,如果一个行的某个字段缺失数据,那么删除这一行。
"""
print("删除属性缺失样本")
data.dropna(axis=0, inplace=True)
#删除之后行数和列数是120269x11
# print(data.shape)

print('绘制各属性箱线图')
# Box-plot each attribute to eyeball the data; anomalies visible in the plots
# suggest which samples to delete.
# List all the column names:
# print(data.columns)
# Output: ['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
#          'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
#          'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
#          'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
#          'NumberOfDependents']
# The first column is the prediction target and needs no plot; plot the
# remaining 10 attributes one by one.
# plt.boxplot(data["RevolvingUtilizationOfUnsecuredLines"])
# plt.boxplot(data["age"])
# plt.boxplot(data["NumberOfTime30-59DaysPastDueNotWorse"])
# plt.boxplot(data["DebtRatio"])
# plt.boxplot(data["MonthlyIncome"])
# plt.boxplot(data["NumberOfOpenCreditLinesAndLoans"])
# plt.boxplot(data["NumberOfTimes90DaysLate"])
# plt.boxplot(data["NumberRealEstateLoansOrLines"])
# plt.boxplot(data["NumberOfTime60-89DaysPastDueNotWorse"])
# plt.boxplot(data["NumberOfDependents"])
# plt.show()

print('根据箱线图删除离群点')
# Remove the outliers flagged by the box plots: shown here only for
# NumberOfTime30-59DaysPastDueNotWorse (values >= 90 look anomalous in the
# plot); the other attributes would be handled the same way.
data = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 90]

print('把数据切分为训练集和测试集')
# Split the data into a training set and a test set.
# Target column.
Y = data['SeriousDlqin2yrs']
# Feature columns: every column except the first (the target).
# FIX: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
# iloc is the positional-indexing equivalent.
X = data.iloc[:, 1:]
# Hold out 30% of the data as the test set; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# Re-assemble target + features into single frames for later processing.
train_set = pd.concat([Y_train, X_train], axis=1)
test_set = pd.concat([Y_test, X_test], axis=1)

print('探索性分析:根据直方图对数据进行进一步约束')
# Exploratory analysis: plot histograms of the 10 feature attributes of the
# training set and look for obvious anomalies.
# plt.hist(train_set['RevolvingUtilizationOfUnsecuredLines'], bins= 10)
# plt.hist(train_set['age'], bins= 10)
# plt.hist(train_set['NumberOfTime30-59DaysPastDueNotWorse'], bins= 10)
# plt.hist(train_set['DebtRatio'], bins= 10)
# plt.hist(train_set['MonthlyIncome'], bins= 10)
# plt.hist(train_set['NumberOfOpenCreditLinesAndLoans'], bins= 10)
# plt.hist(train_set['NumberOfTimes90DaysLate'], bins= 10)
# plt.hist(train_set['NumberRealEstateLoansOrLines'], bins= 10)
# plt.hist(train_set['NumberOfTime60-89DaysPastDueNotWorse'], bins= 10)
# plt.hist(train_set['NumberOfDependents'], bins= 10)
# plt.show()

# Based on the histograms, optionally drop clearly unreasonable values so the
# distributions look sane. This is heuristic and not necessarily accurate.
# train_set = train_set[train_set['RevolvingUtilizationOfUnsecuredLines'] < 10]
# train_set = train_set[train_set['NumberOfTime30-59DaysPastDueNotWorse'] < 6]

print('自动分箱')
# Automatic monotonic binning for the continuous attributes x1, x2, x4, x5.
dfx1, ivx1, cutx1, woex1=mono_bin(train_set.SeriousDlqin2yrs, train_set.RevolvingUtilizationOfUnsecuredLines,n=10)
dfx2, ivx2, cutx2, woex2=mono_bin(train_set.SeriousDlqin2yrs, train_set.age, n=10)
dfx4, ivx4, cutx4, woex4 =mono_bin(train_set.SeriousDlqin2yrs, train_set.DebtRatio, n=20)
dfx5, ivx5, cutx5, woex5 =mono_bin(train_set.SeriousDlqin2yrs, train_set.MonthlyIncome, n=10)

print('手工分箱')
# +inf / -inf sentinels for the open-ended outer bins.
pinf = float('inf')
ninf = float('-inf')
# Hand-picked cut points (chosen by inspecting the data) for the discrete
# attributes x3, x6, x7, x8, x9, x10.
cutx3 = [ninf, 0, 1, 3, 5, pinf]
cutx6 = [ninf, 1, 2, 3, 5, pinf]
cutx7 = [ninf, 0, 1, 3, 5, pinf]
cutx8 = [ninf, 0, 1, 2, 3, pinf]
cutx9 = [ninf, 0, 1, 3, pinf]
cutx10 = [ninf, 0, 1, 2, 3, 5, pinf]
dfx3, ivx3, woex3 = self_bin(train_set.SeriousDlqin2yrs, train_set['NumberOfTime30-59DaysPastDueNotWorse'], cutx3)
dfx6, ivx6, woex6 = self_bin(train_set.SeriousDlqin2yrs, train_set['NumberOfOpenCreditLinesAndLoans'], cutx6)
dfx7, ivx7, woex7 = self_bin(train_set.SeriousDlqin2yrs, train_set['NumberOfTimes90DaysLate'], cutx7)
dfx8, ivx8, woex8 = self_bin(train_set.SeriousDlqin2yrs, train_set['NumberRealEstateLoansOrLines'], cutx8)
dfx9, ivx9, woex9 = self_bin(train_set.SeriousDlqin2yrs, train_set['NumberOfTime60-89DaysPastDueNotWorse'], cutx9)
dfx10, ivx10, woex10 = self_bin(train_set.SeriousDlqin2yrs, train_set['NumberOfDependents'], cutx10)


"""
#iv筛选
#绘制iv值:iv值评估一个属性的信息量,一般认为:
#< 0.02: unpredictive
#0.02 to 0.1: weak
#0.1 to 0.3: medium
#0.3 to 0.5: strong
#> 0.5: suspicious
#本例,小于0.1的属性,都会删除,不参与训练,也就是不参与建模。
"""
# ivlist = [ivx1, ivx2, ivx3, ivx4, ivx5, ivx6, ivx7, ivx8, ivx9, ivx10]
# index = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
# fig1 = plt.figure(1)
# ax1 = fig1.add_subplot(1, 1, 1)
# x = np.arange(len(index)) + 1
# ax1.bar(x, ivlist, width=0.4)
# ax1.set_xticks(x)
# ax1.set_xticklabels(index, rotation=0, fontsize=12)
# ax1.set_ylabel('IV(Information Value)', fontsize=14)
# for a, b in zip(x, ivlist):
#     plt.text(a, b + 0.01, '%.4f' % b, ha='center', va='bottom', fontsize=10)
# plt.show()
# exit(1)

"""
#相关性筛选:如果两个属性的相关性超过某个阈值,则删除iv较低的属性,该属性不参与建模
#计算各变量的相关性系数
#本例,相关性都比较小,不需要去除
"""
# corr = train_set.corr()
# xticks = ['x0','x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']#x轴标签
# yticks = list(corr.index)#y轴标签
# fig = plt.figure()
# ax1 = fig.add_subplot(1, 1, 1)
# sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1, annot_kws={'size': 9, 'weight': 'bold', 'color': 'blue'})#绘制相关性系数热力图
# ax1.set_xticklabels(xticks, rotation=0, fontsize=10)
# ax1.set_yticklabels(yticks, rotation=0, fontsize=10)
# plt.show()
# exit(1)


"""
把训练集和测试集的值替换成woe。
分箱操作,会计算出如何对数据进行离散化。
比如,对RevolvingUtilizationOfUnsecuredLines属性进行离散化,是将这个属性转换成4个属性,这4个属性都是二值的,其值是0或1。
这会导致一个问题:属性数量扩大太多了,这会导致训练和预测上的计算复杂度,属性多计算量就大。
将一个属性离散成多个二值属性,等价于给一个属性设置多个值,因为在计算距离上是等价的,关键在于如何选择多值。woe替换就是一种合适的方案。
"""
print("woe替换")
#要做一次reset_inde,让index重新变成[0,1,2...]此前的index经过多次操作已经不连续了,在woe替换的时候会出错
train_set = train_set.reset_index(drop=True, inplace=False)
#开始替换woe
train_set['RevolvingUtilizationOfUnsecuredLines'] = pd.Series(
    replace_woe(train_set['RevolvingUtilizationOfUnsecuredLines'], cutx1, woex1))
train_set['age'] = pd.Series(replace_woe(train_set['age'], cutx2, woex2))
train_set['NumberOfTime30-59DaysPastDueNotWorse'] = pd.Series(
    replace_woe(train_set['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, woex3))
train_set['DebtRatio'] = pd.Series(replace_woe(train_set['DebtRatio'], cutx4, woex4))
train_set['MonthlyIncome'] = pd.Series(replace_woe(train_set['MonthlyIncome'], cutx5, woex5))
train_set['NumberOfOpenCreditLinesAndLoans'] = pd.Series(replace_woe(train_set['NumberOfOpenCreditLinesAndLoans'], cutx6, woex6))
train_set['NumberOfTimes90DaysLate'] = pd.Series(replace_woe(train_set['NumberOfTimes90DaysLate'], cutx7, woex7))
train_set['NumberRealEstateLoansOrLines'] = pd.Series(replace_woe(train_set['NumberRealEstateLoansOrLines'], cutx8, woex8))
train_set['NumberOfTime60-89DaysPastDueNotWorse'] = pd.Series(
    replace_woe(train_set['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, woex9))
train_set['NumberOfDependents'] = pd.Series(replace_woe(train_set['NumberOfDependents'], cutx10, woex10))

#reset index: 切分训练集和测试集之后,它们的index是不连续的,不适合遍历,reset之后,index又变成连续的
test_set = test_set.reset_index(drop=True, inplace=False)
#开始替换woe
test_set['RevolvingUtilizationOfUnsecuredLines'] = pd.Series(
    replace_woe(test_set['RevolvingUtilizationOfUnsecuredLines'], cutx1, woex1))
test_set['age'] = pd.Series(replace_woe(test_set['age'], cutx2, woex2))
test_set['NumberOfTime30-59DaysPastDueNotWorse'] = pd.Series(
    replace_woe(test_set['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, woex3))
test_set['DebtRatio'] = pd.Series(replace_woe(test_set['DebtRatio'], cutx4, woex4))
test_set['MonthlyIncome'] = pd.Series(replace_woe(test_set['MonthlyIncome'], cutx5, woex5))
test_set['NumberOfOpenCreditLinesAndLoans'] = pd.Series(replace_woe(test_set['NumberOfOpenCreditLinesAndLoans'], cutx6, woex6))
test_set['NumberOfTimes90DaysLate'] = pd.Series(replace_woe(test_set['NumberOfTimes90DaysLate'], cutx7, woex7))
test_set['NumberRealEstateLoansOrLines'] = pd.Series(replace_woe(test_set['NumberRealEstateLoansOrLines'], cutx8, woex8))
test_set['NumberOfTime60-89DaysPastDueNotWorse'] = pd.Series(
    replace_woe(test_set['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, woex9))
test_set['NumberOfDependents'] = pd.Series(replace_woe(test_set['NumberOfDependents'], cutx10, woex10))

"""
训练
"""
print("训练")
Y=train_set['SeriousDlqin2yrs']
X=train_set.drop(['SeriousDlqin2yrs','DebtRatio','MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents'],axis=1)
X1=sm.add_constant(X)
logit=sm.Logit(Y,X1)
result=logit.fit()
print(result.params)

print("预测")
Y_test = test_set['SeriousDlqin2yrs']
X_test = test_set.drop(['SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
                    'NumberRealEstateLoansOrLines', 'NumberOfDependents'], axis=1)
X3 = sm.add_constant(X_test)
resu = result.predict(X3)

# print("绘ROC图")
# fpr, tpr, threshold = roc_curve(Y_test, resu)
# rocauc = auc(fpr, tpr)
# plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
# plt.legend(loc='lower right')
# plt.plot([0, 1], [0, 1], 'r--')
# plt.xlim([0, 1])
# plt.ylim([0, 1])
# plt.ylabel('真正率')
# plt.xlabel('假正率')
# plt.show()

#评分
coe = result.params
p = 20 / math.log(2)
q = 600 - 20 * math.log(20) / math.log(2)
baseScore = round(q + p * coe[0], 0)
print("baseScore = ", baseScore)

# Score helpers.
def get_score(coe, woe, factor):
    """Turn one attribute's per-bin WOE values into per-bin score points:
    each bin scores round(coefficient * WOE * factor, 0)."""
    return [round(coe * w * factor, 0) for w in woe]

def compute_score(series, cut, score):
    """Map each raw value in `series` to the score points of its bin.

    Same bin lookup as the WOE substitution: the bin index is the largest m
    (capped at the last bin) with cut[m] <= value; cut[0] == -inf guarantees
    a match, and NaN falls through to the last bin as in the original scan.

    Args:
        series: iterable of raw attribute values (list or pandas Series).
        cut: ascending bin edges, cut[0] == -inf and cut[-1] == +inf.
        score: per-bin score points, len(score) == len(cut) - 1.

    Returns:
        list of score values, one per input value.
    """
    last_bin = len(cut) - 2  # index of the highest bin
    result = []
    for value in series:
        # bisect_right - 1 is the rightmost edge <= value; clamp to the last
        # bin because cut[-1] == +inf only marks the open upper boundary.
        idx = min(bisect.bisect_right(cut, value) - 1, last_bin)
        result.append(score[idx])
    return result

# Per-bin score points for each retained attribute. coe[0] is the intercept,
# so coe[1..5] line up with the five modelled attributes x1, x2, x3, x7, x9.
x1 = get_score(coe[1], woex1, p)
x2 = get_score(coe[2], woex2, p)
x3 = get_score(coe[3], woex3, p)
x7 = get_score(coe[4], woex7, p)
x9 = get_score(coe[5], woex9, p)

# Attach the base score and each attribute's bin score to the test set,
# then sum them into the final credit score per sample.
test_set['BaseScore']=pd.Series(np.zeros(len(test_set)))+baseScore
test_set['x1'] = pd.Series(compute_score(test_set['RevolvingUtilizationOfUnsecuredLines'], cutx1, x1))
test_set['x2'] = pd.Series(compute_score(test_set['age'], cutx2, x2))
test_set['x3'] = pd.Series(compute_score(test_set['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3))
test_set['x7'] = pd.Series(compute_score(test_set['NumberOfTimes90DaysLate'], cutx7, x7))
test_set['x9'] = pd.Series(compute_score(test_set['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, x9))
test_set['Score'] = test_set['x1'] + test_set['x2'] + test_set['x3'] + test_set['x7'] +test_set['x9']  + baseScore

print(test_set)

7.关于ks,参考  https://blog.youkuaiyun.com/wendaomudong_l2d4/article/details/72872206

8. 绘制roc曲线,参考 https://www.zhihu.com/question/22844912/answer/246037337

9. roc和ks的差别,参考 http://blog.sina.com.cn/s/blog_13bb711fd0102wqhq.html

10.关于评分,参考 https://blog.youkuaiyun.com/lll1528238733/article/details/76602006


    

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值