通过matplotlib分析LR

最新推荐文章于 2025-09-15 09:06:34 发布

原创最新推荐文章于 2025-09-15 09:06:34 发布 · 1.1k 阅读

0 ·

CC 4.0 BY-SA版权

机器学习专栏收录该内容

6 篇文章

订阅专栏

LR 的学习除了理论学习外，借助一些具象化的东西会理解得比较深入。本文实现了 3D 的效果图，不过回想起来还是 http://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic.html#sphx-glr-auto-examples-linear-model-plot-logistic-py 的解释会更容易理解一些。

当前内容没有完全说清楚，后续会找时间补齐

以下代码可以分析以下问题：

1.对于一维特征，其参数对logit的影响：

参见 draw_logit() 函数：参数为 0 时，LR 模型就是一条水平线；随着参数的绝对值越来越大，logit 曲线越来越陡峭，对正反例集合的区分度也就越高。

2.为什么要求特征分布单调（特征值越大，label 为某一单一值的占比越高）：因为 LR 曲线本身只能切一刀（与 y=1、y=0 两条直线只有 2 个交点，而正态分布需要 4 个点），对正态分布无法提供良好支持，效果反而会变弱。

3.为什么woe的方式会让效果变好：因为对特征的分布进行了调整，让特征分布变为递增

4.为什么分桶会让效果变好：这个跟是不是 LR 没关系……

以下是后续计划分析的：

5.梯度下降与牛顿方法，迭代过程中，会怎么影响logit曲线的形态：后续补充

6.sklearn的调参会怎样影响LR模型：后续分析

7.如何确定特征有效，或可以进一步深挖

8.woe与oneHot方法对于LR哪一个更好

9.如何评价最佳分桶？

#!/usr/bin/env python
# encoding: utf-8

import numpy as np
import matplotlib.pyplot as plt
import math
import random
def get_x2_by_x1(x1, type):
    """Map ``x1`` to a second coordinate according to ``type``.

    Supported values of ``type``:

    * ``'='``      -- return ``x1`` unchanged.
    * ``'sqrt'``   -- return ``x1 * x1`` (despite the name, this squares).
    * ``'circle'`` -- start from the unit-circle height
      ``sqrt(1 - x1 * x1)``, subtract a uniform draw from [0, 1), and
      negate the result with probability 0.5.  ``math.sqrt`` raises
      ``ValueError`` when ``abs(x1) > 1``.

    Any other ``type`` yields ``0``.
    """
    if type == '=':
        return x1
    if type == 'sqrt':
        return x1 * x1
    if type != 'circle':
        return 0

    # Draw order matters for reproducibility under a fixed seed: the first
    # draw picks the sign, the second one is the offset below the circle.
    keep_sign = random.random() < 0.5
    height = math.sqrt(1 - x1 * x1) - random.random()
    return height if keep_sign else -height

def get_cirle(offset_x=0, offset_y=0, _step=500):
    """Generate one randomised 2-D point cloud and shift it by an offset.

    The first coordinate walks over ``np.arange(-1, 1, 1.0 / _step)``; the
    second coordinate of each point comes from
    ``get_x2_by_x1(value, 'circle')``, which involves random draws.

    Args:
        offset_x: amount added to every first coordinate.
        offset_y: amount added to every second coordinate.
        _step: reciprocal of the grid spacing.  Callers in this file size
            their label lists on the assumption that the grid holds
            ``2 * _step`` points.

    Returns:
        A tuple ``(x1, x2)`` of two equally long lists.
    """
    grid = np.arange(-1, +1, 1.0 / _step)

    shifted_x2 = []
    for value in grid:
        shifted_x2.append(get_x2_by_x1(value, 'circle') + offset_y)

    shifted_x1 = [value + offset_x for value in grid]
    return shifted_x1, shifted_x2


def draw_defalut_circle(_each_sample_num):
    """Scatter-plot two randomised clusters and fit a linear model to them.

    Two point clouds are produced with ``get_cirle``: one offset by (0, 3)
    drawn with the default marker, and one offset by (2, 1) drawn as red
    crosses.  The clouds are stacked into a two-column feature matrix,
    labelled 0 and 1 respectively, and fitted with
    ``sklearn.linear_model.LinearRegression``.  The matrix shape, the label
    count and the fitted coefficients are printed, then the figure is
    shown.

    Args:
        _each_sample_num: passed to ``get_cirle`` as ``_step``; each cloud
            is labelled as containing ``2 * _each_sample_num`` points.
    """
    from sklearn import linear_model

    each_sample_num = _each_sample_num
    fig, ax = plt.subplots()

    # Cloud labelled 0.
    x1_first, x2_first = get_cirle(0, 3, each_sample_num)
    ax.scatter(x1_first, x2_first)

    # Cloud labelled 1.
    x1_second, x2_second = get_cirle(2, 1, each_sample_num)
    ax.scatter(x1_second, x2_second, c='r', marker='x')

    # One row per sample, columns are (x1, x2); labels follow the same order.
    f = np.c_[x1_first + x1_second, x2_first + x2_second]
    label = [0] * each_sample_num * 2 + [1] * each_sample_num * 2

    # NOTE(review): this is an ordinary least-squares fit, not a logistic
    # regression, although the surrounding article is about LR -- confirm
    # that this is the intended model.
    lr = linear_model.LinearRegression()
    # Single-argument print keeps the output identical on Python 2 and 3.
    print('%s %s' % (f.shape, len(label)))
    lr.fit(f, label)
    print('Coefficients: \n', lr.coef_)

    plt.show()



def draw_logit(_each_sample_num):
    """Plot centred sigmoid curves for several slope values on one axes.

    For each slope ``a`` the curve ``1 / (1 + exp(-a * x)) - 0.5`` is
    sampled on ``np.arange(-2, 2, 1.0 / 500)`` and drawn as a scatter plot.
    Negative slopes are drawn as yellow crosses, slopes 0 and 1 with the
    default scatter style, and slopes 2, 4 and 8 as red crosses.

    Args:
        _each_sample_num: accepted for signature parity with the other
            demos; it is not used by this function.
    """
    fig, ax = plt.subplots()
    grid = np.arange(-2, 2, 1.0 / 500)

    yellow_cross = {'c': 'y', 'marker': 'x'}
    red_cross = {'c': 'r', 'marker': 'x'}
    # (slope, scatter keyword arguments), listed in drawing order.  An empty
    # dict means scatter is called without explicit colour or marker.
    curves = (
        (-2, yellow_cross),
        (-1, yellow_cross),
        (0, {}),
        (1, {}),
        (2, red_cross),
        (4, red_cross),
        (8, red_cross),
    )

    for slope, style in curves:
        centred = [1.0 / (1 + math.exp(-1 * slope * k)) - 0.5 for k in grid]
        ax.scatter(grid, centred, **style)

    plt.show()

def draw_defalut_circle_3D(_each_sample_num):
    """Plot two labelled clusters and a fitted sigmoid response in 3-D.

    Two point clouds are produced with ``get_cirle``: one offset by (0, 0)
    drawn at height 0, and one offset by (2, 2) drawn at height 1 as red
    crosses.  The clouds are stacked into a two-column feature matrix,
    labelled 0 and 1, and fitted with
    ``sklearn.linear_model.LinearRegression``.  For every sample the
    sigmoid of the coefficient-weighted feature sum is computed, printed,
    and drawn as green circles above the (x1, x2) plane.

    Args:
        _each_sample_num: passed to ``get_cirle`` as ``_step``; each cloud
            is plotted and labelled as ``2 * _each_sample_num`` points.
    """
    # Importing Axes3D registers the '3d' projection on older matplotlib
    # releases; the name itself is not used below.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
    from sklearn import linear_model

    each_sample_num = _each_sample_num
    fig = plt.figure()
    # Constructing Axes3D(fig) directly no longer attaches the axes to the
    # figure on recent matplotlib releases, which leaves an empty window;
    # add_subplot with the '3d' projection is the documented way.
    ax = fig.add_subplot(111, projection='3d')

    # Cloud labelled 0, drawn at z = 0.
    x1_first, x2_first = get_cirle(0, 0, each_sample_num)
    ax.scatter(x1_first, x2_first, [0] * each_sample_num * 2)

    # Cloud labelled 1, drawn at z = 1.
    x1_second, x2_second = get_cirle(2, 2, each_sample_num)
    ax.scatter(x1_second, x2_second, [1] * each_sample_num * 2,
               c='r', marker='x')

    # One row per sample, columns are (x1, x2); labels follow the same order.
    f = np.c_[x1_first + x1_second, x2_first + x2_second]
    label = [0] * each_sample_num * 2 + [1] * each_sample_num * 2

    # NOTE(review): this is an ordinary least-squares fit, not a logistic
    # regression, and the fitted intercept is not fed into the sigmoid
    # below -- confirm that both are intended.
    lr = linear_model.LinearRegression()
    # Single-argument print keeps the output identical on Python 2 and 3.
    print('%s %s' % (f.shape, len(label)))
    lr.fit(f, label)
    print('Coefficients: \n', lr.coef_)

    def get_y(f, coef_):
        """Return ``sigmoid(row . coef_)`` for every row of ``f`` as a list."""
        y = []
        for row in f:
            k = sum(value * weight for value, weight in zip(row, coef_))
            y.append(1.0 / (1 + math.exp(-k)))
        return y

    y = get_y(f, lr.coef_)
    print(y)

    ax.scatter(f[:, 0], f[:, 1], y, c='g', marker='o')
    plt.show()

def draw_splashes():
    """Print ``draw`` and call ``plt.show()``."""
    # Parenthesised single-argument print behaves the same on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print('draw')
    plt.show()
# Script driver: uncomment the demo to run.  There is no __main__ guard, so
# the enabled call executes whenever this module is run or imported.
# Only the sigmoid-slope demo is enabled; draw_logit does not use its argument.
#draw_splashes()
#draw_defalut_circle(500)
draw_logit(50)
#draw_defalut_circle_3D(5)