阿里云大学笔记——逻辑回归_阿里云做回归检验-CSDN博客

本文深入解析逻辑回归算法原理，通过鸢尾花数据集演示了从数据预处理到模型训练的全过程，展示了如何使用自定义逻辑回归类进行二分类预测，并通过可视化结果验证模型的有效性。

0.逻辑回归概念

目的：解决分类问题。
实际：将线性组合 z=w^T·x 的连续取值通过sigmoid函数映射到(0,1)区间，该值表示样本属于类别1的概率。p>=0.5判定为类别1，否则判定为类别0。
逻辑回归是一种广义的线性回归：相当于把线性回归的输出值映射到(0,1)之间，再据此进行分类。
详细概念

在这里插入图片描述

1. 导入鸢尾花数据集

import numpy as np
import pandas as pd

# Load the iris measurements from disk.
data = pd.read_csv("data/iris.csv")

# Remove exact duplicate rows, modifying the frame in place.
data.drop_duplicates(inplace=True)

# Replace the species names in the "Name" column with integer class labels.
# Species handled: Iris-setosa, Iris-versicolor, Iris-virginica
species_to_label = {
    "Iris-versicolor": 0,
    "Iris-setosa": 1,
    "Iris-virginica": 2,
}
data["Name"] = data["Name"].map(species_to_label)

	SepalLength	SepalWidth	PetalLength	PetalWidth	Name
136	6.3	3.4	5.6	2.4	2
109	7.2	3.6	6.1	2.5	2
13	4.3	3.0	1.1	0.1	1
70	5.9	3.2	4.8	1.8	0
17	5.1	3.5	1.4	0.3	1
4	5.0	3.6	1.4	0.2	1
87	6.3	2.3	4.4	1.3	0
86	6.7	3.1	4.7	1.5	0
67	5.8	2.7	4.1	1.0	0
135	7.7	3.0	6.1	2.3	2
54	6.5	2.8	4.6	1.5	0
83	6.0	2.7	5.1	1.6	0
21	5.1	3.7	1.5	0.4	1
88	5.6	3.0	4.1	1.3	0
22	4.6	3.6	1.0	0.2	1
78	6.0	2.9	4.5	1.5	0
108	6.7	2.5	5.8	1.8	2
118	7.7	2.6	6.9	2.3	2
128	6.4	2.8	5.6	2.1	2
82	5.8	2.7	3.9	1.2	0

# Keep only the rows labelled 0 or 1 so that logistic regression can be
# applied as a two-class problem.
data = data.loc[data["Name"] != 2]
# len(data)

2. 逻辑回归算法

class LogisticRegression:
    """Binary logistic-regression classifier trained by batch gradient updates.

    Parameters
    ----------
    alpha : float
        Learning rate, i.e. the step size of every weight update.
    times : int
        Number of iterations over the whole training set.

    Attributes
    ----------
    w_ : numpy.ndarray
        Weights set by ``fit``; ``w_[0]`` is the intercept and ``w_[1:]``
        holds one coefficient per feature.
    loss_ : list
        Loss value recorded in each iteration, computed before that
        iteration's weight update.
    """

    def __init__(self, alpha, times):
        self.alpha = alpha
        self.times = times

    def sigmoid(self, z):
        """Apply the sigmoid function.

        Parameters
        ----------
        z : float or array-like
            The linear score ``z = w.T * x``.

        Returns
        -------
        p : float or array-like
            Value in [0, 1]: the probability that the sample belongs to
            class 1. ``p >= 0.5`` (i.e. ``z >= 0``) means class 1,
            otherwise class 0.
        """
        # float64 exp() overflows a little above 709; limiting z keeps the
        # computation warning-free while the result is already saturated.
        z = np.clip(z, -500, 500)
        return 1.0 / (1.0 + np.exp(-z))

    def fit(self, X, y):
        """Train the model on the given samples.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training features (list of lists, ndarray or DataFrame).
        y : array-like, shape (n_samples,)
            Target labels, each 0 or 1.
        """
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        # Work on the converted arrays (not the raw arguments) so that plain
        # Python lists are accepted as well as pandas objects.
        features = self.X.astype(float)
        targets = self.y.astype(float).reshape(-1)

        # Weight vector initialised to zero; one entry longer than the number
        # of features because index 0 stores the intercept.
        self.w_ = np.zeros(1 + features.shape[1])
        # Loss value of every iteration.
        self.loss_ = []

        # Bound used only when taking logarithms, so that a saturated
        # probability (exactly 0 or 1) cannot turn the loss into NaN/inf.
        eps = 1e-15

        for _ in range(self.times):
            z = np.dot(features, self.w_[1:]) + self.w_[0]
            # Probability of class 1 for every sample.
            p = self.sigmoid(z)
            # Loss: J(w) = -sum(y_i*log(s(z_i)) + (1-y_i)*log(1-s(z_i)))
            p_safe = np.clip(p, eps, 1 - eps)
            cost = -np.sum(
                targets * np.log(p_safe) + (1 - targets) * np.log(1 - p_safe)
            )
            self.loss_.append(cost)

            # Update rule: w_j = w_j + alpha * sum((y_i - s(z_i)) * x_ij)
            error = targets - p
            self.w_[0] += self.alpha * np.sum(error)
            self.w_[1:] += self.alpha * np.dot(features.T, error)

    def predict_proba(self, X):
        """Return the probabilities of class 0 and class 1 for each sample.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray, shape (n_samples, 2)
            Column 0 is the probability of class 0, column 1 of class 1.
        """
        X = np.asarray(X)
        z = np.dot(X, self.w_[1:]) + self.w_[0]
        # Make the probabilities a column so both classes can be joined
        # side by side.
        p = self.sigmoid(z).reshape(-1, 1)
        return np.concatenate([1 - p, p], axis=1)

    def predict(self, X):
        """Return the predicted class label (0 or 1) for each sample.

        A sample is assigned to class 1 when its class-1 probability is at
        least 0.5, which includes the exact tie p == 0.5.
        """
        class_one_proba = self.predict_proba(X)[:, 1]
        return (class_one_proba >= 0.5).astype(np.intp)

3.数据切分

# Separate the two classes, then shuffle every class with a fixed seed so the
# split is reproducible.
t1, t2 = (data[data["Name"] == label] for label in (0, 1))
t1 = t1.sample(n=len(t1), random_state=666)
t2 = t2.sample(n=len(t2), random_state=666)

# Training set: the first 40 shuffled rows of each class.
# The last column is the label; all columns before it are features.
X_train = pd.concat([part.iloc[:40, :-1] for part in (t1, t2)], axis=0)
y_train = pd.concat([part.iloc[:40, -1] for part in (t1, t2)], axis=0)

# Test set: the remaining rows of each class.
X_test = pd.concat([part.iloc[40:, :-1] for part in (t1, t2)], axis=0)
y_test = pd.concat([part.iloc[40:, -1] for part in (t1, t2)], axis=0)

4.逻辑回归进行二分类

# The iris features all share the same order of magnitude, so no
# standardisation is applied before training.
reg = LogisticRegression(times=20, alpha=0.001)
reg.fit(X_train, y_train)

# Predicted class probabilities for the test samples.
reg.predict_proba(X_test)

array([[0.70002701, 0.29997299],
       [0.83689443, 0.16310557],
       [0.79590141, 0.20409859],
       [0.76391151, 0.23608849],
       [0.7577496 , 0.2422504 ],
       [0.79310386, 0.20689614],
       [0.74918987, 0.25081013],
       [0.75201187, 0.24798813],
       [0.81234813, 0.18765187],
       [0.77947483, 0.22052517],
       [0.30889335, 0.69110665],
       [0.28541285, 0.71458715],
       [0.31146719, 0.68853281],
       [0.29001408, 0.70998592],
       [0.33239525, 0.66760475],
       [0.30123427, 0.69876573],
       [0.28658295, 0.71341705],
       [0.26853876, 0.73146124]])

# Predicted labels for the test samples.
result = reg.predict(X_test)

# Accuracy: the fraction of test samples whose prediction equals the label.
(result == y_test).mean()

1.0

5. 可视化展示

import matplotlib as mpl
import matplotlib.pyplot as plt

# Select the SimHei font for the Chinese labels below and turn off the
# Unicode minus sign on the axes.
mpl.rcParams.update({"font.family": "SimHei", "axes.unicode_minus": False})

# Predicted labels: large red dots.
plt.plot(result, "ro", markersize=15, label="预测值")
# True labels: green dots at the default marker size.
plt.plot(y_test.values, "go", label="真实值")

plt.title("逻辑回归")
plt.xlabel("样本序号")
plt.ylabel("类别")
plt.legend()

<matplotlib.legend.Legend at 0x205636e7488>

在这里插入图片描述

# Plot the recorded loss values against the 1-based iteration number.
iteration_numbers = range(1, reg.times + 1)
plt.plot(iteration_numbers, reg.loss_, "go-")

[<matplotlib.lines.Line2D at 0x2056367a408>]

在这里插入图片描述

	SepalLength	SepalWidth	PetalLength	PetalWidth	Name
136	6.3	3.4	5.6	2.4	2
109	7.2	3.6	6.1	2.5	2
13	4.3	3.0	1.1	0.1	1
70	5.9	3.2	4.8	1.8	0
17	5.1	3.5	1.4	0.3	1
4	5.0	3.6	1.4	0.2	1
87	6.3	2.3	4.4	1.3	0
86	6.7	3.1	4.7	1.5	0
67	5.8	2.7	4.1	1.0	0
135	7.7	3.0	6.1	2.3	2
54	6.5	2.8	4.6	1.5	0
83	6.0	2.7	5.1	1.6	0
21	5.1	3.7	1.5	0.4	1
88	5.6	3.0	4.1	1.3	0
22	4.6	3.6	1.0	0.2	1
78	6.0	2.9	4.5	1.5	0
108	6.7	2.5	5.8	1.8	2
118	7.7	2.6	6.9	2.3	2
128	6.4	2.8	5.6	2.1	2
82	5.8	2.7	3.9	1.2	0

	SepalLength	SepalWidth	PetalLength	PetalWidth	Name
136	6.3	3.4	5.6	2.4	2
109	7.2	3.6	6.1	2.5	2
13	4.3	3.0	1.1	0.1	1
70	5.9	3.2	4.8	1.8	0
17	5.1	3.5	1.4	0.3	1
4	5.0	3.6	1.4	0.2	1
87	6.3	2.3	4.4	1.3	0
86	6.7	3.1	4.7	1.5	0
67	5.8	2.7	4.1	1.0	0
135	7.7	3.0	6.1	2.3	2
54	6.5	2.8	4.6	1.5	0
83	6.0	2.7	5.1	1.6	0
21	5.1	3.7	1.5	0.4	1
88	5.6	3.0	4.1	1.3	0
22	4.6	3.6	1.0	0.2	1
78	6.0	2.9	4.5	1.5	0
108	6.7	2.5	5.8	1.8	2
118	7.7	2.6	6.9	2.3	2
128	6.4	2.8	5.6	2.1	2
82	5.8	2.7	3.9	1.2	0

	SepalLength	SepalWidth	PetalLength	PetalWidth	Name
136	6.3	3.4	5.6	2.4	2
109	7.2	3.6	6.1	2.5	2
13	4.3	3.0	1.1	0.1	1
70	5.9	3.2	4.8	1.8	0
17	5.1	3.5	1.4	0.3	1
4	5.0	3.6	1.4	0.2	1
87	6.3	2.3	4.4	1.3	0
86	6.7	3.1	4.7	1.5	0
67	5.8	2.7	4.1	1.0	0
135	7.7	3.0	6.1	2.3	2
54	6.5	2.8	4.6	1.5	0
83	6.0	2.7	5.1	1.6	0
21	5.1	3.7	1.5	0.4	1
88	5.6	3.0	4.1	1.3	0
22	4.6	3.6	1.0	0.2	1
78	6.0	2.9	4.5	1.5	0
108	6.7	2.5	5.8	1.8	2
118	7.7	2.6	6.9	2.3	2
128	6.4	2.8	5.6	2.1	2
82	5.8	2.7	3.9	1.2	0