0. Logistic regression concepts
- Purpose: solving classification problems.
- In practice: a linear combination of the features is passed through the sigmoid function, which maps it into (0, 1); the result is interpreted as the probability that the sample belongs to class 1. If p >= 0.5 the sample is classified as class 1, otherwise as class 0.
- Logistic regression is a generalized linear model: it effectively takes the output of a linear regression and squashes it into (0, 1) with the sigmoid, then classifies by thresholding (see the sketch below).
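A minimal sketch of that mapping (NumPy only; note that sigmoid(0) = 0.5, so the probability threshold p >= 0.5 is equivalent to z >= 0):

import numpy as np

def sigmoid(z):
    # maps any real z into (0, 1); sigmoid(0) == 0.5
    return 1.0 / (1.0 + np.exp(-z))

z = np.array([-3.0, 0.0, 3.0])
p = sigmoid(z)                    # [0.047..., 0.5, 0.952...]
labels = (p >= 0.5).astype(int)   # [0, 1, 1]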

1. Load the iris dataset
import numpy as np
import pandas as pd

data = pd.read_csv("data/iris.csv")
data.drop_duplicates(inplace=True)   # remove duplicate rows
# encode the species names as integer class labels
data["Name"] = data["Name"].map({"Iris-versicolor": 0, "Iris-setosa": 1, "Iris-virginica": 2})
| | SepalLength | SepalWidth | PetalLength | PetalWidth | Name |
|---|---|---|---|---|---|
| 136 | 6.3 | 3.4 | 5.6 | 2.4 | 2 |
| 109 | 7.2 | 3.6 | 6.1 | 2.5 | 2 |
| 13 | 4.3 | 3.0 | 1.1 | 0.1 | 1 |
| 70 | 5.9 | 3.2 | 4.8 | 1.8 | 0 |
| 17 | 5.1 | 3.5 | 1.4 | 0.3 | 1 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 1 |
| 87 | 6.3 | 2.3 | 4.4 | 1.3 | 0 |
| 86 | 6.7 | 3.1 | 4.7 | 1.5 | 0 |
| 67 | 5.8 | 2.7 | 4.1 | 1.0 | 0 |
| 135 | 7.7 | 3.0 | 6.1 | 2.3 | 2 |
| 54 | 6.5 | 2.8 | 4.6 | 1.5 | 0 |
| 83 | 6.0 | 2.7 | 5.1 | 1.6 | 0 |
| 21 | 5.1 | 3.7 | 1.5 | 0.4 | 1 |
| 88 | 5.6 | 3.0 | 4.1 | 1.3 | 0 |
| 22 | 4.6 | 3.6 | 1.0 | 0.2 | 1 |
| 78 | 6.0 | 2.9 | 4.5 | 1.5 | 0 |
| 108 | 6.7 | 2.5 | 5.8 | 1.8 | 2 |
| 118 | 7.7 | 2.6 | 6.9 | 2.3 | 2 |
| 128 | 6.4 | 2.8 | 5.6 | 2.1 | 2 |
| 82 | 5.8 | 2.7 | 3.9 | 1.2 | 0 |
# keep only classes 0 and 1 (drop Iris-virginica) for binary classification
data = data[data["Name"] != 2]
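As a quick sanity check (this step is not in the original), confirm that only the two classes remain and that they are roughly balanced:

data["Name"].value_counts()   # expect roughly 50 rows per class after deduplication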
2. The logistic regression algorithm
class LogisticRegression:
    """Logistic regression trained with batch gradient descent."""

    def __init__(self, alpha, times):
        """
        Parameters
        ----------
        alpha : float
            Learning rate.
        times : int
            Number of gradient-descent iterations.
        """
        self.alpha = alpha
        self.times = times

    def sigmoid(self, z):
        """
        Sigmoid function.

        Parameters
        ----------
        z : float or ndarray
            The linear combination z = w.T * x.

        Returns
        -------
        p : float or ndarray, in (0, 1)
            The predicted probability that the sample belongs to class 1.
            When p >= 0.5 (i.e. z >= 0) the sample is classified as
            class 1, otherwise as class 0.
        """
        return 1.0 / (1.0 + np.exp(-z))

    def fit(self, X, y):
        """Train on X of shape (n_samples, n_features) and binary labels y."""
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        # w_[0] is the bias; one weight per feature follows it
        self.w_ = np.zeros(1 + self.X.shape[1])
        self.loss_ = []
        for i in range(self.times):
            z = np.dot(self.X, self.w_[1:]) + self.w_[0]
            p = self.sigmoid(z)
            # cross-entropy loss summed over all samples
            cost = -np.sum(self.y * np.log(p) + (1 - self.y) * np.log(1 - p))
            self.loss_.append(cost)
            # gradient-descent update: the loss gradient is X.T @ (p - y)
            self.w_[0] += self.alpha * np.sum(self.y - p)
            self.w_[1:] += self.alpha * np.dot(self.X.T, self.y - p)

    def predict_proba(self, X):
        """Return the predicted probabilities of class 0 and class 1."""
        X = np.asarray(X)
        z = np.dot(X, self.w_[1:]) + self.w_[0]
        p = self.sigmoid(z).reshape(-1, 1)
        # column 0: P(class 0), column 1: P(class 1)
        return np.concatenate([1 - p, p], axis=1)

    def predict(self, X):
        """Return the predicted class label (0 or 1) for each sample."""
        return np.argmax(self.predict_proba(X), axis=1)
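The update in fit is plain batch gradient descent: the gradient of the cross-entropy loss with respect to the weights is X.T @ (p - y), so adding alpha * X.T @ (y - p) steps downhill on the loss. A small self-contained numerical check of that gradient (a sketch with made-up data, not part of the original code):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 2))
y = np.array([0.0, 1.0, 1.0, 0.0, 1.0])
w = rng.normal(size=2)

def loss(w):
    p = 1.0 / (1.0 + np.exp(-(X @ w)))
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

p = 1.0 / (1.0 + np.exp(-(X @ w)))
analytic = X.T @ (p - y)          # analytic gradient dL/dw
eps = 1e-6
numeric = np.array([(loss(w + eps * e) - loss(w - eps * e)) / (2 * eps)
                    for e in np.eye(2)])
print(np.allclose(analytic, numeric, atol=1e-5))   # True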
3. Train/test split
t1 = data[data["Name"] == 0]
t2 = data[data["Name"] == 1]
# shuffle each class before splitting
t1 = t1.sample(len(t1), random_state=666)
t2 = t2.sample(len(t2), random_state=666)
# first 40 rows of each class for training, the rest for testing
X_train = pd.concat([t1.iloc[:40, :-1], t2.iloc[:40, :-1]], axis=0)
y_train = pd.concat([t1.iloc[:40, -1], t2.iloc[:40, -1]], axis=0)
X_test = pd.concat([t1.iloc[40:, :-1], t2.iloc[40:, :-1]], axis=0)
y_test = pd.concat([t1.iloc[40:, -1], t2.iloc[40:, -1]], axis=0)
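A similar stratified split could also be done with scikit-learn (an alternative sketch, assuming scikit-learn is installed; the original does the split by hand):

from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(
    data.iloc[:, :-1], data["Name"],
    test_size=0.2, stratify=data["Name"], random_state=666)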
4. Binary classification with logistic regression
# train with learning rate 0.001 for 20 iterations
reg = LogisticRegression(alpha=0.001, times=20)
reg.fit(X_train, y_train)
reg.predict_proba(X_test)
array([[0.70002701, 0.29997299],
[0.83689443, 0.16310557],
[0.79590141, 0.20409859],
[0.76391151, 0.23608849],
[0.7577496 , 0.2422504 ],
[0.79310386, 0.20689614],
[0.74918987, 0.25081013],
[0.75201187, 0.24798813],
[0.81234813, 0.18765187],
[0.77947483, 0.22052517],
[0.30889335, 0.69110665],
[0.28541285, 0.71458715],
[0.31146719, 0.68853281],
[0.29001408, 0.70998592],
[0.33239525, 0.66760475],
[0.30123427, 0.69876573],
[0.28658295, 0.71341705],
[0.26853876, 0.73146124]])
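Each row of predict_proba is [P(class 0), P(class 1)], so every row should sum to exactly 1; a one-line check:

proba = reg.predict_proba(X_test)
print(np.allclose(proba.sum(axis=1), 1.0))   # True, since the columns are 1 - p and p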
result = reg.predict(X_test)
# accuracy: fraction of test samples classified correctly
np.sum(result == y_test) / len(y_test)
1.0
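As a cross-check (a sketch, not in the original), scikit-learn's implementation should reach the same perfect accuracy on this linearly separable pair of classes:

from sklearn.linear_model import LogisticRegression as SkLogisticRegression  # aliased to avoid clashing with our class

sk = SkLogisticRegression()
sk.fit(X_train, y_train)
print(sk.score(X_test, y_test))   # expected: 1.0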
5. Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams["font.family"] = "SimHei"       # CJK-capable font (only needed for non-ASCII labels)
mpl.rcParams["axes.unicode_minus"] = False   # render minus signs correctly with that font

plt.plot(result, "ro", ms=15, label="predicted")
plt.plot(y_test.values, "go", label="actual")
plt.title("Logistic regression")
plt.xlabel("sample index")
plt.ylabel("class")
plt.legend()
(Figure: predicted vs. actual class labels for each test sample; the two marker sets overlap exactly.)
# training loss after each of the 20 iterations
plt.plot(range(1, reg.times + 1), reg.loss_, "go-")
(Figure: training loss decreasing over the iterations.)
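The loss curve should fall monotonically; if it oscillates or grows, the learning rate alpha is too large. A quick way to compare a few (illustrative) learning rates on the same plot:

for alpha in (0.0001, 0.001, 0.01):
    m = LogisticRegression(alpha=alpha, times=20)
    m.fit(X_train, y_train)
    plt.plot(range(1, m.times + 1), m.loss_, label="alpha=%s" % alpha)
plt.xlabel("iteration")
plt.ylabel("loss")
plt.legend()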