The first post covered the logistic regression algorithm in detail; if you haven't read it, follow the link or search for it on my blog.
The second post walked through a Python implementation of logistic regression with a linear decision boundary, using a worked example so that you can build a logistic regression classifier yourself. Again, follow the link or search for it on my blog if you missed it.
Today, building on the theory and practice of the previous two posts, we implement logistic regression for the case where the decision boundary is a nonlinear function.
(1) First, prepare the dataset, named data2.txt. The dataset is listed below so you can save it to a txt file yourself, or download it from my github. Note that the loading code below assumes the three columns of data2.txt are comma-separated; the listing here is simply the array as printed by NumPy, so if you copy it by hand, save the values comma-separated.
[[ 0.051267 0.69956 1. ]
[-0.092742 0.68494 1. ]
[-0.21371 0.69225 1. ]
[-0.375 0.50219 1. ]
[-0.51325 0.46564 1. ]
[-0.52477 0.2098 1. ]
[-0.39804 0.034357 1. ]
[-0.30588 -0.19225 1. ]
[ 0.016705 -0.40424 1. ]
[ 0.13191 -0.51389 1. ]
[ 0.38537 -0.56506 1. ]
[ 0.52938 -0.5212 1. ]
[ 0.63882 -0.24342 1. ]
[ 0.73675 -0.18494 1. ]
[ 0.54666 0.48757 1. ]
[ 0.322 0.5826 1. ]
[ 0.16647 0.53874 1. ]
[-0.046659 0.81652 1. ]
[-0.17339 0.69956 1. ]
[-0.47869 0.63377 1. ]
[-0.60541 0.59722 1. ]
[-0.62846 0.33406 1. ]
[-0.59389 0.005117 1. ]
[-0.42108 -0.27266 1. ]
[-0.11578 -0.39693 1. ]
[ 0.20104 -0.60161 1. ]
[ 0.46601 -0.53582 1. ]
[ 0.67339 -0.53582 1. ]
[-0.13882 0.54605 1. ]
[-0.29435 0.77997 1. ]
[-0.26555 0.96272 1. ]
[-0.16187 0.8019 1. ]
[-0.17339 0.64839 1. ]
[-0.28283 0.47295 1. ]
[-0.36348 0.31213 1. ]
[-0.30012 0.027047 1. ]
[-0.23675 -0.21418 1. ]
[-0.06394 -0.18494 1. ]
[ 0.062788 -0.16301 1. ]
[ 0.22984 -0.41155 1. ]
[ 0.2932 -0.2288 1. ]
[ 0.48329 -0.18494 1. ]
[ 0.64459 -0.14108 1. ]
[ 0.46025 0.012427 1. ]
[ 0.6273 0.15863 1. ]
[ 0.57546 0.26827 1. ]
[ 0.72523 0.44371 1. ]
[ 0.22408 0.52412 1. ]
[ 0.44297 0.67032 1. ]
[ 0.322 0.69225 1. ]
[ 0.13767 0.57529 1. ]
[-0.0063364 0.39985 1. ]
[-0.092742 0.55336 1. ]
[-0.20795 0.35599 1. ]
[-0.20795 0.17325 1. ]
[-0.43836 0.21711 1. ]
[-0.21947 -0.016813 1. ]
[-0.13882 -0.27266 1. ]
[ 0.18376 0.93348 0. ]
[ 0.22408 0.77997 0. ]
[ 0.29896 0.61915 0. ]
[ 0.50634 0.75804 0. ]
[ 0.61578 0.7288 0. ]
[ 0.60426 0.59722 0. ]
[ 0.76555 0.50219 0. ]
[ 0.92684 0.3633 0. ]
[ 0.82316 0.27558 0. ]
[ 0.96141 0.085526 0. ]
[ 0.93836 0.012427 0. ]
[ 0.86348 -0.082602 0. ]
[ 0.89804 -0.20687 0. ]
[ 0.85196 -0.36769 0. ]
[ 0.82892 -0.5212 0. ]
[ 0.79435 -0.55775 0. ]
[ 0.59274 -0.7405 0. ]
[ 0.51786 -0.5943 0. ]
[ 0.46601 -0.41886 0. ]
[ 0.35081 -0.57968 0. ]
[ 0.28744 -0.76974 0. ]
[ 0.085829 -0.75512 0. ]
[ 0.14919 -0.57968 0. ]
[-0.13306 -0.4481 0. ]
[-0.40956 -0.41155 0. ]
[-0.39228 -0.25804 0. ]
[-0.74366 -0.25804 0. ]
[-0.69758 0.041667 0. ]
[-0.75518 0.2902 0. ]
[-0.69758 0.68494 0. ]
[-0.4038 0.70687 0. ]
[-0.38076 0.91886 0. ]
[-0.50749 0.90424 0. ]
[-0.54781 0.70687 0. ]
[ 0.10311 0.77997 0. ]
[ 0.057028 0.91886 0. ]
[-0.10426 0.99196 0. ]
[-0.081221 1.1089 0. ]
[ 0.28744 1.087 0. ]
[ 0.39689 0.82383 0. ]
[ 0.63882 0.88962 0. ]
[ 0.82316 0.66301 0. ]
[ 0.67339 0.64108 0. ]
[ 1.0709 0.10015 0. ]
[-0.046659 -0.57968 0. ]
[-0.23675 -0.63816 0. ]
[-0.15035 -0.36769 0. ]
[-0.49021 -0.3019 0. ]
[-0.46717 -0.13377 0. ]
[-0.28859 -0.060673 0. ]
[-0.61118 -0.067982 0. ]
[-0.66302 -0.21418 0. ]
[-0.59965 -0.41886 0. ]
[-0.72638 -0.082602 0. ]
[-0.83007 0.31213 0. ]
[-0.72062 0.53874 0. ]
[-0.59389 0.49488 0. ]
[-0.48445 0.99927 0. ]
[-0.0063364 0.99927 0. ]
[ 0.63265 -0.030612 0. ]]
(2) With the dataset ready, let's look at how it is distributed in two-dimensional coordinates.
from numpy import loadtxt, where
from pylab import scatter, show, legend, xlabel, ylabel

# load the dataset
data = loadtxt("F:/PythonCode/LogisticRegression/data2.txt", delimiter=",")
# the data is a 2-D array of shape 118*3
print(data)
X = data[:, 0:2]
# X holds the features, shape: 118*2
# print(X.shape)
y = data[:, 2]
# y holds the labels, a vector of length 118
# print(y)
pos = where(y == 1)
# pos holds the indices of the samples whose label equals 1
# print(pos)
neg = where(y == 0)
# neg holds the indices of the samples whose label equals 0
# print(neg)
# scatter(x-coordinates, y-coordinates, marker='o' draws the samples as dots, c='b' draws them in blue)
scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
# in the plot, 'o' marks Pass (y == 1) and 'x' marks Fail (y == 0)
legend(["y==1", "y==0"])
show()
The plot produced by the Python code above is shown below:
(3) Training the regularized logistic regression model.
Let's first look at the cost function and gradient after the regularization term is added.
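The original post showed these as an image; written out, the regularized cost function and its gradient (exactly what costFunctionReg and compute_grad below compute, with the bias term left unregularized) are:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\Big[y^{(i)}\log h_\theta(x^{(i)}) + \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$

$$\frac{\partial J(\theta)}{\partial \theta_0} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)x_0^{(i)}, \qquad \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \ge 1)$$

where $h_\theta(x) = 1/(1 + e^{-\theta^T x})$ is the sigmoid function, $m$ is the number of samples, and $\theta_0$ is not included in the regularization term.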
The code for training the model and computing its accuracy:
from numpy import *
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import minimize

filename = "F:/PythonCode/LogisticRegression/data2.txt"

def loadDataSet():
    # load the dataset
    data = loadtxt(filename, delimiter=",")
    # split into features X and labels y
    y = np.c_[data[:, 2]]
    X = data[:, 0:2]
    return data, X, y

def map_feature(x1, x2):
    '''
    Maps the two input features to polynomial features.
    Returns a new feature array with more features of
    X1, X2, X1 ** 2, X2 ** 2, X1*X2, X1*X2 ** 2, etc...
    '''
    x1.shape = (x1.size, 1)
    x2.shape = (x2.size, 1)
    degree = 6
    mapped_fea = ones(shape=(x1[:, 0].size, 1))
    for i in range(1, degree + 1):
        for j in range(i + 1):
            r = (x1 ** (i - j)) * (x2 ** j)
            mapped_fea = append(mapped_fea, r, axis=1)
    return mapped_fea

# compute the sigmoid function
def sigmoid(X):
    '''Compute sigmoid function'''
    den = 1.0 + exp(-1.0 * X)
    gz = 1.0 / den
    return gz

# regularized cost function
def costFunctionReg(theta, X, y, l):
    m = y.size
    h = sigmoid(X.dot(theta))
    J = -1.0 * (1.0 / m) * (np.log(h).T.dot(y) + np.log(1 - h).T.dot(1 - y)) + (l / (2.0 * m)) * np.sum(np.square(theta[1:]))
    if np.isnan(J[0]):
        return np.inf
    return J[0]

# regularized gradient
def compute_grad(theta, X, y, l):
    m = y.size
    h = sigmoid(X.dot(theta.reshape(-1, 1)))
    # the bias parameter theta[0] is not regularized
    grad = (1.0 / m) * X.T.dot(h - y) + (l / m) * np.r_[[[0]], theta[1:].reshape(-1, 1)]
    return grad.flatten()

# minimize the regularized cost function
def gradAscent(XX, y, l):
    initial_theta = np.zeros(XX.shape[1])
    cost = costFunctionReg(initial_theta, XX, y, l)
    print('Cost: \n', cost)
    # optimize costFunctionReg
    res2 = minimize(costFunctionReg, initial_theta, args=(XX, y, l), jac=compute_grad, options={'maxiter': 3000})
    return res2

def plotBestFit(data, res2, X, accuracy, l, axes):  # plot the final classification result
    # scatter plot of X and y
    plotData(data, 'Microchip Test 1', 'Microchip Test 2', 'y = 1', 'y = 0', axes=None)
    # plot the decision boundary
    x1_min, x1_max = X[:, 0].min(), X[:, 0].max()
    x2_min, x2_max = X[:, 1].min(), X[:, 1].max()
    xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
    h = sigmoid(map_feature(xx1.ravel(), xx2.ravel()).dot(res2.x))
    h = h.reshape(xx1.shape)
    if axes is None:
        axes = plt.gca()
    axes.contour(xx1, xx2, h, [0.5], linewidths=1, colors='g')
    axes.set_title('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=2), l))
    plt.show()

def plotData(data, label_x, label_y, label_pos, label_neg, axes):
    # boolean masks for the positive and negative samples
    neg = data[:, 2] == 0
    pos = data[:, 2] == 1
    if axes is None:
        axes = plt.gca()
    axes.scatter(data[pos][:, 0], data[pos][:, 1], marker='+', c='k', s=60, linewidth=2, label=label_pos)
    axes.scatter(data[neg][:, 0], data[neg][:, 1], c='y', s=60, label=label_neg)
    axes.set_xlabel(label_x)
    axes.set_ylabel(label_y)
    axes.legend(frameon=True, fancybox=True)

def predict(theta, X):
    '''Predict whether the label
    is 0 or 1 using learned logistic
    regression parameters'''
    m, n = X.shape
    p = zeros(shape=(m, 1))
    h = sigmoid(X.dot(theta.T))
    for it in range(0, h.shape[0]):
        if h[it] > 0.5:
            p[it, 0] = 1
        else:
            p[it, 0] = 0
    return p

def main():
    data, X, y = loadDataSet()
    # map the two given features to polynomial features
    mapped_fea = map_feature(X[:, 0], X[:, 1])
    # Decision boundary: let's see what happens when the regularization coefficient lambda is too large or too small
    # Lambda = 0   : no regularization at all, so the model overfits
    # Lambda = 1   : the right way to do it
    # Lambda = 100 : the regularization is too aggressive, so essentially no decision boundary is fitted (underfitting)
    l = 1
    res = gradAscent(mapped_fea, y, l)
    print(res)
    # accuracy
    accuracy = y[where(predict(res.x, mapped_fea) == y)].size / float(y.size) * 100.0
    # plot the decision boundary
    plotBestFit(data, res, X, accuracy, l, axes=None)

if __name__ == '__main__':
    main()
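To get a feel for what map_feature produces, here is a minimal sanity check (a sketch, to be run in the same file after map_feature is defined; the expected shapes follow from the degree-6 mapping above, which yields 28 columns including the bias column of ones):

import numpy as np

# take the first two samples of the dataset
x1 = np.array([0.051267, -0.092742])   # Microchip Test 1 values
x2 = np.array([0.69956, 0.68494])      # Microchip Test 2 values
mapped = map_feature(x1, x2)
print(mapped.shape)    # (2, 28): bias column plus all monomials x1^i * x2^j with 1 <= i + j <= 6
print(mapped[0, :3])   # [1.0, 0.051267, 0.69956]: the bias term, then x1 and x2 themselves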
Here the regularization parameter is λ = 1; let's look at the decision boundary:
Now let's look at λ = 0, i.e. the case where the cost function has no regularization term and the model overfits:
Finally, let's look at λ = 100, i.e. the case where the penalty on the learned parameters in the cost function is too large, which leads to underfitting.
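If you want to reproduce all three cases in one figure, a small variation of main() is to loop over the three lambda values and draw each boundary in its own subplot. The sketch below reuses the functions defined above; the helper name compare_lambdas is not part of the original script, it is only illustrative:

def compare_lambdas():
    # train with several regularization strengths and plot each decision boundary side by side
    data, X, y = loadDataSet()
    mapped_fea = map_feature(X[:, 0], X[:, 1])
    fig, axes_arr = plt.subplots(1, 3, sharey=True, figsize=(15, 5))
    for ax, l in zip(axes_arr, [0.0, 1.0, 100.0]):
        res = gradAscent(mapped_fea, y, l)
        accuracy = y[where(predict(res.x, mapped_fea) == y)].size / float(y.size) * 100.0
        plotData(data, 'Microchip Test 1', 'Microchip Test 2', 'y = 1', 'y = 0', axes=ax)
        x1_min, x1_max = X[:, 0].min(), X[:, 0].max()
        x2_min, x2_max = X[:, 1].min(), X[:, 1].max()
        xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
        h = sigmoid(map_feature(xx1.ravel(), xx2.ravel()).dot(res.x)).reshape(xx1.shape)
        ax.contour(xx1, xx2, h, [0.5], linewidths=1, colors='g')
        ax.set_title('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=2), l))
    plt.show()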
This concludes the basics and hands-on practice of the logistic regression series. The corresponding code and dataset can be downloaded from my github.
GitHub code and dataset: https://github.com/Microstrong0305/machine_learning/tree/master/noLineLogisticRegression
Reference:
http://blog.youkuaiyun.com/han_xiaoyang/article/details/49123419
