Logistic regression seeks the best-fit parameters of a nonlinear function, the sigmoid. The fitting is done by an optimization algorithm: gradient ascent, which repeatedly steps in the direction of the gradient, w := w + alpha * grad f(w), and its cheaper variant, stochastic gradient ascent.
from numpy import *
# Gradient ascent optimization for logistic regression
# Open the data file and store the features and class labels separately
def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open(r'E:\机器学习\machinelearninginaction\Ch05/testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        # The first two values on each line, X1 and X2, are the two features;
        # the prediction function is f(x) = wx + b.
        # For convenience, set X0 = 1 so the bias b is absorbed into the
        # weight vector: [1.0, float(lineArr[0]), float(lineArr[1])]
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
# Sigmoid function
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))
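Note that exp(-inX) overflows float64 for inputs below about -710, and NumPy emits a RuntimeWarning. A minimal numerically safer variant (my own sketch, not part of the book's code; the clipping range is an assumption that keeps exp() inside float64 limits):
# Clip the argument so exp() never overflows; identical to sigmoid elsewhere
def sigmoidStable(inX):
    return 1.0 / (1 + exp(-clip(inX, -500, 500)))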
# Find the optimal parameters, i.e. the best-fit weights
def gradAscent(dataMatIn, classLabels):
    # With the book's dataset the feature matrix is 100x3; the labels come in
    # as a row vector, so transpose them into a 100x1 column vector
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    # Matrix dimensions
    m, n = shape(dataMatrix)
    # Algorithm parameters: step size and number of iterations
    alpha = 0.001
    maxCycles = 500
    weights = ones((n, 1))
    for k in range(maxCycles):
        # h is a 100x1 matrix of values between 0 and 1
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h
        # Step in the gradient direction of the log-likelihood: X^T * (y - h)
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights
# Quick test of the functions above:
# dataArr, labelArr = loadDataSet()
# print(gradAscent(dataArr, labelArr))
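A quick sanity check on a hand-made, linearly separable toy set (the four samples below are hypothetical, not from testSet.txt); the learned weights should classify all of them correctly:
# toyData = [[1.0, 2.0, 3.0], [1.0, 2.5, 3.5], [1.0, -2.0, -3.0], [1.0, -2.5, -3.5]]
# toyLabels = [1, 1, 0, 0]
# w = gradAscent(toyData, toyLabels)
# print([1 if sigmoid(x) > 0.5 else 0 for x in (mat(toyData) * w).flat])  # -> [1, 1, 0, 0]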
# Plot the decision boundary
def plotBestFit(weights):
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # marker='s' draws the class-1 points as squares
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # Start, stop and step of the x values for the boundary line
    x = arange(-3.0, 3.0, 0.1)
    # Setting w0*x0 + w1*x1 + w2*x2 = 0 and solving for x2 gives the boundary
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
# dataArr, labelArr = loadDataSet()
# w = gradAscent(dataArr, labelArr)
# getA() converts the weight matrix to an ndarray before plotting
# plotBestFit(w.getA())
# Stochastic gradient ascent
# Unlike batch gradient ascent, stochastic gradient ascent updates the weights
# with one sample at a time, which makes it an online learning algorithm
def stocGradAscent(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            # alpha decreases with each iteration, damping the oscillation of
            # the weights; the 0.01 constant keeps it from ever reaching 0,
            # so later samples still have some influence
            alpha = 4 / (1.0 + i + j) + 0.01
            # Pick a random sample to update the weights, which reduces
            # periodic fluctuations in the coefficients
            randIndex = int(random.uniform(0, len(dataIndex)))
            # Map the position in dataIndex back to the sample row, so each
            # sample is used at most once per pass (fixes an indexing bug
            # where dataMatrix was indexed by the position instead)
            sampleIndex = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del dataIndex[randIndex]
    return weights
# dataArr, labelArr = loadDataSet()
# w = stocGradAscent(array(dataArr), labelArr)
# plotBestFit(w)
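Since each update uses a single sample, the trained weights can also be refreshed online when a new labeled sample arrives, without retraining on the whole set. A minimal sketch (onlineUpdate and its sample are hypothetical; a fixed small alpha is assumed):
# One stochastic step for a single new sample
def onlineUpdate(weights, newSample, newLabel, alpha=0.01):
    h = sigmoid(sum(newSample * weights))
    error = newLabel - h
    return weights + alpha * error * newSample
# w = onlineUpdate(w, array([1.0, 0.5, -1.2]), 1)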
Example: predicting the mortality of horses with colic from their symptoms
# Predict whether a sick horse will survive, using regression for classification
# Compute the sigmoid value and return the class label
from chapter5.LogRegres import sigmoid
from chapter5.LogRegres import stocGradAscent
from numpy import *
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1
    else:
        return 0
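For example (the weights and features below are made up for illustration): sum(inX * weights) = 0.5 + 2.0 - 0.3 = 2.2, and sigmoid(2.2) ≈ 0.90 > 0.5, so the vector is classified as 1:
# print(classifyVector(array([1.0, 2.0, -1.0]), array([0.5, 1.0, 0.3])))  # -> 1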
# Open the training and test sets and parse the data
def colicTest():
    frTrain = open(r'E:\机器学习\machinelearninginaction\Ch05\horseColicTraining.txt')
    frTest = open(r'E:\机器学习\machinelearninginaction\Ch05\horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split("\t")
        lineArr = []
        # The first 21 columns are features; column 22 is the class label
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent(array(trainingSet), trainingLabels, 500)
    errorCount = 0.0
    numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split("\t")
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print("error rate:", errorRate)
    return errorRate
# Call colicTest() several times and average the results
def multiTest():
    numTest = 10
    errorSum = 0.0
    for k in range(numTest):
        errorSum += colicTest()
    errorRate = errorSum / float(numTest)
    print("average error rate over", numTest, "runs:", errorRate)
# multiTest()
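Because stocGradAscent picks samples at random, every run of multiTest() prints a slightly different average; seeding NumPy's random generator first (a sketch, not in the book) makes the runs reproducible:
# random.seed(42)  # with `from numpy import *`, random here is numpy.random
# multiTest()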