This is my write-up of the Logistic regression material from Chapter 5 of Machine Learning in Action (《机器学习实战》), together with some of my own understanding. Afterwards I apply the same ideas to classify the watermelon dataset from Zhou Zhihua's Machine Learning (《机器学习》). (PS: I am working through both books and the Stanford machine learning lectures on NetEase Open Courses at the same time, so this post may be a bit of a mix.) I hope it helps others who are also studying machine learning. The resources have been uploaded under the name "Logistic回归笔记及代码" (Logistic regression notes and code): http://download.youkuaiyun.com/detail/qq_30091945/9770127. If you repost this, please credit: http://blog.youkuaiyun.com/qq_30091945
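For reference, both listings below implement the same update rule: gradient ascent on the log-likelihood of the logistic model. With design matrix $X$, label vector $y$, sigmoid $\sigma(z) = 1/(1+e^{-z})$, and learning rate $\alpha$, each batch iteration performs

$$w \leftarrow w + \alpha\, X^{\top}\bigl(y - \sigma(Xw)\bigr),$$

and the stochastic variant applies the same rule one randomly chosen sample at a time with a decaying $\alpha$.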
The code from Machine Learning in Action is as follows:
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
import numpy as np
import matplotlib.pyplot as plt


def sigmoid(x):
    '''
    Sigmoid function of the logistic regression model
    '''
    return 1.0 / (1 + np.exp(-x))


def loadDataSet():
    '''
    Load the data set.
    strip() removes leading/trailing whitespace such as tabs and newlines.
    '''
    dataMat = []
    labelMat = []
    f = open("C:\\Users\\Administrator\\Desktop\\machinelearninginaction\\Ch05\\testSet.txt")
    for line in f.readlines():
        lineArr = line.strip().split()
        # Prepend a constant 1.0 for the intercept term
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    f.close()
    return dataMat, labelMat


def gradAscent(dataMatIn, classLabels, alpha, maxCycles):
    """
    Batch gradient ascent.
    alpha: learning rate
    maxCycles: maximum number of iterations
    """
    # Convert the data to NumPy matrices
    dataMatrix = np.mat(dataMatIn)
    labelMat = np.mat(classLabels).transpose()
    m, n = np.shape(dataMatrix)  # m, n are the numbers of rows and columns of the data
    weights = np.ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights


def RandomGradAscent(dataMatIn, classLabels, maxCycles):
    """
    Stochastic gradient ascent: update the weights one randomly
    chosen sample at a time, with a decaying learning rate.
    """
    m, n = np.shape(dataMatIn)
    weights = np.ones(n)
    for j in range(maxCycles):
        dataIndex = list(range(m))
        for i in range(m):
            # The learning rate decays with the iteration count but never reaches 0
            alpha = 4 / (1.0 + i + j) + 0.001
            randomIndex = int(np.random.uniform(0, len(dataIndex)))
            chosen = dataIndex[randomIndex]
            h = sigmoid(sum(dataMatIn[chosen] * weights))
            error = classLabels[chosen] - h
            weights = weights + alpha * error * dataMatIn[chosen]
            # Remove the chosen sample so each one is used once per pass
            del dataIndex[randomIndex]
    return weights


def plotBestFit(weights):
    '''
    Visualize the data and plot the best-fit decision boundary.
    '''
    dataMat, labelMat = loadDataSet()
    print("The fitted coefficients are:")
    print(weights)
    dataArr = np.mat(dataMat)
    n = np.shape(dataArr)[0]
    # x1, y1 hold the samples labeled 1; x2, y2 those labeled 0
    x1 = []
    y1 = []
    x2 = []
    y2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            x1.append(dataArr[i, 1])
            y1.append(dataArr[i, 2])
        else:
            x2.append(dataArr[i, 1])
            y2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x1, y1, s=30, c='red', marker='s')
    ax.scatter(x2, y2, s=30, c='green')
    # The decision boundary is where w0 + w1*x + w2*y = 0
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y, c='blue')
    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.show()


def run_main():
    '''
    Main function.
    '''
    dataMat, labelMat = loadDataSet()
    # Classification with batch gradient ascent
    weights1 = gradAscent(dataMat, labelMat, 0.001, 1000)
    plotBestFit(weights1.getA())
    # Classification with stochastic gradient ascent
    weights2 = RandomGradAscent(np.array(dataMat), labelMat, 1000)
    plotBestFit(weights2)


if __name__ == '__main__':
    run_main()
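The listing above stops at plotting the boundary. As a minimal sketch of how the learned weights would be used on a new point (the classify helper below is my own addition, not part of the book's code), threshold the sigmoid output at 0.5:

def classify(x, weights):
    # x = [1.0, x1, x2], with the leading 1.0 for the intercept, as in loadDataSet()
    prob = sigmoid(np.sum(np.array(x) * np.asarray(weights).flatten()))
    return 1 if prob > 0.5 else 0

For example, classify([1.0, 0.5, 8.0], weights2) returns 0 or 1 depending on which side of the fitted line the point falls; np.asarray(...).flatten() lets the same helper accept either the matrix returned by gradAscent or the array returned by RandomGradAscent.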
Below is the code that applies Logistic regression to the watermelon dataset from Zhou Zhihua's Machine Learning. (PS: unfortunately, no matter how I varied the number of iterations, the lowest error rate was still close to 30%. On reflection this is hardly surprising: the watermelon dataset has only 17 samples, and I used only two of its features, density and sugar content, so a large error is unavoidable. A leave-one-out sketch that gives a less optimistic estimate follows the listing.)
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 3 14:15:43 2017
@author: Administrator
"""
import numpy as np
import matplotlib.pyplot as plt


def sigmoid(x):
    '''
    Sigmoid function
    '''
    return 1.0 / (1 + np.exp(-x))


def LoadDataSet():
    '''
    Load the watermelon data set.
    Columns 7 and 8 are density and sugar content;
    column 9 is the label ("是" marks a good melon).
    '''
    file = 'D:\\Program Files (x86)\\机器学习\\周志华机器学习\\WatermelonDataSet.txt'
    ftrain = open(file)
    trainingset = []
    labelset = []
    for line in ftrain.readlines():
        LineArr = line.strip().split(',')
        # Prepend a constant 1.0 for the intercept term
        trainingset.append([1.0, float(LineArr[7]), float(LineArr[8])])
        if LineArr[9] == "是":
            labelset.append(1.0)
        else:
            labelset.append(0.0)
    ftrain.close()
    return trainingset, labelset


def RandomGradAscent(trainingset, labelset, maxcircle):
    '''
    Stochastic gradient ascent.
    alpha is the learning rate and maxcircle the number of passes;
    trainingset is the training data, labelset the corresponding labels.
    '''
    row, col = np.shape(trainingset)
    weights = np.ones(col)
    for j in range(maxcircle):
        DataIndex = list(range(row))
        for i in range(row):
            # The learning rate decays with the iteration count but never reaches 0
            alpha = 4.0 / (i + j + 1.0) + 0.01
            randomindex = int(np.random.uniform(0, len(DataIndex)))
            chosen = DataIndex[randomindex]
            h = sigmoid(sum(trainingset[chosen] * weights))
            error = labelset[chosen] - h
            weights = weights + alpha * trainingset[chosen] * error
            # Remove the chosen sample so each one is used once per pass
            del DataIndex[randomindex]
    return weights


def plotBestFit(weights):
    '''
    Visualize the data and plot the best-fit decision boundary.
    '''
    trainingset, labelset = LoadDataSet()
    dataArr = np.mat(trainingset)
    n = np.shape(dataArr)[0]
    # x1, y1 hold the samples labeled 1; x2, y2 those labeled 0
    x1 = []
    y1 = []
    x2 = []
    y2 = []
    for i in range(n):
        if int(labelset[i]) == 1:
            x1.append(dataArr[i, 1])
            y1.append(dataArr[i, 2])
        else:
            x2.append(dataArr[i, 1])
            y2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x1, y1, s=30, c='red', marker='s')
    ax.scatter(x2, y2, s=30, c='green')
    x = np.arange(0.0, 1.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y, c='blue')
    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.show()


def ClassifyVector(X, weights):
    """
    Classify one sample: threshold the sigmoid output at 0.5.
    """
    result = sigmoid(sum(X * weights))
    return 1 if result > 0.5 else 0


def ErrorRate(trainingset, weights, labelset):
    '''
    Fraction of training samples that the weights misclassify.
    '''
    errorcount = 0
    n = np.shape(trainingset)[0]
    for i in range(n):
        if ClassifyVector(np.array(trainingset[i]), weights) != labelset[i]:
            errorcount = errorcount + 1
    return errorcount * 1.0 / n


def gradAscent(dataMatIn, classLabels, alpha, maxCycles):
    """
    Batch gradient ascent.
    alpha: learning rate
    maxCycles: maximum number of iterations
    """
    # Convert the data to NumPy matrices
    dataMatrix = np.mat(dataMatIn)
    labelMat = np.mat(classLabels).transpose()
    m, n = np.shape(dataMatrix)  # m, n are the numbers of rows and columns of the data
    weights = np.ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights


def run_main():
    """
    Main function.
    """
    trainingset, labelset = LoadDataSet()
    maxcircle1 = np.arange(150, 300, 5)
    maxcircle2 = np.arange(500, 580, 5)
    print("Classification with stochastic gradient ascent:")
    for i in maxcircle1:
        weights = RandomGradAscent(np.array(trainingset), labelset, i)
        print("The fitted coefficients are:")
        print(weights)
        errorrate = ErrorRate(trainingset, weights, labelset)
        print("With %d iterations, the error rate is %f" % (i, errorrate))
        plotBestFit(weights)
    print("Classification with batch gradient ascent:")
    for i in maxcircle2:
        weights = gradAscent(trainingset, labelset, 0.001, i)
        print("The fitted coefficients are:")
        print(weights)
        # Flatten the (n, 1) matrix to a 1-D array before elementwise use
        errorrate = ErrorRate(trainingset, weights.getA().flatten(), labelset)
        print("With %d iterations, the error rate is %f" % (i, errorrate))
        plotBestFit(weights.getA())


if __name__ == '__main__':
    run_main()
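Since the watermelon set has only 17 samples, the training-set error reported by ErrorRate above is a noisy and optimistic estimate. As a minimal leave-one-out sketch (the LeaveOneOutError helper is my own addition, not in the original code; it reuses RandomGradAscent and ClassifyVector from the listing), train on 16 samples and test on the held-out one:

def LeaveOneOutError(trainingset, labelset, maxcircle):
    # Leave-one-out: train on n-1 samples, test on the single held-out sample
    data = np.array(trainingset)
    n = len(labelset)
    errors = 0
    for i in range(n):
        mask = np.arange(n) != i
        sublabels = [labelset[k] for k in range(n) if k != i]
        weights = RandomGradAscent(data[mask], sublabels, maxcircle)
        if ClassifyVector(data[i], weights) != labelset[i]:
            errors += 1
    return errors * 1.0 / n

Averaging this over a few values of maxcircle gives a fairer picture of how much of the ~30% error comes from the tiny sample size rather than from the optimizer.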