梯度下降法的实现（超详细）

梯度下降法可视化及应用

原创已于 2024-02-24 13:22:18 修改 · 1.5k 阅读

8 ·

CC 4.0 BY-SA版权

文章标签：

#python #机器学习 #numpy

于 2023-02-20 21:31:27 首次发布

机器学习专栏收录该内容

3 篇文章

订阅专栏

本文详细介绍了梯度下降法的概念，通过Python代码实现并可视化了该方法在寻找函数最小值过程中的应用。利用matplotlib和numpy库，动态展示了不同起始点下的收敛路径，并通过计算二分位数和四分位数选取关键点进行可视化，帮助理解梯度下降法的迭代过程。

梯度下降法（深入理解+可视化）

梯度下降法（Gradient Descent）是一种常用的数值优化方法，用于寻找一个函数的最小值。在机器学习中，梯度下降法被广泛应用于训练神经网络和其他机器学习模型。
为了更好地理解梯度下降法的原理，本文首先构造了一个凹凸不平的函数（并给出它对各自变量的梯度），然后在该函数上执行梯度下降法的迭代过程。与此同时，为了更直观地展示寻找局部最小值的过程，本文采用动态生成点的方式对梯度下降过程进行可视化（除起点和终点外，还选取了损失值最接近二分位数和上、下四分位数的点）。

可视化结果如下：

在本图中，可以通过完整代码的第115行（即 plt.pause(1) 这一行）控制是否动态地生成点，从而达到动态寻找最小值的效果。
在这里插入图片描述

具体步骤：

1.导入所需要的包

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

2.使用一个凹凸不平的函数，目的是对梯度下降的过程进行详细的可视化，同时计算函数中参数的梯度。

def loss(x1, x2):
    """Evaluate the rippled "sombrero" surface ``2 * sin(r) + 2`` at (x1, x2).

    ``r`` is the Euclidean distance of the point from the origin.  The author
    picked this bumpy function so that the convergence paths from different
    starting points are easy to observe.
    """
    radius = np.sqrt(x1 ** 2 + x2 ** 2)
    return 2 * np.sin(radius) + 2

def gradient(x1, x2):
    """Return the partial derivatives of ``2 * sin(r) + 2`` at (x1, x2).

    With ``r = sqrt(x1 ** 2 + x2 ** 2)`` the chain rule gives
    ``d/dx_i = 2 * x_i / r * cos(r)``.

    Args:
        x1: First coordinate (scalar).
        x2: Second coordinate (scalar).

    Returns:
        A tuple ``(x1_grad, x2_grad)``.  At the origin the formula divides by
        zero (the surface has a kink there), so ``(0.0, 0.0)`` is returned by
        convention.  Previously that case, and any point where ``cos(r)`` is
        exactly zero, left the results unassigned and raised
        ``UnboundLocalError``.
    """
    radius = np.sqrt(x1 ** 2 + x2 ** 2)
    if radius == 0:
        # Avoid 0 / 0; treat the non-differentiable centre as a flat point.
        return 0.0, 0.0
    cos_radius = np.cos(radius)
    # When cos(r) == 0 these expressions evaluate to 0 on their own.
    x1_grad = 2 * x1 / radius * cos_radius
    x2_grad = 2 * x2 / radius * cos_radius
    return x1_grad, x2_grad

3.梯度下降法的实现过程，具体原理如下。

在这里插入图片描述
在此将参数的变化过程，loss的变化过程，以及迭代的次数进行了保存，方便后续的可视化。对应代码实现如下：

def BGD(a, start=(2.5, 6.5), max_iter=1000000):
    """Run gradient descent on ``loss`` and record the whole trajectory.

    Args:
        a: Learning rate (step size multiplier applied to the gradient).
        start: Initial ``(x1, x2)`` point.  The default is the point the
            original script used; other starts the author experimented with
            were ``(1.5, -1.5)``, ``(6, 6)`` and ``(6, -6)``.
        max_iter: Upper bound on the number of update steps, so that a
            learning rate that never converges cannot loop forever.

    Returns:
        ``(x1_list, x2_list, loss_list, time_list)``.  All four lists have the
        same length; entry 0 describes the starting point and entry ``k``
        describes the state after ``k`` updates, so ``time_list`` is
        ``[0, 1, 2, ...]``.
    """
    change_rate = a  # learning rate
    x1, x2 = start

    print('初始参数x1= {} '.format(x1))
    print('初始参数x2= {} '.format(x2))
    end = 10 ** (-10)  # convergence threshold on the size of one step
    times = 0  # number of updates performed so far
    # Four parallel lists record how the parameters and the loss evolve.
    x1_list = [x1]
    x2_list = [x2]
    time_list = [0]
    loss_list = [loss(x1, x2)]
    while times < max_iter:
        # Evaluate both partial derivatives at the SAME point before moving,
        # so x2 is not updated with a gradient taken at the already-moved x1.
        x1_grad, x2_grad = gradient(x1, x2)
        temp1 = change_rate * x1_grad
        temp2 = change_rate * x2_grad
        x1 -= temp1
        x2 -= temp2
        # Count the step first so the recorded index matches the update number.
        times += 1
        x1_list.append(x1)
        x2_list.append(x2)
        time_list.append(times)
        loss_list.append(loss(x1, x2))
        # Stop once the step taken along both coordinates is below the threshold.
        if abs(temp1) < end and abs(temp2) < end:
            break
    return x1_list, x2_list, loss_list, time_list

4.可视化

为了达到较好的可视化效果，画点时只选取迭代过程中具有代表性的点进行可视化：除起点和终点外，再选取损失值最接近二分位数和上、下四分位数的点。
代码如下：

def LookforPoint(list1, point):
    """Return the index of the element of ``list1`` that is closest to ``point``.

    Args:
        list1: Sequence of numbers to search.
        point: Target value (e.g. a quantile of the loss values).

    Returns:
        The index of the first element with the smallest absolute difference
        from ``point``; ``0`` when ``list1`` is empty.
    """
    index1 = 0
    best_distance = None  # smallest |element - point| seen so far
    for i, value in enumerate(list1):
        distance = abs(value - point)
        # Compare and store absolute distances; storing the signed difference
        # would let one negative value block every later, closer candidate.
        if best_distance is None or distance < best_distance:
            best_distance = distance
            index1 = i
    return index1

附完整代码：

# -*-coding:utf-8-*-
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np


def loss(x1, x2):
    """Evaluate the rippled "sombrero" surface ``2 * sin(r) + 2`` at (x1, x2).

    ``r`` is the Euclidean distance of the point from the origin.  The author
    picked this bumpy function so that the convergence paths from different
    starting points are easy to observe.
    """
    radius = np.sqrt(x1 ** 2 + x2 ** 2)
    return 2 * np.sin(radius) + 2

def gradient(x1, x2):
    """Return the partial derivatives of ``2 * sin(r) + 2`` at (x1, x2).

    With ``r = sqrt(x1 ** 2 + x2 ** 2)`` the chain rule gives
    ``d/dx_i = 2 * x_i / r * cos(r)``.

    Args:
        x1: First coordinate (scalar).
        x2: Second coordinate (scalar).

    Returns:
        A tuple ``(x1_grad, x2_grad)``.  At the origin the formula divides by
        zero (the surface has a kink there), so ``(0.0, 0.0)`` is returned by
        convention.  Previously that case, and any point where ``cos(r)`` is
        exactly zero, left the results unassigned and raised
        ``UnboundLocalError``.
    """
    radius = np.sqrt(x1 ** 2 + x2 ** 2)
    if radius == 0:
        # Avoid 0 / 0; treat the non-differentiable centre as a flat point.
        return 0.0, 0.0
    cos_radius = np.cos(radius)
    # When cos(r) == 0 these expressions evaluate to 0 on their own.
    x1_grad = 2 * x1 / radius * cos_radius
    x2_grad = 2 * x2 / radius * cos_radius
    return x1_grad, x2_grad


def BGD(a, start=(2.5, 6.5), max_iter=1000000):
    """Run gradient descent on ``loss`` and record the whole trajectory.

    Args:
        a: Learning rate (step size multiplier applied to the gradient).
        start: Initial ``(x1, x2)`` point.  The default is the point the
            original script used; other starts the author experimented with
            were ``(1.5, -1.5)``, ``(6, 6)`` and ``(6, -6)``.
        max_iter: Upper bound on the number of update steps, so that a
            learning rate that never converges cannot loop forever.

    Returns:
        ``(x1_list, x2_list, loss_list, time_list)``.  All four lists have the
        same length; entry 0 describes the starting point and entry ``k``
        describes the state after ``k`` updates, so ``time_list`` is
        ``[0, 1, 2, ...]``.
    """
    change_rate = a  # learning rate
    x1, x2 = start

    print('初始参数x1= {} '.format(x1))
    print('初始参数x2= {} '.format(x2))
    end = 10 ** (-10)  # convergence threshold on the size of one step
    times = 0  # number of updates performed so far
    # Four parallel lists record how the parameters and the loss evolve.
    x1_list = [x1]
    x2_list = [x2]
    time_list = [0]
    loss_list = [loss(x1, x2)]
    while times < max_iter:
        # Evaluate both partial derivatives at the SAME point before moving,
        # so x2 is not updated with a gradient taken at the already-moved x1.
        x1_grad, x2_grad = gradient(x1, x2)
        temp1 = change_rate * x1_grad
        temp2 = change_rate * x2_grad
        x1 -= temp1
        x2 -= temp2
        # Count the step first so the recorded index matches the update number.
        times += 1
        x1_list.append(x1)
        x2_list.append(x2)
        time_list.append(times)
        loss_list.append(loss(x1, x2))
        # Stop once the step taken along both coordinates is below the threshold.
        if abs(temp1) < end and abs(temp2) < end:
            break
    return x1_list, x2_list, loss_list, time_list


def LookforPoint(list1, point):
    """Return the index of the element of ``list1`` that is closest to ``point``.

    Args:
        list1: Sequence of numbers to search.
        point: Target value (e.g. a quantile of the loss values).

    Returns:
        The index of the first element with the smallest absolute difference
        from ``point``; ``0`` when ``list1`` is empty.
    """
    index1 = 0
    best_distance = None  # smallest |element - point| seen so far
    for i, value in enumerate(list1):
        distance = abs(value - point)
        # Compare and store absolute distances; storing the signed difference
        # would let one negative value block every later, closer candidate.
        if best_distance is None or distance < best_distance:
            best_distance = distance
            index1 = i
    return index1

if __name__ == '__main__':
    # Configure a CJK-capable font before any figure is created, so that the
    # Chinese labels of the first (2D) figure are rendered with it as well.
    plt.rcParams['font.sans-serif'] = 'SimHei'
    plt.rcParams['axes.unicode_minus'] = False

    # Optional experiment: compare the loss curves of several learning rates.
    # change_rate=[0.1,0.02,0.01]
    # for j in range(len(change_rate)):
    #     x1_list, x2_list, loss_list, time_list = BGD(change_rate[j])
    #     plt.plot(time_list, loss_list, label='change_rate='+str(change_rate[j]))
    #     plt.legend()
    # plt.show()
    a = 0.01  # learning rate
    x1_list, x2_list, loss_list, time_list = BGD(a)
    # The recorded lists include the starting point, so the number of updates
    # actually performed is one less than their length.
    print('共迭代{}次'.format(len(time_list) - 1))
    print('迭代后的x1={}'.format(x1_list[-1]))
    print('迭代后的x2={}'.format(x2_list[-1]))

    # Figure 1: x1 and the loss value against the iteration index.
    plt.xlabel('迭代次数')
    plt.ylabel('变化情况')
    plt.plot(time_list, x1_list, label=u'x1参数变化')
    plt.plot(time_list, loss_list, label=u"loss函数变化")
    plt.legend()  # legend identifying the two curves

    # Figure 2: the loss surface in 3D.
    fig = plt.figure()
    # Axes3D(fig) no longer attaches the axes to the figure on recent
    # matplotlib releases; add_subplot with a 3d projection works on old and
    # new versions alike.
    ax = fig.add_subplot(111, projection='3d')
    grid_x1, grid_x2 = np.meshgrid(np.arange(-8, 8, 0.1), np.arange(-8, 8, 0.1))
    surface = np.array(loss(grid_x1, grid_x2))
    # alpha=0.5 keeps the surface translucent so the path markers stand out.
    ax.plot_surface(grid_x1, grid_x2, surface, cmap='rainbow', alpha=0.5)
    ax.view_init(60, -40)  # viewing angle (elevation, azimuth)

    # Pick representative points along the descent path.
    path_x1, path_x2 = np.array(x1_list), np.array(x2_list)
    length = len(path_x1)

    num = loss_list[0] - loss_list[length - 1]  # total drop: first loss minus last loss
    num1 = num / 2 + loss_list[length - 1]  # halfway between final and initial loss
    num2 = num1 - num / 4  # one quarter of the drop above the final loss
    num3 = num1 + num / 4  # three quarters of the drop above the final loss
    point1 = LookforPoint(loss_list, num1)  # index whose loss is closest to num1
    point2 = LookforPoint(loss_list, num2)  # index whose loss is closest to num2
    point3 = LookforPoint(loss_list, num3)  # index whose loss is closest to num3

    # Start, the three intermediate points and the end, in path order.  A set
    # removes duplicates in case several targets map to the same index.
    key_points = sorted({0, point1, point2, point3, length - 1})
    for label, i in enumerate(key_points, start=1):
        ax.scatter(path_x1[i], path_x2[i], loss_list[i],
                   marker='*', color='blue', s=50, alpha=1)
        ax.text(path_x1[i], path_x2[i], loss_list[i], str(label))  # number the point
        plt.pause(1)  # pause so the points appear one after another

    res = loss_list[-1]  # loss value at the final point of the path
    print('局部最优解是 {} '.format(res))
    ax.set_xlabel('x1')
    ax.set_ylabel('x2')
    ax.set_zlabel('f(x1,x2)')
    plt.show()