Gradient Descent: Implementing the Adam and Nadam Algorithms

The Gradient Descent Algorithm

A Brief Look at Gradient Descent
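
In its plain form, batch gradient descent moves the parameters against the gradient of the cost with a fixed learning rate $\alpha$; for the linear-regression cost used in the code below, that is

$$\theta \leftarrow \theta - \alpha\,\nabla_\theta J(\theta), \qquad \nabla_\theta J(\theta) = \frac{1}{m} X^{\top}(X\theta - y).$$

This is exactly the commented-out plain-update line in the training loop; Adam and Nadam keep the same gradient but replace the raw update with bias-corrected moment estimates.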

Nadam
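
Nadam is Adam with a Nesterov-style lookahead applied to the first moment. Written to match the nadam_update implementation later in this post ($g_t$ is the current gradient, $\eta$ the learning rate, and $\beta_1^t$, $\beta_2^t$ the running products tracked as beta1t and beta2t in the code):

$$m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2$$
$$\hat m_t = \frac{m_t}{1-\beta_1^t}, \qquad \hat v_t = \frac{v_t}{1-\beta_2^t}$$
$$\theta_t = \theta_{t-1} - \frac{\eta}{\sqrt{\hat v_t} + \epsilon}\left(\beta_1 \hat m_t + \frac{(1-\beta_1)\,g_t}{1-\beta_1^t}\right)$$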

Algorithm Implementation

import numpy as np
import matplotlib.pyplot as plt
import time

# If NaN appears (theta usually also oscillates back and forth while the error keeps growing, which means the step size is too large), reduce the learning rate alpha appropriately

num_params=3
x1_data = np.loadtxt('x1.txt', delimiter=',')
m=len(x1_data)
x1 = np.array(x1_data[0:m]).reshape(m, 1)
x2_data = np.loadtxt('x2.txt', delimiter=',')
x2 = np.array(x2_data[0:m]).reshape(m, 1)
x3_data = np.loadtxt('x3.txt', delimiter=',')
x3 = np.array(x3_data[0:m]).reshape(m, 1)
# x0 is the constant 1 (bias column)
x = np.ones((m, 1))
# Concatenate the bias column and the three feature columns into the design matrix
x = np.concatenate((x, x1.reshape(m, 1), x2.reshape(m, 1), x3.reshape(m, 1)), axis=1)
y_data = np.loadtxt('data.txt', delimiter=',')
# Extract the target values y
y = np.array(y_data[0:m]).reshape(m, 1)
# Learning rate
alpha = 0.001
# Initial value of the model parameters theta (usually all zeros)
theta = np.zeros((num_params + 1, 1))
# Lists to record the cost value and the iteration index
costlist, num = [], []


# Cost function for linear regression: J(theta) = (1/(2m)) * (X@theta - y)^T (X@theta - y)
def costfunction(theta, x, y):
    error = np.dot(x, theta) - y
    return (1 / (2 * m)) * (np.dot(error.T, error))


# Gradient of the cost (the partial-derivative part): (1/m) * X^T (X@theta - y); this form is specific to linear regression
def gradientdescent(theta, x, y):
    error = np.dot(x, theta) - y
    return (1 / m) * (np.dot(x.T, error))

# Placeholders so the loop below reads top to bottom; the full implementations are given in the Adam and Nadam sections that follow.
def adam_update(theta, gd, m, v, t, beta1t, beta2t, lr=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    ...

def nadam_update(theta, gd, m, v, t, beta1t, beta2t, lr=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    ...

# Gradient-descent iteration loop (alpha is only used by the plain update line; Adam/Nadam take their own lr argument)
def GDworkfunction(theta, alpha, x, y):
    gd = gradientdescent(theta, x, y)
    mt=np.zeros((num_params+1, 1)).reshape(-1)
    vt=np.zeros((num_params+1,1)).reshape(-1)
    count=0
    beta1t=1
    beta2t=1
    costlow=float("inf")
    besttheta=theta
    # Iterate at most 200,000 times
    for i in range(200000):
        count+=1
        num.append(i)
# theta = theta - alpha * gd  # plain gradient descent
        (theta, mt, vt, beta1t, beta2t) = adam_update(theta=theta, gd=gd, m=mt, v=vt, t=i+1, beta1t=beta1t, beta2t=beta2t)  # Adam
        # (theta, mt, vt, beta1t, beta2t) = nadam_update(theta=theta, gd=gd, m=mt, v=vt, t=i+1, beta1t=beta1t, beta2t=beta2t)  # Nadam (swap in to compare)
        cost1 = costfunction(theta, x, y)
        costlist.append(cost1)
        if cost1<costlow:
            costlow=cost1[0][0]
            besttheta=theta
        if cost1 < 1e-27:
            break
        gd = gradientdescent(theta, x, y)
    return (besttheta,count,costlow)

start_time = time.time()
(consulttheta,count,costlow) = GDworkfunction(theta, alpha, x, y)
end_time = time.time()
print('Elapsed time: {}'.format(end_time - start_time))
print(f"Number of iterations: {count}")
print('Model at the minimum cost value: y={}+{}*x1+{}*x2+{}*x3'.format(consulttheta[0], consulttheta[1], consulttheta[2], consulttheta[3]))
cost = costfunction(consulttheta, x, y)[0][0]
print('Minimum cost value: {}'.format(costlow))
costlist1 = []
# Optionally rescale the cost values here for plotting; the range is a subjective choice, whatever is convenient (here they are copied unchanged)
for j in costlist:
    costlist1.append(j)
print(np.squeeze(np.array(costlist1)))
n_iters = len(costlist1)  # number of recorded iterations (avoid shadowing the built-in len)

Adam

def adam_update(theta, gd, m, v, t, beta1t, beta2t, lr=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Flatten everything to 1-D for elementwise updates
    m = m.reshape(-1)
    v = v.reshape(-1)
    theta = theta.reshape(-1)
    gd = gd.reshape(-1)
    # Running products beta1^t, beta2^t for the bias correction
    beta1t *= beta1
    beta2t *= beta2
    # Exponential moving averages of the gradient and the squared gradient
    m = beta1 * m + (1 - beta1) * gd
    v = beta2 * v + (1 - beta2) * (gd ** 2)
    # Bias-corrected moment estimates
    m_corrected = m / (1 - beta1t)
    v_corrected = v / (1 - beta2t)
    # rate = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)  # equivalent folded-in correction
    rate = lr
    param_update = rate * m_corrected / (np.sqrt(v_corrected) + epsilon)
    theta = theta - param_update
    theta = theta.reshape(-1, 1)  # restore the original column shape
    return (theta, m, v, beta1t, beta2t)
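
As a quick sanity check (the values here are illustrative, not taken from the training run above): starting from zero state with a unit gradient, both bias-corrected moments come out to exactly 1, so one Adam step moves every parameter by about -lr:

theta0 = np.zeros((4, 1))
gd0 = np.ones((4, 1))
theta1, m1, v1, b1t, b2t = adam_update(theta0, gd0, np.zeros(4), np.zeros(4), t=1, beta1t=1, beta2t=1)
print(theta1.ravel())  # approximately [-0.01, -0.01, -0.01, -0.01]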

Nadam

def nadam_update(theta, gd, m, v, t, beta1t, beta2t, lr=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Flatten everything to 1-D for elementwise updates
    m = m.reshape(-1)
    v = v.reshape(-1)
    theta = theta.reshape(-1)
    gd = gd.reshape(-1)
    beta1t *= beta1
    beta2t *= beta2
    m = beta1 * m + (1 - beta1) * gd
    # v = np.maximum(beta2 * v + (1 - beta2) * (gd ** 2), v)  # AMSGrad-style variant (keep the running maximum)
    v = beta2 * v + (1 - beta2) * (gd ** 2)
    m_corrected = m / (1 - beta1t)
    v_corrected = v / (1 - beta2t)
    # rate = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)  # equivalent folded-in correction
    rate = lr
    # Nesterov lookahead: blend the corrected momentum with the current gradient
    temp = beta1 * m_corrected + (1 - beta1) * gd / (1 - beta1t)
    param_update = rate * temp / (np.sqrt(v_corrected) + epsilon)
    theta = theta - param_update
    theta = theta.reshape(-1, 1)  # restore the original column shape
    return (theta, m, v, beta1t, beta2t)
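
The only real difference from adam_update is the lookahead term temp. With the same illustrative zero state and unit gradient as in the Adam check above, temp = 0.9*1 + 0.1*1/(1-0.9) = 1.9, so the first Nadam step is almost twice as large:

theta1, m1, v1, b1t, b2t = nadam_update(np.zeros((4, 1)), np.ones((4, 1)), np.zeros(4), np.zeros(4), t=1, beta1t=1, beta2t=1)
print(theta1.ravel())  # approximately [-0.019, -0.019, -0.019, -0.019]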

Data Visualization

plt.figure(num=3, figsize=(10, 6))
# The stored cost values are (1, 1) arrays, so squeeze them down to 1-D before plotting
plt.plot(num, np.squeeze(np.array(costlist1)))
plt.xlim((1, n_iters))
plt.ylim(0, 1e-20)
plt.xlabel('iterate_nums')
plt.ylabel('costvalue')
# Tick positions for the x axis
my_x_ticks = np.arange(1, n_iters, n_iters // 10)
plt.xticks(my_x_ticks)
plt.savefig('BGD.png')  # save the figure
plt.show()

Generating Simulated Data

import numpy as np
import csv

m=1000
x1=np.random.randint(-30,60,(m,))
x2=np.random.randint(-30,60,(m,))
x3=np.random.randint(-60,60,(m,))

y=8.051+3.151*x1+2.279*x2-0.788*x3
# print(x1)
# print(x2)
# print(x3)
# print(y)
print("-"*50+"x1")
for i in x1:
    print(i,end=",")
print("\n"+"-"*50+"x2")
for i in x2:
    print(i,end=",")
print("\n"+"-"*50+"x3")
for i in x3:
    print(i,end=",")
print("\n"+"-"*50+"y")
for i in y:
    print(i,end=",")
    
with open('x1.txt', 'w', encoding='utf-8') as file:
    content = ",".join(map(str, x1))
    file.write(content)
with open('x2.txt', 'w', encoding='utf-8') as file:
    content = ",".join(map(str, x2))
    file.write(content)
with open('x3.txt', 'w', encoding='utf-8') as file:
    content = ",".join(map(str, x3))
    file.write(content)
with open('data.txt', 'w', encoding='utf-8') as file:
    content = ",".join(map(str, y))
    file.write(content)

x0=np.ones((m,1))
data=np.concatenate((x0,x1.reshape(m, 1), x2.reshape(m,1), x3.reshape(m,1), y.reshape(m,1)), axis=1)
# Write the design matrix and targets to a CSV file
with open('example.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["x0","x1","x2","x3","y"])
    writer.writerows(data)  # write all rows at once
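
Because y is generated without noise, the coefficients are exactly recoverable; as a quick optional check (a sketch reusing the data array built above), the least-squares solution returns them directly:

X = data[:, :4]  # columns [x0, x1, x2, x3]
theta_exact, *_ = np.linalg.lstsq(X, y.reshape(m, 1), rcond=None)
print(theta_exact.ravel())  # approximately [8.051, 3.151, 2.279, -0.788]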

Summary

