1. 算法思想
寻找使计算误差最小的拟合曲线,可基于梯度下降算法对最小二乘形式的损失函数进行优化,最终获得模型回归系数(w、b)。
优点:计算简单;
缺点:不能拟合非线性数据;
2. 代码实现
1. 采用的数据集是UCI鲍鱼年龄预测数据集
2. 损失函数是均方差损失,优化算法是随机梯度下降算法。
import numpy as np
import matplotlib.pyplot as plt
def load_dataset(file_path):
    """Read a tab-separated numeric dataset from *file_path*.

    Every column except the last is a feature; the last column is the
    regression target. Returns (features, targets) as NumPy arrays.
    """
    features, targets = [], []
    with open(file_path, 'r') as file:
        for raw_line in file:
            values = [float(field) for field in raw_line.strip().split('\t')]
            features.append(values[:-1])
            targets.append(values[-1])
    return np.array(features), np.array(targets)
# Min-max normalization over the whole array (global min/max, not per column).
def dataset_norm(data):
    """Scale *data* linearly so its global minimum maps to 0 and maximum to 1."""
    lo, hi = np.min(data), np.max(data)
    return (data - lo) / (hi - lo)
# Split the dataset into a leading training part and a trailing validation part.
def split_train_test(x, y, ration=0.8):
    """Split (x, y) sequentially into train/validation portions.

    Parameters
    ----------
    x, y : np.ndarray
        Features and targets, aligned on the first axis.
    ration : float
        Fraction of samples assigned to the training set (default 0.8).

    Returns (x_train, y_train, x_val, y_val).
    """
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int does exactly the same truncation here.
    split_idx = int(x.shape[0] * ration)
    x_train, y_train = x[:split_idx], y[:split_idx]
    x_val, y_val = x[split_idx:], y[split_idx:]
    return x_train, y_train, x_val, y_val
# Per-sample update terms for w and b (negative gradient of the half-MSE loss).
def delta_w_b(x, y, pred_y):
    """Return (delta_w, delta_b) for one sample given its prediction."""
    residual = y - pred_y
    return residual * x, residual
# Half squared error for a single sample.
def cal_loss(y, pred_y):
    """Return 0.5 * (y - pred_y)^2."""
    diff = y - pred_y
    return diff * diff / 2
# Iteratively fit the linear model with per-sample (stochastic) gradient steps.
def train_model(epoches, alpha, x_train, y_train, w, b):
    """Train a linear regressor y = w·x + b by SGD on half-MSE loss.

    Parameters
    ----------
    epoches : int
        Number of full passes over the training set.
    alpha : float
        Learning rate.
    x_train, y_train : np.ndarray
        Training features (n_samples, n_features) and targets (n_samples,).
    w, b : np.ndarray
        Initial weight row vector and bias.

    Returns (w, b, loss) where loss holds the mean training loss per epoch.
    """
    # Previously this was read from a global set in __main__, which made the
    # function unusable on its own; derive it from the input instead.
    num_train = x_train.shape[0]
    loss = []
    for epoch in range(epoches):
        sum_loss = 0.0
        for idx in range(num_train):
            pred_y = np.dot(w, x_train[idx]) + b
            delta_w, delta_b = delta_w_b(x_train[idx], y_train[idx], pred_y)
            w = w + alpha * delta_w
            b = b + alpha * delta_b
            sum_loss += cal_loss(y_train[idx], pred_y)
        loss.append(sum_loss / num_train)
        print("当前迭代过程的训练损失: ", sum_loss / num_train)
    return w, b, loss
# Evaluate the trained model on the held-out set.
def val_model(x_val, y_val, w, b):
    """Return the list of signed errors (prediction - target) on x_val.

    Expects w with shape (1, n_features), so predictions come out with
    shape (1, n_samples).
    """
    predictions = np.dot(w, x_val.T) + b
    print("测试集预测结果的维度:", predictions.shape)
    sample_count = predictions.shape[1]
    flat_preds = predictions[0]
    return [flat_preds[i] - y_val[i] for i in range(sample_count)]
# Plot the loss sequence against its iteration index.
def view_loss(loss):
    """Draw and show a simple line plot of *loss* versus iteration number."""
    iterations = list(range(len(loss)))
    plt.plot(iterations, loss)
    plt.show()
if __name__ == '__main__':
    file_path = "abalone.txt"
    x, y = load_dataset(file_path)
    print("数据集维度:", x.shape, y.shape)
    # Scale both features and targets into [0, 1] before training.
    x = dataset_norm(x)
    y = dataset_norm(y)
    x_train, y_train, x_val, y_val = split_train_test(x, y)
    num_train = x_train.shape[0]
    print("训练集维度:", x_train.shape, "测试集维度:", x_val.shape)
    # 初始化权重和bias
    np.random.seed(1)
    # Weight width follows the data (abalone has 8 features) instead of a
    # hard-coded 8, so the script also works on other feature counts.
    w = np.random.random((1, x_train.shape[1]))
    b = np.random.random((1))
    print("权重和bias的维度:", w.shape, b.shape)
    w, b, train_loss = train_model(300, 0.01, x_train, y_train, w, b)
    view_loss(train_loss)
    val_loss = val_model(x_val, y_val, w, b)
    view_loss(val_loss)