题目:利用水库的水位变化预测大坝的出水量
代码:
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from scipy.optimize import minimize
def linear():
    """Scatter-plot the training data: water level vs. dam outflow.

    Reads the module-level ``X_train`` / ``y_train`` arrays (column 0 of
    X_train is the intercept column, column 1 the water level).

    Returns:
        (fig, ax): the matplotlib figure and axes, so callers can overlay
        fitted curves on the same plot.
    """
    fig, ax = plt.subplots()
    ax.scatter(X_train[:, 1], y_train)
    ax.set(xlabel='water level', ylabel='flowing data')
    return fig, ax
def reg_cost(theta, X, y, l):
    """Regularized (ridge) cost for linear regression.

    J(theta) = [ sum((X@theta - y)^2) + l * sum(theta[1:]^2) ] / (2m)

    Args:
        theta: parameter vector, shape (n,).
        X: design matrix with intercept column, shape (m, n).
        y: targets, shape (m, 1) or (m,) (flattened internally).
        l: regularization strength lambda (>= 0).

    Returns:
        Scalar cost.

    Raises:
        AssertionError: if the cost evaluates to NaN/inf (diagnostic guard;
        stripped under ``python -O``).
    """
    m = len(X)
    residuals = X @ theta - y.flatten()
    cost = np.sum(np.power(residuals, 2))
    # BUGFIX: the original penalty was linear (theta[1:] * l), which
    # disagrees with reg_gradient's derivative l*theta/m. The standard
    # squared L2 penalty below matches the gradient; the bias theta[0]
    # is deliberately excluded from regularization.
    reg = l * np.sum(np.power(theta[1:], 2))
    total_cost = (cost + reg) / (2 * m)
    assert not np.isnan(total_cost) and not np.isinf(total_cost), "Cost function returned NaN or inf"
    return total_cost
def reg_gradient(theta, X, y, l):
    """Gradient of the regularized linear-regression cost w.r.t. theta.

    grad = (X^T (X@theta - y) + l * theta_masked) / m, where the bias
    component theta[0] is excluded from the penalty.

    Args:
        theta: parameter vector, shape (n,).
        X: design matrix with intercept column, shape (m, n).
        y: targets, shape (m, 1) or (m,).
        l: regularization strength lambda.

    Returns:
        Gradient vector, shape (n,).
    """
    m = len(X)
    residuals = X @ theta - y.flatten()
    penalty = l * theta          # fresh array; does not alias theta
    penalty[0] = 0               # bias term is not regularized
    return (residuals @ X + penalty) / m
def train_model(X, y, l):
    """Fit the regularized linear-regression parameters.

    Minimizes ``reg_cost`` with analytic gradient ``reg_gradient`` using
    scipy's TNC (truncated Newton) optimizer, starting from all-ones theta.

    Args:
        X: design matrix with intercept column, shape (m, n).
        y: targets, shape (m, 1).
        l: regularization strength lambda.

    Returns:
        Optimized parameter vector, shape (n,).
    """
    initial_theta = np.ones(X.shape[1])
    result = minimize(
        fun=reg_cost,
        x0=initial_theta,
        args=(X, y, l),
        method='TNC',
        jac=reg_gradient,
    )
    return result.x
def learning_curve(X_train, y_train, X_val, y_val, l):
    """Plot training vs. cross-validation error as the training set grows.

    For each prefix size m = 1..len(X_train), trains on the first m
    examples and records the (regularized) cost on that prefix and on the
    full validation set, then plots both curves. High/close curves suggest
    bias; a large gap suggests variance.

    Args:
        X_train, y_train: full training design matrix / targets.
        X_val, y_val: validation design matrix / targets.
        l: regularization strength used for both training and evaluation.
    """
    sizes = range(1, len(X_train) + 1)  # +1 because range() excludes the stop value
    train_errors = []
    val_errors = []
    for m in sizes:
        theta = train_model(X_train[:m, :], y_train[:m, :], l)
        train_errors.append(reg_cost(theta, X_train[:m, :], y_train[:m, :], l))
        val_errors.append(reg_cost(theta, X_val, y_val, l))
    plt.plot(sizes, train_errors, label='training cost')
    plt.plot(sizes, val_errors, label='cv cost')
    plt.legend()
    plt.xlabel('training numbers')
    plt.ylabel('error')
    plt.show()
def feature_solution(X, power):
    """Append polynomial features of the water-level column.

    For each degree i in 2..power, appends a new column X[:, 1]**i to the
    right of the matrix (column 0 is the intercept, column 1 the raw
    feature). Returns a new array; the input is not modified.

    Args:
        X: design matrix, shape (m, 2) with intercept in column 0.
        power: highest polynomial degree to add (power < 2 is a no-op).

    Returns:
        Expanded matrix, shape (m, power + 1).
    """
    expanded = X
    for degree in range(2, power + 1):
        new_col = np.power(expanded[:, 1], degree)
        expanded = np.insert(expanded, expanded.shape[1], new_col, axis=1)
    return expanded
def get_means_and_stds(X):
    """Compute per-column mean and (population) standard deviation.

    Args:
        X: matrix, shape (m, n).

    Returns:
        (means, stds): two vectors of shape (n,).
    """
    return np.mean(X, axis=0), np.std(X, axis=0)
def feature_normalize(X, means, stds):
    """Z-score normalize every column except the intercept column.

    Needed because high-degree polynomial columns differ in scale by
    orders of magnitude. Column 0 (the all-ones intercept) is skipped.

    Args:
        X: design matrix, shape (m, n), intercept in column 0.
        means: per-column means, shape (n,) (means[0] unused).
        stds: per-column stds, shape (n,) (stds[0] unused).

    Returns:
        A NEW normalized array; the caller's X is left untouched.
        (The original wrote into X in place — a surprising side effect
        on the argument; callers only use the return value, so copying
        is backward-compatible.)
    """
    X = X.copy()
    X[:, 1:] = (X[:, 1:] - means[1:]) / stds[1:]
    return X
def plot_fit(power, theta_fit, train_means, train_stds):
    """Overlay the fitted polynomial curve on the training scatter plot.

    Builds a dense grid of water levels, expands it with the same
    polynomial features and training-set normalization used for fitting,
    and draws the model's prediction as a red dashed line.

    Args:
        power: polynomial degree used during training.
        theta_fit: fitted parameter vector.
        train_means, train_stds: training-set normalization statistics.
    """
    linear()
    grid = np.linspace(-60, 60, 100)       # 100 evenly spaced sample points
    design = grid.reshape(100, 1)          # column vector for matrix ops
    design = np.insert(design, 0, 1, axis=1)  # intercept column first
    design = feature_solution(design, power)
    design = feature_normalize(design, train_means, train_stds)
    plt.plot(grid, design @ theta_fit, 'r--')
    plt.show()
# ---------------------------------------------------------------------------
# Driver script: load the ex5 dataset, sanity-check cost/gradient, fit a
# linear model, diagnose bias/variance with learning curves, then fit a
# polynomial model and select lambda on the validation set.
# ---------------------------------------------------------------------------
data = sio.loadmat('ex5data1.mat')
print(data.keys())

# Training / validation / test splits come pre-partitioned in the .mat file.
X_train = data['X']
y_train = data['y']
print(X_train.shape)
print(y_train.shape)
X_val = data['Xval']
y_val = data['yval']
print(X_val.shape)
print(y_val.shape)
X_test = data['Xtest']
y_test = data['ytest']
print(X_test.shape)
print(y_test.shape)

# Prepend the all-ones intercept column to every design matrix.
X_train = np.insert(X_train, 0, 1, axis=1)
X_val = np.insert(X_val, 0, 1, axis=1)
X_test = np.insert(X_test, 0, 1, axis=1)

# Raw-data scatter plot.
linear()
plt.show()

# Sanity-check cost and gradient at theta = ones (expected ~303.99 and
# [-15.30, 598.25] for this dataset).
theta = np.ones(X_train.shape[1])
initial_cost = reg_cost(theta, X_train, y_train, l=1)
print(initial_cost)
initial_gradient = reg_gradient(theta, X_train, y_train, l=1)
print(initial_gradient)

# Unregularized linear fit, drawn over the scatter plot.
theta_final = train_model(X_train, y_train, l=0)
fig, ax = linear()
plt.plot(X_train[:, 1], X_train @ theta_final, c='r')
plt.show()

# Learning curve for the linear model: both errors stay high -> high bias.
learning_curve(X_train, y_train, X_val, y_val, l=0)

# Polynomial features up to degree 6, normalized with TRAINING statistics.
# (Validation/test reuse the training mean/std because all three splits
# were shuffled from the same underlying dataset.)
power = 6
X_train_poly = feature_solution(X_train, power)
X_val_poly = feature_solution(X_val, power)
X_test_poly = feature_solution(X_test, power)
train_means, train_stds = get_means_and_stds(X_train_poly)
print(train_means.shape)
print(train_stds.shape)
X_train_norm = feature_normalize(X_train_poly, train_means, train_stds)
X_val_norm = feature_normalize(X_val_poly, train_means, train_stds)
X_test_norm = feature_normalize(X_test_poly, train_means, train_stds)

# Fit and visualize the (unregularized) polynomial model.
theta_fit = train_model(X_train_norm, y_train, l=0)
plot_fit(power, theta_fit, train_means, train_stds)

# Learning curves at several lambdas to see how regularization trades
# variance (lambda=0 overfits) against bias (lambda=100 underfits).
learning_curve(X_train_norm, y_train, X_val_norm, y_val, l=0)
learning_curve(X_train_norm, y_train, X_val_norm, y_val, l=1)
learning_curve(X_train_norm, y_train, X_val_norm, y_val, l=100)

# Model selection: train with each candidate lambda, but evaluate both
# errors WITHOUT the penalty term (l=0) so they are comparable.
lambdas = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost = []
cv_cost = []
for l in lambdas:
    res = train_model(X_train_norm, y_train, l)
    training_cost.append(reg_cost(res, X_train_norm, y_train, l=0))
    cv_cost.append(reg_cost(res, X_val_norm, y_val, l=0))
plt.plot(lambdas, training_cost, label='training cost')
plt.plot(lambdas, cv_cost, label='cv cost')
plt.legend()
plt.show()

# Pick the lambda with the smallest validation error (argmin is more
# rigorous than eyeballing the plot).
min_l = lambdas[np.argmin(cv_cost)]
print(min_l)
输出:
dict_keys(['__header__', '__version__', '__globals__', 'X', 'y', 'Xtest', 'ytest', 'Xval', 'yval'])
(12, 1)
(12, 1)
(21, 1)
(21, 1)
(21, 1)
(21, 1)
303.9931922202643
[-15.30301567 598.25074417]
(7,)
(7,)
3
原始数据散点图
线性回归模拟图
训练集和验证集的代价函数误差
用多项式拟合原始数据
lambda=0时训练集和验证集的误差比较
lambda=1时训练集和验证集的误差比较
lambda=100时训练集和验证集的误差比较
用最优参数theta求得的误差
总结:在分出三种数据集后,正则化一般只在训练集上使用(训练集正则化后参数已经被筛选好了,后续直接使用即可,不需要做二次筛选);有时由于取值太大,计算代价函数时会出现 NaN,可以在 cost 函数里加入断言(assert)检查,计算出 NaN 时立即报错定位;画图时不同横纵轴的图形之间要注意用 plt.show() 隔开,否则会出现图像重叠的情况。