【Machine Learning Exercise 5】- The Bias-Variance Tradeoff
This chapter covers a Python-based solution to the fifth programming exercise of the Coursera Machine Learning course.
Dataset link: link
import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def load_data():
"""for ex5
d['X'] shape = (12, 1)
pandas has trouble taking this 2d ndarray to construct a dataframe, so I ravel
the results
"""
d = sio.loadmat('ex5data1.mat')
return map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']])
X, y, Xval, yval, Xtest, ytest = load_data()
X, y, Xval, yval, Xtest, ytest
df = pd.DataFrame({'water_level':X, 'flow':y})
sns.lmplot(x='water_level', y='flow', data=df, fit_reg=False, height=7)
plt.show()
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]
Cost function
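The (unregularized) linear-regression cost computed by the function below, in vectorized form, is

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2, \qquad h_\theta(x) = \theta^T x$$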
def cost(theta, X, y):
"""
X: R(m*n), m records, n features
y: R(m)
theta : R(n), linear regression parameters
"""
m = X.shape[0]
inner = X @ theta - y  # residual vector, shape (m,)
# X @ theta and y are both 1-D arrays, so inner.T @ inner is simply the
# inner product of the residual with itself, i.e. the sum of squared errors
square_sum = inner.T @ inner
cost = square_sum / (2 * m)
return cost
theta = np.ones(X.shape[1])
cost(theta, X, y)
303.95152555359766
Gradient
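Its gradient is

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)},$$

which in vectorized form is $\nabla J(\theta) = \frac{1}{m}X^T(X\theta - y)$, exactly what the function below returns.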
def gradient(theta, X, y):
m = X.shape[0]
inner = X.T @ (X @ theta - y) # (m,n).T @ (m, 1) -> (n, 1)
return inner / m
gradient(theta, X, y)
array([-15.30301567, 598.16741084])
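As a quick sanity check (my own addition, not part of the exercise), the analytic gradient can be compared against a central finite-difference approximation of the cost:

def numerical_gradient(theta, X, y, eps=1e-4):
    # approximate each partial derivative of cost() by central differences
    grad = np.zeros_like(theta)
    for j in range(len(theta)):
        step = np.zeros_like(theta)
        step[j] = eps
        grad[j] = (cost(theta + step, X, y) - cost(theta - step, X, y)) / (2 * eps)
    return grad

numerical_gradient(theta, X, y)  # should closely match gradient(theta, X, y)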
Regularized gradient
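With regularization, every $\theta_j$ with $j \ge 1$ picks up an extra $\frac{\lambda}{m}\theta_j$ term; the intercept $\theta_0$ is left unregularized:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j \qquad (j \ge 1)$$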
def regularized_gradient(theta, X, y, l=1):
m = X.shape[0]
regularized_term = theta.copy() # same shape as theta
regularized_term[0] = 0 # don't regularize intercept theta
regularized_term = (l / m) * regularized_term
return gradient(theta, X, y) + regularized_term
regularized_gradient(theta, X, y)
array([-15.30301567, 598.25074417])
Fitting the data
Regularization term $\lambda = 0$
def linear_regression_np(X, y, l=1):
"""linear regression
args:
X: feature matrix, (m, n+1)  # with intercept x0=1
y: target vector, (m, )
l: lambda constant for regularization
return: trained parameters
"""
# init theta
theta = np.ones(X.shape[1])
# train it
res = opt.minimize(fun=regularized_cost,
x0=theta,
args=(X, y, l),
method='TNC',
jac=regularized_gradient,
options={'disp': True})
return res
def regularized_cost(theta, X, y, l=1):
m = X.shape[0]
regularized_term = (l / (2 * m)) * np.power(theta[1:], 2).sum()
return cost(theta, X, y) + regularized_term
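For reference, regularized_cost implements

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2 + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2,$$

again excluding $\theta_0$ from the penalty.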
theta = np.ones(X.shape[1])
final_theta = linear_regression_np(X, y, l=0).get('x')
b = final_theta[0] # intercept
m = final_theta[1] # slope
plt.scatter(X[:,1], y, label="Training data")
plt.plot(X[:, 1], X[:, 1]*m + b, label="Prediction")
plt.legend(loc=2)
plt.show()
training_cost, cv_cost = [], []
1. Fit the model with increasingly large subsets of the training set.
2. Do not use regularization when computing the training cost and cross-validation cost.
3. Remember to compute the training cost on the same training subset that was used to fit the parameters.
m = X.shape[0]
for i in range(1, m+1):
# print('i={}'.format(i))
res = linear_regression_np(X[:i, :], y[:i], l=0)
tc = regularized_cost(res.x, X[:i, :], y[:i], l=0)
cv = regularized_cost(res.x, Xval, yval, l=0)
# print('tc={}, cv={}'.format(tc, cv))
training_cost.append(tc)
cv_cost.append(cv)
plt.plot(np.arange(1, m+1), training_cost, label='training cost')
plt.plot(np.arange(1, m+1), cv_cost, label='cv cost')
plt.legend(loc=1)
plt.show()
This model does not fit the data well; it is underfitting.
Creating polynomial features
def poly_features(x, power, as_ndarray=False):
data = {'f{}'.format(i): np.power(x, i) for i in range(1, power + 1)}
df = pd.DataFrame(data)
return df.to_numpy() if as_ndarray else df
X, y, Xval, yval, Xtest, ytest = load_data()
poly_features(X, power=3)
| | f1 | f2 | f3 |
|---|---|---|---|
| 0 | -15.936758 | 253.980260 | -4047.621971 |
| 1 | -29.152979 | 849.896197 | -24777.006175 |
| 2 | 36.189549 | 1309.683430 | 47396.852168 |
| 3 | 37.492187 | 1405.664111 | 52701.422173 |
| 4 | -48.058829 | 2309.651088 | -110999.127750 |
| 5 | -8.941458 | 79.949670 | -714.866612 |
| 6 | 15.307793 | 234.328523 | 3587.052500 |
| 7 | -34.706266 | 1204.524887 | -41804.560890 |
| 8 | 1.389154 | 1.929750 | 2.680720 |
| 9 | -44.383760 | 1969.918139 | -87432.373590 |
| 10 | 7.013502 | 49.189211 | 344.988637 |
| 11 | 22.762749 | 518.142738 | 11794.353058 |
Preparing the polynomial regression data
- Expand the feature up to the 8th power (or whatever degree you need)
- Use normalization to handle the $x^n$ columns, whose scales differ wildly
- Don't forget the intercept term
def normalize_feature(df):
"""Applies function along input axis(default 0) of DataFrame."""
return df.apply(lambda column: (column - column.mean()) / column.std())
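A quick check (my own addition, not part of the exercise): after normalize_feature, every column should have mean close to 0 and standard deviation close to 1.

demo = normalize_feature(poly_features(X, power=3))
print(demo.mean().round(6))   # each column is approximately 0
print(demo.std().round(6))    # each column is approximately 1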
def prepare_poly_data(*args, power):
"""
args: keep feeding in X, Xval, or Xtest
will return in the same order
"""
def prepare(x):
# expand feature
df = poly_features(x, power=power)
# normalization
ndarr = normalize_feature(df).to_numpy()  # to_numpy() replaces the deprecated as_matrix()
# add intercept term
return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis=1)
return [prepare(x) for x in args]
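Note that prepare_poly_data normalizes each of X, Xval, and Xtest with its own column means and standard deviations. Standard practice is to normalize the validation and test sets with the training set's statistics instead, as the original MATLAB starter code for this exercise does. A minimal sketch of that variant (the function name is my own, and it is not used by the code below):

def prepare_poly_data_train_stats(X, Xval, Xtest, power):
    # compute normalization statistics on the training set only
    train_df = poly_features(X, power=power)
    means, stds = train_df.mean(), train_df.std()
    def prepare(x):
        # reuse the training means/stds for every split
        ndarr = ((poly_features(x, power=power) - means) / stds).to_numpy()
        return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis=1)
    return [prepare(x) for x in (X, Xval, Xtest)]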
X_poly, Xval_poly, Xtest_poly = prepare_poly_data(X, Xval, Xtest, power=8)
X_poly[:3, :]
array([[ 1.00000000e+00, -3.62140776e-01, -7.55086688e-01,
1.82225876e-01, -7.06189908e-01, 3.06617917e-01,
-5.90877673e-01, 3.44515797e-01, -5.08481165e-01],
[ 1.00000000e+00, -8.03204845e-01, 1.25825266e-03,
-2.47936991e-01, -3.27023420e-01, 9.33963187e-02,
-4.35817606e-01, 2.55416116e-01, -4.48912493e-01],
[ 1.00000000e+00, 1.37746700e+00, 5.84826715e-01,
1.24976856e+00, 2.45311974e-01, 9.78359696e-01,
-1.21556976e-02, 7.56568484e-01, -1.70352114e-01]])
Each row of X_poly has 9 columns: the intercept term (always 1) followed by the normalized powers $x^1, x^2, \dots, x^8$ of that example's single water_level feature. The expansion therefore runs from the first power up to the 8th power of the same feature, which is exactly what the first three rows shown above contain.
Plotting the learning curves
First, we use no regularization, so $\lambda = 0$.
def plot_learning_curve(X, y, Xval, yval, l=0):
training_cost, cv_cost = [], []
m = X.shape[0]
for i in range(1, m + 1):
# regularization applies here for fitting parameters
res = linear_regression_np(X[:i, :], y[:i], l=l)
# remember, when you compute the cost here, you are computing
# non-regularized cost. Regularization is used to fit parameters only
tc = cost(res.x, X[:i, :], y[:i])
cv = cost(res.x, Xval, yval)
training_cost.append(tc)
cv_cost.append(cv)
plt.plot(np.arange(1, m + 1), training_cost, label='training cost')
plt.plot(np.arange(1, m + 1), cv_cost, label='cv cost')
plt.legend(loc=1)
plot_learning_curve(X_poly, y, Xval_poly, yval, l=0)
plt.show()
You can see that the training cost is unrealistically low. This is overfitting.
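One way to see this overfitting directly (my own addition, not part of the exercise) is to plot the $\lambda = 0$ fit at the training points:

fit_theta = linear_regression_np(X_poly, y, l=0).x
order = np.argsort(X)                        # sort by water level so the line is readable
plt.scatter(X, y, label='Training data')
plt.plot(X[order], (X_poly @ fit_theta)[order], 'r--', label='degree-8 fit (lambda=0)')
plt.xlabel('water_level')
plt.ylabel('flow')
plt.legend()
plt.show()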
Try $\lambda = 1$
plot_learning_curve(X_poly, y, Xval_poly, yval, l=1)
plt.show()
The training cost has increased a bit and is no longer 0.
In other words, we have reduced the overfitting.
Try $\lambda = 100$
plot_learning_curve(X_poly, y, Xval_poly, yval, l=100)
plt.show()
That is too much regularization.
The model is now underfitting.
Finding the best $\lambda$
l_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost, cv_cost = [], []
for l in l_candidate:
res = linear_regression_np(X_poly, y, l)
tc = cost(res.x, X_poly, y)
cv = cost(res.x, Xval_poly, yval)
training_cost.append(tc)
cv_cost.append(cv)
plt.plot(l_candidate, training_cost, label='training')
plt.plot(l_candidate, cv_cost, label='cross validation')
plt.legend(loc=2)
plt.xlabel('lambda')
plt.ylabel('cost')
plt.show()
# best cv I got from all those candidates
l_candidate[np.argmin(cv_cost)]
1
# use test data to compute the cost
cost_results = {}
for l in l_candidate:
theta = linear_regression_np(X_poly, y, l).x
test_cost = cost(theta, Xtest_poly, ytest)
cost_results[l] = test_cost
print(f'test cost(l={l}) = {test_cost}')
# find the lambda with the smallest test cost
best_l = min(cost_results, key=cost_results.get)
print(f'smallest test cost at λ = {best_l}, cost = {cost_results[best_l]}')
test cost(l=0) = 10.02464287753563
test cost(l=0.001) = 11.03989643825398
test cost(l=0.003) = 11.263612931846453
test cost(l=0.01) = 10.87954944353898
test cost(l=0.03) = 10.022095688423208
test cost(l=0.1) = 8.632065168219597
test cost(l=0.3) = 7.336723122820889
test cost(l=1) = 7.466289574046468
test cost(l=3) = 11.643928179962794
test cost(l=10) = 27.715080206181412
smallest test cost at λ = 0.3, cost = 7.336723122820889
After tuning, $\lambda = 0.3$ is the best choice: it gives the smallest test cost.