可视化与数据处理：线性回归与正则化探索-优快云博客

本文链接：https://blog.youkuaiyun.com/m0_45055763/article/details/126083860

可视化与加载数据

import numpy as np
import scipy.io as sio
from scipy.io import loadmat
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

d=loadmat('ex5data1.mat')

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Fri Nov  4 22:27:26 2011',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[-15.93675813],
        [-29.15297922],
        [ 36.18954863],
        [ 37.49218733],
        [-48.05882945],
        [ -8.94145794],
        [ 15.30779289],
        [-34.70626581],
        [  1.38915437],
        [-44.38375985],
        [  7.01350208],
        [ 22.76274892]]),
 'y': array([[ 2.13431051],
        [ 1.17325668],
        [34.35910918],
        [36.83795516],
        [ 2.80896507],
        [ 2.12107248],
        [14.71026831],
        [ 2.61418439],
        [ 3.74017167],
        [ 3.73169131],
        [ 7.62765885],
        [22.7524283 ]]),
 'Xtest': array([[-33.31800399],
        [-37.91216403],
        [-51.20693795],
        [ -6.13259585],
        [ 21.26118327],
        [-40.31952949],
        [-14.54153167],
        [ 32.55976024],
        [ 13.39343255],
        [ 44.20988595],
        [ -1.14267768],
        [-12.76686065],
        [ 34.05450539],
        [ 39.22350028],
        [  1.97449674],
        [ 29.6217551 ],
        [-23.66962971],
        [ -9.01180139],
        [-55.94057091],
        [-35.70859752],
        [  9.51020533]]),
 'ytest': array([[ 3.31688953],
        [ 5.39768952],
        [ 0.13042984],
        [ 6.1925982 ],
        [17.08848712],
        [ 0.79950805],
        [ 2.82479183],
        [28.62123334],
        [17.04639081],
        [55.38437334],
        [ 4.07936733],
        [ 8.27039793],
        [31.32355102],
        [39.15906103],
        [ 8.08727989],
        [24.11134389],
        [ 2.4773548 ],
        [ 6.56606472],
        [ 6.0380888 ],
        [ 4.69273956],
        [10.83004606]]),
 'Xval': array([[-16.74653578],
        [-14.57747075],
        [ 34.51575866],
        [-47.01007574],
        [ 36.97511905],
        [-40.68611002],
        [ -4.47201098],
        [ 26.53363489],
        [-42.7976831 ],
        [ 25.37409938],
        [-31.10955398],
        [ 27.31176864],
        [ -3.26386201],
        [ -1.81827649],
        [-40.7196624 ],
        [-50.01324365],
        [-17.41177155],
        [  3.5881937 ],
        [  7.08548026],
        [ 46.28236902],
        [ 14.61228909]]),
 'yval': array([[ 4.17020201e+00],
        [ 4.06726280e+00],
        [ 3.18730676e+01],
        [ 1.06236562e+01],
        [ 3.18360213e+01],
        [ 4.95936972e+00],
        [ 4.45159880e+00],
        [ 2.22763185e+01],
        [-4.38738274e-05],
        [ 2.05038016e+01],
        [ 3.85834476e+00],
        [ 1.93650529e+01],
        [ 4.88376281e+00],
        [ 1.10971588e+01],
        [ 7.46170827e+00],
        [ 1.47693464e+00],
        [ 2.71916388e+00],
        [ 1.09269007e+01],
        [ 8.34871235e+00],
        [ 5.27819280e+01],
        [ 1.33573396e+01]])}

数据包括：

training set:X,y
cross validation: Xval,yval
test set:Xtest,ytest

X, y, Xval, yval, Xtest, ytest = map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']])

df = pd.DataFrame({'water_level':X, 'flow':y})
df.head()

	water_level	flow
0	-15.936758	2.134311
1	-29.152979	1.173257
2	36.189549	34.359109
3	37.492187	36.837955
4	-48.058829	2.808965

#可视化
sns.lmplot('water_level','flow',data=df,fit_reg=False)
plt.show()

E:\Anaconda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

在这里插入图片描述

Regularized linear regression cost function

X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]

def cost(theta,X,y):
    #input:参数值theta,数据X，标签y
    #output：代价函数
    
    #样本个数
    m=X.shape[0]
    
    #计算代价函数
    inner=X@theta-y
    square_sum=inner.T@inner
    cost=square_sum/(2*m)
    
    return cost

theta = np.ones(X.shape[1])
cost(theta, X, y)

303.9515255535976

#正则化代价函数
def regularized_cost(theta,X,y,l=1):
    m=X.shape[0]
    
    regularized_term=(1/(2*m))*np.power(theta[1:],2).sum()
    return cost(theta,X,y)+regularized_term

Regularized linear regression gradient

def gradient(theta,X,y):
    #input:参数值theta，数据X，标签y
    #output：当前参数下梯度
    
    #样本个数
    m=X.shape[0]
    
    #计算代价函数
    grad=(X.T@(X@theta-y))/m
    
    return grad

gradient(theta, X, y)

array([-15.30301567, 598.16741084])

def regularized_gradient(theta, X, y, l=1):
    
    m=X.shape[0]
    
    regularized_term=theta.copy()
    regularized_term[0]=0#theta0不正则化
    
    regularized_term=(1/m)*regularized_term
    
    return gradient(theta, X, y) + regularized_term

regularized_gradient(theta, X, y)

array([-15.30301567, 598.25074417])

拟合数据

def linear_regression_np(X, y, l=1):

    # STEP1：初始化参数
    theta = np.ones(X.shape[1])
    
    # STEP2：调用优化算法拟合参数
    # your code here  (appro ~ 1 lines)
    res = opt.minimize(fun=regularized_cost,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=regularized_gradient,
                       options={'disp': True})
    return res

theta = np.ones(X.shape[0])

final_theta = linear_regression_np(X, y, l=0).get('x')

b=final_theta[0]
k=final_theta[1]

plt.scatter(X[:,1],y,label='training data',c='r')
plt.plot(X[:,1],X[:,1]*k+b,label='prediction')
plt.legend(loc=2)
plt.show()

在这里插入图片描述

bias-variance

training_cost, cv_cost = [], []

m = X.shape[0]
for i in range(1, m+1):
    # 计算当前样本的代价
    res = linear_regression_np(X[:i, :], y[:i], l=0)
    
    tc = regularized_cost(res.x, X[:i, :], y[:i], l=0)
    cv = regularized_cost(res.x, Xval, yval, l=0)
    
    # 把计算结果存储至预先定义的数组training_cost, cv_cost中
    training_cost.append(tc)
    cv_cost.append(cv)

plt.plot(np.arange(1,m+1),training_cost,label='training cost')
plt.plot(np.arange(1,m+1),cv_cost,label='cv cost')
plt.legend(loc=1)
plt.show()

在这里插入图片描述

多项式回归

#特征映射
def poly_features(x, power, as_ndarray=False):  #特征映射
    data = {'f{}'.format(i): np.power(x, i) for i in range(1, power + 1)}
    df = pd.DataFrame(data)

    return df.values if as_ndarray else df

def normalize_feature(df):
    """Applies function along input axis(default 0) of DataFrame."""
    return df.apply(lambda column: (column - column.mean()) / column.std())

def prepare_poly_data(*args, power):
    """
    args: keep feeding in X, Xval, or Xtest
        will return in the same order
    """
    def prepare(x):
        # 特征映射
        df = poly_features(x, power=power)

        # 归一化处理
        ndarr = normalize_feature(df).values

        # 添加偏置项
        return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis=1)

    return [prepare(x) for x in args]

X, y, Xval, yval, Xtest, ytest = map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']])

poly_features(X, power=3)

	f1	f2	f3
0	-15.936758	253.980260	-4047.621971
1	-29.152979	849.896197	-24777.006175
2	36.189549	1309.683430	47396.852168
3	37.492187	1405.664111	52701.422173
4	-48.058829	2309.651088	-110999.127750
5	-8.941458	79.949670	-714.866612
6	15.307793	234.328523	3587.052500
7	-34.706266	1204.524887	-41804.560890
8	1.389154	1.929750	2.680720
9	-44.383760	1969.918139	-87432.373590
10	7.013502	49.189211	344.988637
11	22.762749	518.142738	11794.353058

画出线性回归

X_poly, Xval_poly, Xtest_poly= prepare_poly_data(X, Xval, Xtest, power=8)

def plot_learning_curve(X, y, Xval, yval, l=0):
# INPUT：训练数据集X,y，交叉验证集Xval，yval，正则化参数l
# OUTPUT：当前参数值下梯度
# TODO：根据参数和输入的数据计算梯度 
    
    # STEP1：初始化参数，获取样本个数，开始遍历
    training_cost, cv_cost = [], []
    m = X.shape[0]
    for i in range(1, m + 1):
        # STEP2：调用之前写好的拟合数据函数进行数据拟合
        # your code here  (appro ~ 1 lines)
        res = linear_regression_np(X[:i, :], y[:i], l=l)
        # STEP3：计算样本代价
        # your code here  (appro ~ 1 lines)
        tc = cost(res.x, X[:i, :], y[:i])
        cv = cost(res.x, Xval, yval)
        # STEP3：把计算结果存储至预先定义的数组training_cost, cv_cost中
        # your code here  (appro ~ 2 lines)
        training_cost.append(tc)
        cv_cost.append(cv)

        

    plt.plot(np.arange(1, m + 1), training_cost, label='training cost')
    plt.plot(np.arange(1, m + 1), cv_cost, label='cv cost')
    plt.legend(loc=1)

plot_learning_curve(X_poly, y, Xval_poly, yval, l=0)
plt.show()

在这里插入图片描述

吴恩达作业ex5：Regularized Linear Regression and Bias v.s. Variance

可视化与加载数据

Regularized linear regression cost function

Regularized linear regression gradient

拟合数据

bias-variance

多项式回归

画出线性回归