I. Overview
1. Ordinary Least Squares (OLS)
OLS fits the weights by minimizing the squared error $\sum_i (y_i - x_i^T w)^2$; setting the derivative with respect to $w$ to zero gives the closed-form solution $\hat{w} = (X^T X)^{-1} X^T y$. Before applying it, make sure $X^T X$ is invertible (the code below checks its determinant).
import numpy as np

def loadDataSet(fileName):    # general function to parse tab-delimited floats
    numFeat = len(open(fileName).readline().split('\t')) - 1    # get number of fields
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat

def standRegres(xArr, yArr):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:    # check that xTx is invertible
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)    # closed-form OLS solution
    return ws
'''
xArr,yArr = loadDataSet(path+'\\ex0.txt')
ws = standRegres(xArr,yArr)
xMat = np.mat(xArr)
yMat = np.mat(yArr)
yHat = xMat*ws
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0])
xCopy = xMat.copy()
xCopy.sort(0)
yHat1 = xCopy*ws
ax.plot(xCopy[:,1],yHat1)
'''
2. Locally Weighted Linear Regression (LWLR)
Each point near the prediction point is assigned a weight; then, much like ordinary linear regression, an ordinary least-squares fit is performed on this weighted subset by minimizing the mean squared error.
Drawback: it increases the amount of computation, because the entire data set must be used every time a prediction is made for a single point.
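For reference against the code below: the weights come from a Gaussian kernel and the fit solves weighted normal equations, where $x$ is the point being predicted, $x^{(j)}$ is a training point, and $k$ is the kernel width controlling how quickly the weights fall off with distance:

$w_{jj} = \exp\left(\frac{-\|x^{(j)} - x\|^2}{2k^2}\right), \qquad \hat{w} = (X^T W X)^{-1} X^T W y$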
'''
Locally Weighted Linear Regression (LWLR)
'''
def lwlr(testPoint, xArr, yArr, k=1.0):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    m = np.shape(xMat)[0]
    weights = np.mat(np.eye((m)))
    for j in range(m):    # next 2 lines create the diagonal weights matrix
        diffMat = testPoint - xMat[j,:]
        weights[j,j] = np.exp(diffMat*diffMat.T/(-2.0*k**2))    # Gaussian kernel
    xTx = xMat.T * (weights * xMat)
    if np.linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws

def lwlrTest(testArr, xArr, yArr, k=1.0):    # loops over all the data points and applies lwlr to each one
    m = np.shape(testArr)[0]
    yHat = np.zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
    return yHat

def lwlrTestPlot(xArr, yArr, k=1.0):    # same as lwlrTest except it sorts X first, which is easier for plotting
    yHat1 = np.zeros(np.shape(yArr))
    xCopy = np.mat(xArr)
    xCopy.sort(0)
    for i in range(np.shape(xArr)[0]):
        yHat1[i] = lwlr(xCopy[i], xArr, yArr, k)
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    ax.scatter(xMat[:,1].flatten().A[0], yMat[:,0].flatten().A[0])
    ax.plot(xCopy[:,1], yHat1)
    plt.title('k=%s' % (k))
'''
xArr,yArr = loadDataSet(path+'\\ex0.txt')
lwlrTestPlot(xArr,yArr,k=0.003)
'''
3. Example: Predicting the Age of Abalone
'''
Conclusion: a smaller kernel gives a lower error on the training data, but a kernel that is too small causes overfitting, leading to a much larger error on the test set.
'''
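The notes do not include the abalone experiment code itself; below is a minimal sketch of the kind of comparison behind this conclusion. The split into the first 99 samples for training and the next 100 for testing is only illustrative, and path is assumed to point at the data directory as in the other snippets. It also defines the rssError (residual sum of squared errors) helper that stageWise in section 6 relies on.

def rssError(yArr, yHatArr):    # residual sum of squared errors between targets and predictions
    return ((yArr - yHatArr)**2).sum()
'''
abX,abY = loadDataSet(path+'\\abalone.txt')
for k in (0.1, 1.0, 10.0):
    yHatTrain = lwlrTest(abX[0:99], abX[0:99], abY[0:99], k)      # error on the training points
    yHatTest  = lwlrTest(abX[100:199], abX[0:99], abY[0:99], k)   # error on held-out points
    print(k, rssError(np.array(abY[0:99]), yHatTrain), rssError(np.array(abY[100:199]), yHatTest))
'''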
4. Ridge Regression
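Ridge regression adds an L2 penalty, which shrinks the coefficients and keeps the normal equations invertible even when the features are correlated or outnumber the samples. The closed-form solution is $\hat{w} = (X^T X + \lambda I)^{-1} X^T y$, where $\lambda$ is the shrinkage parameter (lam in the code below).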
'''
Ridge regression
'''
def ridgeRegres(xMat, yMat, lam=0.2):
    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    if np.linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws

def ridgeTest(xArr, yArr):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    yMean = np.mean(yMat, 0)
    yMat = yMat - yMean    # to eliminate X0 take mean off of Y
    # regularize X's
    xMeans = np.mean(xMat, 0)    # calc mean then subtract it off
    xVar = np.var(xMat, 0)       # calc variance of Xi then divide by it
    xMat = (xMat - xMeans) / xVar
    numTestPts = 30
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    for i in range(numTestPts):    # try 30 lambda values on an exponential scale
        ws = ridgeRegres(xMat, yMat, np.exp(i-10))
        wMat[i,:] = ws.T
    return wMat
'''
abX,abY = loadDataSet(path+'\\abalone.txt')
ridgeWeights = ridgeTest(abX,abY)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()
'''
5. The Lasso (Least Absolute Shrinkage and Selection Operator)
'''
Lasso is not implemented here; I have not fully understood it yet.
'''
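Briefly: the lasso keeps the ordinary squared-error objective but constrains the coefficients with an L1 bound, $\sum_k |w_k| \le \lambda$, rather than ridge's L2 bound. This tends to drive some coefficients exactly to zero, but there is no closed-form solution, which is why the simpler forward stagewise procedure in the next section is used to get a similar effect.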
6. Forward Stagewise Regression
The forward stagewise regression algorithm achieves results similar to the lasso but is much simpler. It is a greedy algorithm: at every step it adjusts one coefficient so as to reduce the error as much as possible.
'''
Standardize the data so each feature has zero mean and unit variance.
In each iteration:
    Set the current lowest error, lowestError, to +infinity
    For each feature:
        For increasing and decreasing the coefficient:
            Change that coefficient by a small step to get a new W
            Compute the error under the new W
            If the error is less than lowestError: set Wbest to the current W
    Set W to the new Wbest
'''
def regularize(xMat):    # standardize each column to zero mean and unit variance
    inMat = xMat.copy()
    inMeans = np.mean(inMat, 0)
    inVar = np.var(inMat, 0)
    inMat = (inMat - inMeans) / inVar
    return inMat

def stageWise(xArr, yArr, eps=0.01, numIt=1000):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    yMean = np.mean(yMat, 0)
    yMat = yMat - yMean
    xMat = regularize(xMat)
    m, n = np.shape(xMat)
    ws = np.zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
    returnMat = np.zeros((numIt, n))
    for i in range(numIt):
        print(ws.T)
        lowestError = np.inf
        for j in range(n):    # try nudging each coefficient up and down by eps
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)    # uses the rssError helper defined above
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:] = ws.T
    return returnMat
abX,abY = loadDataSet(path+'\\abalone.txt')
returnMat = stageWise(abX,abY)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(returnMat)
plt.show()
'''
Summary:
When a shrinkage method such as stagewise regression or ridge regression is applied, the model's bias increases,
while at the same time its variance decreases.
'''