I. Overview
1. Ordinary Least Squares (OLS)
OLS fits the weights by minimizing the squared error $\sum_i (y_i - x_i^T w)^2$; setting the derivative with respect to $w$ to zero gives the closed-form solution $\hat{w} = (X^T X)^{-1} X^T y$. Before applying it, make sure $X^T X$ is invertible (the code below checks its determinant).
import numpy as np

def loadDataSet(fileName):    # general function to parse tab-delimited floats
    numFeat = len(open(fileName).readline().split('\t')) - 1    # get number of fields
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat

def standRegres(xArr, yArr):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:    # check that xTx is invertible
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)    # closed-form OLS solution
    return ws
'''
xArr,yArr = loadDataSet(path+'\\ex0.txt')
ws = standRegres(xArr,yArr)
xMat = np.mat(xArr)
yMat = np.mat(yArr)
yHat = xMat*ws
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0])
xCopy = xMat.copy()
xCopy.sort(0)
yHat1 = xCopy*ws
ax.plot(xCopy[:,1],yHat1)
'''
2. Locally Weighted Linear Regression (LWLR)
Each point near the prediction point is assigned a weight; then, much like ordinary linear regression, an ordinary least-squares fit is performed on this weighted subset by minimizing the mean squared error.
Drawback: it increases the amount of computation, because the entire data set must be used every time a prediction is made for a single point.
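For reference against the code below: the weights come from a Gaussian kernel and the fit solves weighted normal equations, where $x$ is the point being predicted, $x^{(j)}$ is a training point, and $k$ is the kernel width controlling how quickly the weights fall off with distance:

$w_{jj} = \exp\left(\frac{-\|x^{(j)} - x\|^2}{2k^2}\right), \qquad \hat{w} = (X^T W X)^{-1} X^T W y$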
'''
Locally Weighted Linear Regression (LWLR)
'''
def lwlr(testPoint, xArr, yArr, k=1.0):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    m = np.shape(xMat)[0]
    weights = np.mat(np.eye((m)))
    for j in range(m):    # next 2 lines create the diagonal weights matrix
        diffMat = testPoint - xMat[j,:]
        weights[j,j] = np.exp(diffMat*diffMat.T/(-2.0*k**2))    # Gaussian kernel
    xTx = xMat.T * (weights * xMat)
    if np.linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws

def lwlrTest(testArr, xArr, yArr, k=1.0):    # loops over all the data points and applies lwlr to each one
    m = np.shape(testArr)[0]
    yHat = np.zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
    return yHat

def lwlrTestPlot(xArr, yArr, k=1.0):    # same as lwlrTest except it sorts X first, which is easier for plotting
    yHat1 = np.zeros(np.shape(yArr))
    xCopy = np.mat(xArr)
    xCopy.sort(0)
    for i in range(np.shape(xArr)[0]):
        yHat1[i] = lwlr(xCopy[i], xArr, yArr, k)
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    ax.scatter(xMat[:,1].flatten().A[0], yMat[:,0].flatten().A[0])
    ax.plot(xCopy[:,1], yHat1)
    plt.title('k=%s' % (k))
'''
xArr,yArr = loadDataSet(path+'\\ex0.txt')
lwlrTestPlot(xArr,yArr,k=0.003)
'''
3. Example: Predicting the Age of Abalone
'''
Conclusion: a smaller kernel gives a lower error on the training data, but a kernel that is too small causes overfitting, leading to a much larger error on the test set.
'''
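The notes do not include the abalone experiment code itself; below is a minimal sketch of the kind of comparison behind this conclusion. The split into the first 99 samples for training and the next 100 for testing is only illustrative, and path is assumed to point at the data directory as in the other snippets. It also defines the rssError (residual sum of squared errors) helper that stageWise in section 6 relies on.

def rssError(yArr, yHatArr):    # residual sum of squared errors between targets and predictions
    return ((yArr - yHatArr)**2).sum()
'''
abX,abY = loadDataSet(path+'\\abalone.txt')
for k in (0.1, 1.0, 10.0):
    yHatTrain = lwlrTest(abX[0:99], abX[0:99], abY[0:99], k)      # error on the training points
    yHatTest  = lwlrTest(abX[100:199], abX[0:99], abY[0:99], k)   # error on held-out points
    print(k, rssError(np.array(abY[0:99]), yHatTrain), rssError(np.array(abY[100:199]), yHatTest))
'''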
4. Ridge Regression
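Ridge regression adds an L2 penalty, which shrinks the coefficients and keeps the normal equations invertible even when the features are correlated or outnumber the samples. The closed-form solution is $\hat{w} = (X^T X + \lambda I)^{-1} X^T y$, where $\lambda$ is the shrinkage parameter (lam in the code below).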
'''
Ridge regression
'''
def ridgeRegres(xMat, yMat, lam=0.2):
    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    if np.linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws

def ridgeTest(xArr, yArr):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    yMean = np.mean(yMat, 0)
    yMat = yMat - yMean    # to eliminate X0 take mean off of Y
    # regularize X's
    xMeans = np.mean(xMat, 0)    # calc mean then subtract it off
    xVar = np.var(xMat, 0)       # calc variance of Xi then divide by it
    xMat = (xMat - xMeans) / xVar
    numTestPts = 30
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    for i in range(numTestPts):    # try 30 lambda values on an exponential scale
        ws = ridgeRegres(xMat, yMat, np.exp(i-10))
        wMat[i,:] = ws.T
    return wMat
'''
abX,abY = loadDataSet(path+'\\abalone.txt')
ridgeWeights = ridgeTest(abX,abY)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()
'''
5. The Lasso (Least Absolute Shrinkage and Selection Operator)
'''
Lasso is not implemented here; I have not fully understood it yet.
'''
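Briefly: the lasso keeps the ordinary squared-error objective but constrains the coefficients with an L1 bound, $\sum_k |w_k| \le \lambda$, rather than ridge's L2 bound. This tends to drive some coefficients exactly to zero, but there is no closed-form solution, which is why the simpler forward stagewise procedure in the next section is used to get a similar effect.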
6. Forward Stagewise Regression
The forward stagewise regression algorithm achieves results similar to the lasso but is much simpler. It is a greedy algorithm: at every step it adjusts one coefficient so as to reduce the error as much as possible.
'''
Standardize the data so each feature has zero mean and unit variance.
In each iteration:
    Set the current lowest error, lowestError, to +infinity
    For each feature:
        For increasing and decreasing the coefficient:
            Change that coefficient by a small step to get a new W
            Compute the error under the new W
            If the error is less than lowestError: set Wbest to the current W
    Set W to the new Wbest
'''
def regularize(xMat):    # standardize each column to zero mean and unit variance
    inMat = xMat.copy()
    inMeans = np.mean(inMat, 0)
    inVar = np.var(inMat, 0)
    inMat = (inMat - inMeans) / inVar
    return inMat

def stageWise(xArr, yArr, eps=0.01, numIt=1000):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    yMean = np.mean(yMat, 0)
    yMat = yMat - yMean
    xMat = regularize(xMat)
    m, n = np.shape(xMat)
    ws = np.zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
    returnMat = np.zeros((numIt, n))
    for i in range(numIt):
        print(ws.T)
        lowestError = np.inf
        for j in range(n):    # try nudging each coefficient up and down by eps
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)    # uses the rssError helper defined above
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:] = ws.T
    return returnMat
abX,abY = loadDataSet(path+'\\abalone.txt')
returnMat = stageWise(abX,abY)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(returnMat)
plt.show()
'''
Summary:
When a shrinkage method such as stagewise regression or ridge regression is applied, the model's bias increases,
while at the same time its variance decreases.
'''