from PIL import Image
from numpy import *
import copy
import pandas as pd
import matplotlib.pyplot as plt
Machine Learning, Part 1 - Linear Regression
Linear Regression
- Pros: results are easy to interpret, and the computation is simple
- Cons: fits nonlinear data poorly
- Applicable data types: numeric and nominal
Normal Equations
While auditing a course at Fudan, I found Professor Wu's derivation via vector projection truly elegant, so I'll simply paste the slides below.
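In brief: the fitted vector $X\theta$ is the projection of $y$ onto the column space of $X$, so the residual $y-X\theta$ must be orthogonal to every column of $X$:
$X^T(y-X\theta)=0 \;\Rightarrow\; X^TX\,\theta=X^Ty \;\Rightarrow\; \theta=(X^TX)^{-1}X^Ty$
This is exactly the closed form computed in standRegres below.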
Locally Weighted Linear Regression (LWLR)
LWLR was proposed to address the underfitting problem of plain linear regression. For each prediction point, every point in the original data influences it with some weight: the closer a point is, the larger its weight. A Gaussian kernel is used here:
$w(i,i)=\exp\left(\frac{(x^{(i)}-x)^2}{-2k^2}\right)$
The only parameter LWLR requires is k: the larger k is, the wider the neighborhood of points that receive significant weight. The plot below illustrates this effect (assuming the prediction point is 0.5):
def gaussKernel(x, k=0.5, x0=0.5):
    # Gaussian kernel weight of x relative to the prediction point x0
    return exp((x - x0)**2 / (-2.0 * k**2))
fig = plt.figure()
x = linspace(-0.2, 1.2, 200, endpoint=True)
y1 = gaussKernel(x)
y2 = gaussKernel(x, 0.1)
y3 = gaussKernel(x, 0.01)
plt.plot(x, y1, label="k=0.5")
plt.plot(x, y2, label="k=0.1")
plt.plot(x, y3, label="k=0.01")
plt.legend(loc='upper left')
plt.show()
xArr,yArr = loadDataSet('ex0.txt')
ws = standRegres(xArr,yArr)
print(ws)
# [[ 3.00774324]
# [ 1.69532264]]
xMat,yMat = mat(xArr), mat(yArr)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:,1].flatten().A[0], yMat.T[:,0].flatten().A[0])
xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy * ws
ax.plot(xCopy[:,1], yHat)
plt.show()
yHat = xMat * ws
# evaluate the fit via the correlation coefficient between yHat and the actual y
corrcoef(yHat.T, yMat)
'''
Out[157]:
array([[ 1. , 0.98647356],
[ 0.98647356, 1. ]])
'''
yHat = LWLRTest(xArr, xArr, yArr, 0.01)
xMat = mat(xArr)
srtInd = xMat[:,1].argsort(0)
xSort = xMat[srtInd][:,0,:]
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xSort[:,1],yHat[srtInd])
ax.scatter(xMat[:,1].flatten().A[0], mat(yArr).T.flatten().A[0], s=2)
plt.show()
Case Study: Predicting the Age of Abalone
This case study and its code follow the example in Peter Harrington's Machine Learning in Action, with a step-by-step analysis.
df = pd.read_table('abalone.txt', header=None)
print(df.head())
'''
0 1 2 3 4 5 6 7 8
0 1 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
1 1 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
2 -1 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
3 1 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
4 0 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7
'''
abX, abY = loadDataSet('abalone.txt')
yHat01 = LWLRTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = LWLRTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = LWLRTest(abX[0:99], abX[0:99], abY[0:99], 10)
print("----------训练集上的对比误差----------")
print(" k=0.1的误差为:" + str(rssError(abY[0:99], yHat01.T)))
print(" k=1的误差为:" + str(rssError(abY[0:99], yHat1.T)))
print(" k=10的误差为:" + str(rssError(abY[0:99], yHat10.T)))
print("-----------------------------------")
'''
----------训练集上的对比误差----------
k=0.1的误差为:56.8253937728
k=1的误差为:429.89056187
k=10的误差为:549.118170883
-----------------------------------
'''
# evaluate on a held-out set
yHat01 = LWLRTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = LWLRTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = LWLRTest(abX[100:199], abX[0:99], abY[0:99], 10)
print("----------测试集上的对比误差----------")
print(" k=0.1的误差为:" + str(rssError(abY[100:199], yHat01.T)))
print(" k=1的误差为:" + str(rssError(abY[100:199], yHat1.T)))
print(" k=10的误差为:" + str(rssError(abY[100:199], yHat10.T)))
print("-----------------------------------")
'''
----------测试集上的对比误差----------
k=0.1的误差为:64688.8082354
k=1的误差为:573.52614419
k=10的误差为:517.571190538
-----------------------------------
'''
# compare with plain linear regression
ws = standRegres(abX[0:99], abY[0:99])
yHat = mat(abX[100:199]) * ws
print("Error for plain LR: " + str(rssError(abY[100:199], yHat.T.A)))
# Error for plain LR: 518.636315324
Plain linear regression (518.64) does about as well on this test split as the best LWLR setting (k=10, 517.57), so the extra cost of LWLR buys little here.
Ridge Regression
Ridge regression adds $\lambda I$ to $X^TX$ so that the matrix becomes non-singular. Ridge regression is essentially the same as L2 regularization, so it can also be used to prevent overfitting.
With the ridge term, the normal equation becomes:
$\theta=(X^TX+\lambda I)^{-1}X^Ty$
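As a quick illustration, here is a minimal sketch that evaluates this closed form directly with numpy (X, y, and lam below are made-up toy values, not part of the book's code):
# solve the ridge normal equation directly on toy data
X = mat([[1.0, 0.1], [1.0, 0.5], [1.0, 0.9]])  # illustrative design matrix with a bias column
y = mat([[1.2], [2.0], [2.9]])                  # illustrative targets
lam = 0.1
theta = (X.T * X + lam * eye(2)).I * X.T * y
print(theta)  # 2x1 vector of ridge coefficients
The full ridgeRegres function in the code section below implements the same formula.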
# Observe the ridge regression coefficients for the abalone data under different lambdas;
# each color corresponds to one coefficient theta0,...,theta7
ridgeWeights = ridgeTest(abX,abY)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.xlabel("log(lambda)")
plt.show()
Forward Stagewise Regression
In essence it works under the constraint $\sum_{k=1}^{n}|w_k|\le\lambda$.
Forward stagewise regression is a greedy algorithm: at every step it reduces the error as much as possible.
stageWeights = stageWise(abX,abY,0.001,5000)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(stageWeights)
plt.xlabel("Time Of Iteration")
plt.ylabel("theta value")
plt.show()
print(stageWeights[-1,:])
# [ 0.043 -0.011 0.12 0.022 2.023 -0.963 -0.105 0.187]
Main code:
from numpy import *
def loadDataSet(fileName):
    numFeat = len(open(fileName).readline().split('\t')) - 1
    dataMat, labelMat = [], []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def standRegres(xArr, yArr):
    xMat, yMat = mat(xArr), mat(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print("Matrix is singular; it has no inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    # ws = linalg.solve(xTx, xMat.T * yMat)
    return ws
Locally weighted linear regression
def LWLR(testPoint, xArr, yArr, k=1.0):
    xMat, yMat = mat(xArr), mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye(m))
    for j in range(m):
        # Gaussian kernel weight of each training point relative to testPoint
        diffMat = testPoint - xMat[j,:]
        weights[j,j] = exp(diffMat * diffMat.T / (-2.0 * k**2))
    xTx = xMat.T * weights * xMat
    if linalg.det(xTx) == 0.0:
        print("Matrix is singular; it has no inverse")
        return
    ws = xTx.I * xMat.T * weights * yMat
    return testPoint * ws
def LWLRTest(testArr, xArr, yArr, k=1.0):
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = LWLR(testArr[i], xArr, yArr, k)
    return yHat

def rssError(yArr, yHatArr):
    return ((yArr - yHatArr)**2).sum()
Ridge regression
def ridgeRegres(xMat, yMat, lam=0.2):
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("Matrix is singular; it has no inverse")
        return
    ws = denom.I * xMat.T * yMat
    return ws
Testing a range of lambda values
def ridgeTest(xArr, yArr):
    xMat, yMat = mat(xArr), mat(yArr).T
    # standardize the data (note: the book divides by the variance, not the standard deviation)
    yMean = mean(yMat, 0)
    yMat = yMat - yMean
    xMean = mean(xMat, 0)
    xVar = var(xMat, 0)
    xMat = (xMat - xMean) / xVar
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))  # each row is the theta vector for one lambda
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i - 10))  # lambda sweeps over exp(-10) .. exp(19)
        wMat[i,:] = ws.T
    return wMat
Forward stagewise linear regression
def regularize(xMat):
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)
    inVar = var(inMat, 0)
    inMat = (inMat - inMeans) / inVar
    return inMat
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    xMat, yMat = mat(xArr), mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean
    xMat = regularize(xMat)
    m, n = shape(xMat)
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    wsTest, wsMax = ws.copy(), ws.copy()
    for i in range(numIt):
        # print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                # greedily try nudging each coefficient by +/- eps, keep the best
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:] = ws.T
    return returnMat
Cross-validation for ridge regression
def crossValidation(xArr, yArr, numVal=10):
    m = len(yArr)
    indexList = list(range(m))  # list() so shuffle can operate in place
    errorMat = zeros((numVal, 30))
    for i in range(numVal):
        trainX, trainY, testX, testY = [], [], [], []
        random.shuffle(indexList)
        for j in range(m):
            # 90/10 split into training and test sets
            if j < 0.9 * m:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)
        for k in range(30):
            matTestX = mat(testX)
            matTrainX = mat(trainX)
            meanTrain = mean(matTrainX, 0)
            varTrain = var(matTrainX, 0)
            # standardize the test set with the training statistics
            matTestX = (matTestX - meanTrain) / varTrain
            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)
            errorMat[i,k] = rssError(yEst.T.A, array(testY))
    meanErrors = mean(errorMat, 0)
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    # undo the standardization to express the model on the original scale
    xMat = mat(xArr)
    yMat = mat(yArr).T
    meanX = mean(xMat, 0)
    varX = var(xMat, 0)
    unReg = bestWeights / varX
    print("Best model:", unReg)
    print("Intercept:", -1 * sum(multiply(meanX, unReg)) + mean(yMat))