Essentially, what linear regression fits is
$$label^{(i)} = w_1 x_1^{(i)} + w_2 x_2^{(i)} + \dots + w_p x_p^{(i)} + b$$
That is, in matrix form,

$$y = Xw + b$$

(in the code below there is no explicit $b$ term; the constant term is absorbed into $w$, so the prediction is simply y_hat = x_mat * ws).
The derivation of these formulas is given in another post: 线性回归系数,局部加权线性回归系数的数学推导 (mathematical derivation of the linear regression and locally weighted linear regression coefficients).
Linear Regression
Compute ws (i.e. $\hat{w}$), then use it together with X to produce the fitted values.
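For reference, the closed-form least-squares solution that stand_regression below computes is

$$\hat{w} = (X^T X)^{-1} X^T y,$$

which corresponds exactly to the line ws = xTx.I * (x_mat.T * y_mat) in the code.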
from numpy import *
import matplotlib.pyplot as plt

# the same code as before
def local_dataset(filename):
    num_feat = len(open(filename).readline().split('\t')) - 1
    datamat = []
    labelmat = []
    fr = open(filename, 'r')
    for line in fr.readlines():
        line_arr = []
        cur_line = line.strip().split('\t')
        for i in range(num_feat):
            line_arr.append(float(cur_line[i]))
        datamat.append(line_arr)
        labelmat.append(float(cur_line[-1]))
    return datamat, labelmat

# traditional linear regression
def stand_regression(x_arr, y_arr):
    x_mat = mat(x_arr)
    y_mat = mat(y_arr).T
    xTx = x_mat.T * x_mat
    if linalg.det(xTx) == 0.0:
        print('This matrix is singular, cannot do inverse')
        return
    ws = xTx.I * (x_mat.T * y_mat)  # w estimation
    return ws

if __name__ == '__main__':
    x_arr, y_arr = local_dataset('ex0.txt')
    ws = stand_regression(x_arr, y_arr)
    x_mat = mat(x_arr)
    y_mat = mat(y_arr)
    y_hat = x_mat * ws
    # plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x_mat[:, 1].flatten().A[0], y_mat.T[:, 0].flatten().A[0])
    '''
    ax.plot(x_mat[:, 1], y_hat)   # plotting against unsorted x scribbles back and forth
    plt.show()
    '''
    x_copy = x_mat.copy()
    x_copy.sort(0)
    y_hat = x_copy * ws
    ax.plot(x_copy[:, 1], y_hat)
    plt.show()
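As a quick sanity check (my own addition, not part of the original script), the same coefficients can be recovered with NumPy's built-in least-squares solver, assuming the same ex0.txt file and the local_dataset function above:

import numpy as np

# Illustrative sanity check: compare the closed-form ws above with numpy.linalg.lstsq.
x_arr, y_arr = local_dataset('ex0.txt')
X = np.array(x_arr)
y = np.array(y_arr)
ws_lstsq, residuals, rank, sv = np.linalg.lstsq(X, y, rcond=None)
print(ws_lstsq)  # should closely match ws.flatten() from stand_regression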
Locally Weighted Linear Regression
Here a kernel function is used to weight each data point. The Gaussian kernel assigns weights by distance, which is easy to picture: it acts directly on the data to produce a local weighting effect. By contrast, the kernel in an SVM acts on the original data to map it from one feature space to another; intuitively, going from 2-D to 3-D, the separating hyperplane changes from a line into a plane.
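Concretely, the weight the Gaussian kernel assigns to training point $x^{(j)}$ when fitting around a query point $x$ is

$$W_{jj} = \exp\!\left(-\frac{\lVert x^{(j)} - x \rVert^2}{2k^2}\right),$$

which is the quantity filled into the diagonal weights matrix in lwlr_ws below; $k$ controls how quickly the weight falls off with distance.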
Locally weighted linear regression gives each data point x_i its own set of weights. Because a Gaussian kernel is used, estimating at x_i means comparing every other row against that row and weighting it accordingly, so the estimate yhat of y_i is not obtained in one shot as in plain linear regression; instead each yhat[i] is computed separately.
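Each per-point fit solves the weighted normal equation (this is what lwlr_ws returns):

$$\hat{w} = (X^T W X)^{-1} X^T W y,$$

and the prediction for the query point is then $\hat{y} = x\,\hat{w}$.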
from numpy import *
import matplotlib.pyplot as plt

# the same code as before
def local_dataset(filename):
    num_feat = len(open(filename).readline().split('\t')) - 1
    datamat = []
    labelmat = []
    fr = open(filename, 'r')
    for line in fr.readlines():
        line_arr = []
        cur_line = line.strip().split('\t')
        for i in range(num_feat):
            line_arr.append(float(cur_line[i]))
        datamat.append(line_arr)
        labelmat.append(float(cur_line[-1]))
    return datamat, labelmat

# traditional linear regression
def stand_regression(x_arr, y_arr):
    x_mat = mat(x_arr)
    y_mat = mat(y_arr).T
    xTx = x_mat.T * x_mat
    if linalg.det(xTx) == 0.0:
        print('This matrix is singular, cannot do inverse')
        return
    ws = xTx.I * (x_mat.T * y_mat)  # w estimation
    return ws

# solve the locally weighted regression coefficients for one query point
def lwlr_ws(cur_point, x_arr, y_arr, k=1.0):
    x_mat = mat(x_arr)
    y_mat = mat(y_arr).T
    n = shape(x_mat)[0]
    weights = mat(eye(n))
    for i in range(n):
        diffmat = cur_point - x_mat[i, :]
        weights[i, i] = exp(diffmat * diffmat.T / (-2.0 * k ** 2))  # Gaussian kernel weight
    xTx = x_mat.T * (weights * x_mat)
    if linalg.det(xTx) == 0.0:
        print('This matrix is singular, cannot do inverse')
        return
    ws = xTx.I * (x_mat.T * weights * y_mat)
    return ws

# calc each yhat[i]
def lwlr_yhat(all_point, x_arr, y_arr, k=1.0):  # loops over all the data points and applies lwlr to each one
    n = shape(all_point)[0]
    yhat = zeros(n)
    for i in range(n):
        yhat[i] = all_point[i] * lwlr_ws(all_point[i], x_arr, y_arr, k)
    return yhat
# plt
def lwlr_plot():
    x_arr, y_arr = local_dataset('ex0.txt')
    x_mat = mat(x_arr)
    y_mat = mat(y_arr)
    '''
    The next two lines are tricky; I stepped through them in IDLE3 to see what happens.
    argsort(0) returns the indices that would sort along axis 0, and on a matrix the
    result is itself a matrix. For example, with a = mat([[3, 4], [1, 2]]):
        a.argsort(0)        -> [[1, 1], [0, 0]]
        a[:, 1].argsort(0)  -> [[1], [0]]
    Because sort_index is an n x 1 matrix, x_mat[sort_index] picks up an extra
    dimension (three in total), so three indices are needed -- hence [:, 0, :].
    '''
    sort_index = x_mat[:, 1].argsort(0)
    x_sort = x_mat[sort_index][:, 0, :]
    yhat = lwlr_yhat(x_mat, x_arr, y_arr, k=0.01)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x_mat[:, 1].flatten().A[0], y_mat.T[:, 0].flatten().A[0], s=2, c='red')
    ax.plot(x_sort[:, 1], yhat[sort_index])
    plt.show()

if __name__ == '__main__':
    lwlr_plot()
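The bandwidth k controls how local the fit is: a large k behaves almost like plain linear regression, while a very small k chases the noise. A minimal sketch for comparing a few values (my own addition; it reuses local_dataset and lwlr_yhat from above and assumes the same ex0.txt file):

# Illustrative experiment: fit with several kernel bandwidths k on the same data.
x_arr, y_arr = local_dataset('ex0.txt')
x_mat = mat(x_arr)
sort_index = x_mat[:, 1].argsort(0)
x_sort = x_mat[sort_index][:, 0, :]

fig, axes = plt.subplots(3, 1, sharex=True)
for ax, k in zip(axes, (1.0, 0.01, 0.003)):
    yhat = lwlr_yhat(x_mat, x_arr, y_arr, k=k)  # one fit per bandwidth
    ax.scatter(x_mat[:, 1].flatten().A[0], mat(y_arr).T[:, 0].flatten().A[0], s=2, c='red')
    ax.plot(x_sort[:, 1], yhat[sort_index])
    ax.set_title('k = %s' % k)
plt.show()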