Linear regression is one of the fundamental machine learning algorithms. Its goal is to find a straight line that best fits a set of data points. More precisely, it minimizes the sum of the squared vertical distances (residuals) between the points and the line, so that the points all lie close to it. Once fitted, the line can be used to predict the values of new data.
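In symbols, with N data points (x_i, y_i), the fitted line is \hat{y} = w x + b, and the quantity being minimized is the mean squared error that loss_error computes in the code below:

L(w, b) = \frac{1}{N} \sum_{i=1}^{N} (y_i - w x_i - b)^2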
For linear regression, the recommended approach here is the gradient descent algorithm: it is fairly general and places few requirements on the data, which may be discrete and scattered rather than lying on any continuous curve. The example below performs linear regression with gradient descent, using the gradients derived next.
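Differentiating L(w, b) with respect to w and b gives the gradients that linear_gradient implements in the code below:

\frac{\partial L}{\partial w} = -\frac{2}{N} \sum_{i=1}^{N} x_i (y_i - w x_i - b), \quad \frac{\partial L}{\partial b} = -\frac{2}{N} \sum_{i=1}^{N} (y_i - w x_i - b)

Each iteration then updates w \leftarrow w - \eta \, \partial L/\partial w and b \leftarrow b - \eta \, \partial L/\partial b, where the learning rate \eta corresponds to lr in the code.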
Prepare the data:
The two columns below are saved to lineardata.csv (comma-separated, matching the delimiter=',' used when loading) and correspond to the x and y values respectively.
1,3
1.2,3
1.2,4
1.5,4.5
1.6,4.3
6.5,12
3.6,7.1
2.5,9
5.7,14
6,11
9,17
8.9,17
7.1,15
7,14
2.5,4
0.8,2
0.5,2
3.4,7
3.6,9
5.6,12
6.7,15
6.9,15
7.1,14
7.5,17
7.8,16
8.1,15
8.3,15
8.5,15
8.7,16
8.7,17
8.8,18
8.8,20
8,16
9,19
9.2,18
10.1,20
1.1,3.2
1.6,4.2
4,9
12,25
9.5,20
Program code:
import numpy as np
import matplotlib.pyplot as plt


def loss_error(w, b, data):
    # Mean squared error of the line y = w*x + b over all data points.
    x = data[:, 0]
    y = data[:, 1]
    loss = np.sum((y - w * x - b) ** 2) / data.shape[0]
    return loss


def linear_gradient(w, b, data, lr):
    # One gradient descent step: compute dL/dw and dL/db,
    # then move w and b against the gradient, scaled by lr.
    N = float(len(data))
    x = data[:, 0]
    y = data[:, 1]
    dw = np.sum(-(2 / N) * x * (y - w * x - b))
    db = np.sum(-(2 / N) * (y - w * x - b))
    w = w - (lr * dw)
    b = b - (lr * db)
    return w, b


def optimizer(data, w, b, lr, epoch):
    # Run gradient descent for the given number of epochs,
    # logging the loss every 100 epochs.
    for i in range(epoch):
        w, b = linear_gradient(w, b, data, lr)
        if i % 100 == 0:
            print('epoch {0}:loss={1}'.format(i, loss_error(w, b, data)))
    return w, b


def plot_data(data, w, b):
    # Scatter the raw points and draw the fitted line over them.
    x = data[:, 0]
    y = data[:, 1]
    y_predict = w * x + b
    plt.plot(x, y, 'o')
    plt.plot(x, y_predict, 'k-')
    plt.show()


def linear_regression():
    data = np.loadtxt('lineardata.csv', delimiter=',')
    # data = pd.read_csv('lineardata.csv', encoding='utf8')
    x = data[:, 0]
    y = data[:, 1]
    plt.plot(x, y, 'o')
    # plt.show()
    lr = 0.01     # learning rate
    epoch = 1000  # number of gradient descent iterations
    w = 0.0       # initial slope
    b = 0.0       # initial intercept
    print("initial variables:\n initial_b={0}\n initial_w={1}\n loss={2}\n".format(b, w, loss_error(w, b, data)))
    w, b = optimizer(data, w, b, lr, epoch)
    print('final formula parameters:\n b = {0}\n w={1} \n loss={2}\n'.format(b, w, loss_error(w, b, data)))
    plot_data(data, w, b)


if __name__ == '__main__':
    linear_regression()
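As the opening paragraph notes, the fitted line can then be used to predict new values. A minimal sketch, which could be appended at the end of linear_regression() after optimizer returns (x_new = 4.2 is just a made-up input for illustration):

x_new = 4.2            # hypothetical new input
y_new = w * x_new + b  # predicted value on the fitted line
print('predicted y for x={0}: {1}'.format(x_new, y_new))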
Run it; the program prints:
initial variables:
initial_b=0.0
initial_w=0.0
loss=184.68365853658537
epoch 0:loss=3.265436338536489
epoch 100:loss=1.4187213286545117
epoch 200:loss=1.3652986742281288
epoch 300:loss