Implementation
Use the weight m and bias b learned from the training samples (train.csv) to compute predicted y values for the test set (test.csv), and save the results (result.csv).
Code
import numpy as np
import pylab
import csv
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Compute the loss
def compute_error(b, m, data):
    '''
    Loss function: error between the actual and predicted values.
    m: weight (slope)
    b: bias (intercept)
    '''
    x = data[:, 0]
    y = data[:, 1]
    # Sum of the squared differences between actual and predicted values
    totalError = (y - m * x - b) ** 2
    totalError = np.sum(totalError, axis=0)
    # Return the mean loss
    return totalError / float(len(data))
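# For reference (added note, not in the original post): the quantity computed
# above is the mean squared error
#   E(m, b) = (1/N) * sum_i (y_i - (m * x_i + b))^2
# which is the loss that the gradient descent below minimizes.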
# Gradient descent
def optimizer(data, starting_b, starting_m, learning_rate, num_iter):
    '''
    Optimizer that runs the gradient-descent iterations.
    starting_b: initial bias
    starting_m: initial weight
    learning_rate: learning rate
    num_iter: number of iterations
    '''
    b = starting_b
    m = starting_m
    # Gradient descent: iterate num_iter times
    for i in range(num_iter):
        # Update b and m with new, more accurate values by performing
        # this gradient step
        b, m = compute_gradient(b, m, data, learning_rate)
        if i % 100 == 0:
            print('iter {0}: error = {1}'.format(i, compute_error(b, m, data)))
    return [b, m]
# One gradient-descent update of m (weight) and b (bias)
def compute_gradient(b_current, m_current, data, learning_rate):
    '''
    b_current: current bias
    m_current: current weight
    learning_rate: learning rate
    '''
    N = float(len(data))
    # Vectorized implementation
    x = data[:, 0]
    y = data[:, 1]
    b_gradient = -(2 / N) * (y - m_current * x - b_current)      # partial derivative w.r.t. b
    b_gradient = np.sum(b_gradient, axis=0)
    m_gradient = -(2 / N) * x * (y - m_current * x - b_current)  # partial derivative w.r.t. m
    m_gradient = np.sum(m_gradient, axis=0)
    # Update b and m using the partial derivatives
    new_b = b_current - (learning_rate * b_gradient)  # update bias
    new_m = m_current - (learning_rate * m_gradient)  # update weight
    return [new_b, new_m]
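# For reference (added note, not in the original post): the gradient step above
# uses the partial derivatives of the mean squared error E(m, b):
#   dE/db = -(2/N) * sum_i (y_i - m * x_i - b)
#   dE/dm = -(2/N) * sum_i x_i * (y_i - m * x_i - b)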
# Plot the fitted linear regression model
def plot_data(data, b, m):
    '''
    Visualize the data and the fitted line.
    '''
    x = data[:, 0]
    y = data[:, 1]
    y_predict = m * x + b
    pylab.plot(x, y, 'o')
    pylab.plot(x, y_predict, 'k-')
    pylab.show()
def Linear_regression():
    # Load the training and test data
    data = np.loadtxt('train.csv', delimiter=',')
    test_data = np.loadtxt('test.csv', delimiter=',')
    # Define the hyperparameters:
    # learning_rate controls the size of each gradient update;
    # num_iter is the number of iterations.
    # Choose suitable parameters
    learning_rate = 0.001
    initial_b = 0.0
    initial_m = 0.0
    num_iter = 1000
    # Train the model
    # Print the initial b, m, and error
    print('initial variables:\n initial_b = {0}\n initial_m = {1}\n error of begin = {2} \n'
          .format(initial_b, initial_m, compute_error(initial_b, initial_m, data)))
    # Optimize b and m: run gradient descent to find the best fit
    [b, m] = optimizer(data, initial_b, initial_m, learning_rate, num_iter)
    # Print the final b, m, and error
    print('final formula parameters:\n num_iter = {0}\n b = {1}\n m = {2}\n error of end = {3} \n'
          .format(num_iter, b, m, compute_error(b, m, data)))
    # Plot the result
    plot_data(data, b, m)
    # Predict y for the test set from its x values (test.csv has a single column,
    # so test_data is a 1-D array)
    y_test_predict = list(test_data * m + b)
    save_to_csv(y_test_predict, 'result.csv')

def save_to_csv(data, outpath):
    file = open(outpath, 'w', newline='')
    csv_write = csv.writer(file)
    csv_write.writerow(['result'])
    for i in range(len(data)):
        csv_write.writerow([data[i]])
    file.close()
    print('Results saved successfully')

if __name__ == '__main__':
    Linear_regression()
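As an optional sanity check (not part of the original post), the gradient-descent result can be compared against NumPy's closed-form least-squares fit; with enough iterations the two should agree closely:

import numpy as np

data = np.loadtxt('train.csv', delimiter=',')
# np.polyfit with degree 1 returns [slope, intercept] of the least-squares line
m_ls, b_ls = np.polyfit(data[:, 0], data[:, 1], 1)
print('closed-form fit: m = {0}, b = {1}'.format(m_ls, b_ls))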
Model Evaluation
This may not be done correctly; it is just a personal attempt. The code builds on the code above.
def Linear_regression():
    # Load the training and test data
    data = np.loadtxt('train.csv', delimiter=',')
    test_data = np.loadtxt('test.csv', delimiter=',')
    # Define the hyperparameters:
    # learning_rate controls the size of each gradient update;
    # num_iter is the number of iterations.
    # Choose suitable parameters
    learning_rate = 0.001
    initial_b = 0.0
    initial_m = 0.0
    num_iter = 1000
    # Train the model
    # Print the initial b, m, and error
    print('initial variables:\n initial_b = {0}\n initial_m = {1}\n error of begin = {2} \n'
          .format(initial_b, initial_m, compute_error(initial_b, initial_m, data)))
    # Optimize b and m: run gradient descent to find the best fit
    [b, m] = optimizer(data, initial_b, initial_m, learning_rate, num_iter)
    # Print the final b, m, and error
    print('final formula parameters:\n num_iter = {0}\n b = {1}\n m = {2}\n error of end = {3} \n'
          .format(num_iter, b, m, compute_error(b, m, data)))
    # Plot the result
    plot_data(data, b, m)
    # Predicted y values for the training samples
    y_predict = list(data[:, 0] * m + b)
    # Predict y for the test set from its x values
    y_test_predict = list(test_data * m + b)
    save_to_csv(y_test_predict, 'result.csv')
    # Judge the model's accuracy by comparing the predicted y values for the
    # training samples against their true y values
    # valid(y_predict, list(data[:, 1]))
    # Estimate the model's accuracy from the size of the error
    valid(y_predict, list(data[:, 1]), compute_error(b, m, data))
'''
# Compute an "accuracy" score from the mean absolute error or mean squared error
def valid(y, y_true):
    # accuracy = mean_squared_error(y_true, y)
    accuracy = mean_absolute_error(y_true, y) * 100
    print('Model accuracy: {0}%'.format(accuracy))
'''
# Compute accuracy from the size of the error
def valid(y, y_true, error):
    print(error)
    n = 0
    length = len(y)
    # Count a prediction as correct when the true value lies within
    # +/- error of the predicted value
    for i, j in zip(y, y_true):
        if i - error <= j <= i + error:
            n += 1
    accuracy = (n / length) * 100
    print('Model accuracy: {0}%'.format(accuracy))

if __name__ == '__main__':
    Linear_regression()
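A more standard way to evaluate a regression model is with metrics such as MAE, MSE, and the R^2 score from sklearn.metrics, which the code already imports in part. A minimal sketch (not part of the original post), assuming m and b come from the gradient-descent fit above:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

data = np.loadtxt('train.csv', delimiter=',')
y_true = data[:, 1]
y_pred = data[:, 0] * m + b  # m and b from the gradient-descent fit above
print('MAE = {0}'.format(mean_absolute_error(y_true, y_pred)))
print('MSE = {0}'.format(mean_squared_error(y_true, y_pred)))
print('R^2 = {0}'.format(r2_score(y_true, y_pred)))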
Data
Save the following in CSV files yourself.
Test set (test.csv)
5.5
8.9
2.23
4.4
22
Training samples (train.csv)
1,3
1.2,3
1.2,4
1.5,4.5
1.6,4.3
6.5,12
3.6,7.1
2.5,9
5.7,14
6,11
9,17
8.9,17
7.1,15
7,14
2.5,4
0.8,2
0.5,2
3.4,7
3.6,9
5.6,12
6.7,15
6.9,15
7.1,14
7.5,17
7.8,16
8.1,15
8.3,15
8.5,15
8.7,16
8.7,17
8.8,18
8.8,20
8,16
9,19
9.2,18
10.1,20
1.1,3.2
1.6,4.2
4,9
12,25
9.5,20