https://github.com/maplezzz/NTU_ML2017_Hung-yi-Lee_HW/tree/master/HW1
题目要求:
预测丰原地区一段时间内的PM2.5值,
- train.csv:每个月前20天每个小时的气象资料(每小时有18种测资)共12个月
- test.csv: 排除train.csv剩余的资料 取连续9小时的资料当作feature 预测第10小时的PM2.5值 总共240笔
- ans.csv test的answer
线性预测方法:
- Gradient Descent:
- Adagrad:Divide the learn rate of each parameters by the root mean square of its previous derivatives.
- SGD(Stochastic Gradient Descent):Update for each example.
解题过程:
一、数据处理:
1)train.csv:
导入到data列表(training data)中,纵轴为18种物质,横轴为时间:24小时*20天*12个月=5760小时。
#read data,18种物质*每天每个时刻
n_row = 0
text = open('data/train.csv', 'r', encoding='big5')
#GB码与BIG5是中国人常用的两种编码集。GB码为大陆使用,BIG5为香港与台湾使用。
row = csv.reader(text, delimiter=',') #将每行读取的值作为列表返回
for r in row: #读取每一行
if n_row != 0: #从原表格中第一行开始读取
for i in range(3,27): #每一个数据项
if r[i] != "NR":
data[(n_row-1)%18].append(float(r[i])) #向data中添加数据
else:
data[(n_row-1)%18].append(float(0))
n_row = n_row + 1
text.close
将training data分为x和y,x是前九个小时的数据,y是第10个小时的PM2.5含量。注意到时间轴是每个月的前二十天,所以不能连续地分组,即有些时刻不能用于预测。比如x的时间是[470-478],预测第479小时的PM2.5含量,那么下一组x应该是[480-488],而不是[471-479]。用双重for循环:
for i in range(12):
for j in range(471):
...
将training data分割为x,y:
#parse data to trainX and trainY,分列
x = []
y = []
for i in range(12): #12个月
for j in range(471): #20天*24个小时
x.append([])
for t in range(18): #18种物质
for s in range(9): #每天的前9个小时用于训练
x[471*i + j].append(data[t][480*i + j + s]) #480
y.append(data[9][480*i+j+9]) #第十个小时的PM2.5值
trainX = np.array(x) #每一行有9*18个数,每9个代表9天的某一种污染物
trainY = np.array(y)
2)test.csv
与training data中的x格式类似,导入到test_x列表中
#parse test data
test_x = [] #横轴每行9*18个数字,表示18种物质每小时的含量
n_row = 0
text = open('data/test.csv' ,"r")
row = csv.reader(text , delimiter= ",")
for r in row:
if n_row %18 == 0: #纵轴为天数
test_x.append([])
for i in range(2,11):
test_x[n_row//18].append(float(r[i]) )
else :
for i in range(2,11):
if r[i] !="NR": #标准化
test_x[n_row//18].append(float(r[i]))
else:
test_x[n_row//18].append(0)
n_row = n_row+1
text.close()
test_x = np.array(test_x)
3)ans.csv
第9天PM2.5的真实值
#parse anser
ans_y = []
n_row = 0
text = open('data/ans.csv', "r")
row = csv.reader(text, delimiter=",")
'''
自定义分隔符默认的情况下, 读和写使用逗号做分隔符(delimiter),
用双引号作为引用符(quotechar),当遇到特殊情况是,可以根据需要手动指定字符
'''
for r in row:
ans_y.append(r[1])
ans_y = ans_y[1:] #截取,去掉第0个元素('value')
ans_y = np.array(list(map(int, ans_y))) #将元素转换为整型
二、训练模型
- GD
def GD(X, Y, w, eta, iteration, lambdaL2): #Gradient Descent
#Loss for summation over all training examples
list_cost = []
for i in range(iteration):
hypo = np.dot(X,w) #预测值
loss = hypo - Y #误差
cost = np.sum(loss**2)/len(X) #Loss Function
list_cost.append(cost) #显示Loss Function的变化
grad = np.dot(X.T, loss)/len(X) + lambdaL2 * w #求梯度
w = w - eta*grad
return w, list_cost
- Adagrad
def ada(X, Y, w, eta, iteration, lambdaL2): #adagrad
s_grad = np.zeros(len(X[0]))
#全零矩阵,X为test,len(X[0])=9*18
list_cost = []
for i in range(iteration): #iteration迭代次数
hypo = np.dot(X,w) #向量乘法
loss = hypo - Y #差值
cost = np.sum(loss**2)/len(X) #np.sum求和
list_cost.append(cost) #cost变化趋势减小收敛
grad = np.dot(X.T, loss)/len(X) + lambdaL2*w
#.T转置,grad是梯度,梯度越大,步伐越大
s_grad += grad**2 #对之前的梯度求和
ada = np.sqrt(s_grad) #约束,使步伐越来越小
w = w - eta*grad/ada #eta学习率
return w, list_cost
- SGD
def SGD(X, Y, w, eta, iteration, lambdaL2): #Stochastic Gradient Descent #Loss for only one example list_cost = [] for i in range(iteration): hypo = np.dot(X,w) loss = hypo - Y cost = np.sum(loss**2)/len(X) list_cost.append(cost) rand = np.random.randint(0, len(X)) #生成一个随机整数 grad = X[rand]*loss[rand]/len(X) + lambdaL2*w #只对某个随机训练样本求梯度 w = w - eta*grad return w, list_cost
三、最小二乘法
w_cf = inv(trainX.T.dot(trainX)).dot(trainX.T).dot(trainY) #inv矩阵求逆,最小二乘法求w
完整代码:
import csv, os
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import inv
import random
import math
import sys
def ada(X, Y, w, eta, iteration, lambdaL2):
s_grad = np.zeros(len(X[0]))
list_cost = []
for i in range(iteration):
hypo = np.dot(X,w)
loss = hypo - Y
cost = np.sum(loss**2)/len(X)
list_cost.append(cost)
grad = np.dot(X.T, loss)/len(X) + lambdaL2*w
s_grad += grad**2
ada = np.sqrt(s_grad)
w = w - eta*grad/ada
return w, list_cost
def SGD(X, Y, w, eta, iteration, lambdaL2): #Stochastic Gradient Descent
#Loss for only one example
list_cost = []
for i in range(iteration):
hypo = np.dot(X,w)
loss = hypo - Y
cost = np.sum(loss**2)/len(X)
list_cost.append(cost)
rand = np.random.randint(0, len(X)) #生成一个随机整数
grad = X[rand]*loss[rand]/len(X) + lambdaL2*w #只对某个随机训练样本求梯度
w = w - eta*grad
return w, list_cost
def GD(X, Y, w, eta, iteration, lambdaL2): #Gradient Descent
#Loss for summation over all training examples
list_cost = []
for i in range(iteration):
hypo = np.dot(X,w) #预测值
loss = hypo - Y #误差
cost = np.sum(loss**2)/len(X) #Loss Function
list_cost.append(cost) #显示Loss Function的变化
grad = np.dot(X.T, loss)/len(X) + lambdaL2 * w #求梯度
w = w - eta*grad
return w, list_cost
# 每一个维度储存一种污染物的咨询
data = []
for i in range(18):
data.append([])
#read data
n_row = 0
text = open('data/train.csv', 'r', encoding='big5')
row = csv.reader(text, delimiter=',')
for r in row:
if n_row != 0:
for i in range(3,27):
if r[i] != "NR":
data[(n_row-1)%18].append(float(r[i]))
else:
data[(n_row-1)%18].append(float(0))
n_row = n_row + 1
text.close
#parse data to trainX and trainY
x = []
y = []
for i in range(12):
for j in range(471):
x.append([])
for t in range(18):
for s in range(9):
x[471*i + j].append(data[t][480*i+j+s])
y.append(data[9][480*i+j+9])
trainX = np.array(x) #每一行有9*18个数 每9个代表9天的某一种污染物
trainY = np.array(y)
#parse test data
test_x = []
n_row = 0
text = open('data/test.csv' ,"r")
row = csv.reader(text , delimiter= ",")
for r in row:
if n_row %18 == 0:
test_x.append([])
for i in range(2,11):
test_x[n_row//18].append(float(r[i]) )
else :
for i in range(2,11):
if r[i] !="NR":
test_x[n_row//18].append(float(r[i]))
else:
test_x[n_row//18].append(0)
n_row = n_row+1
text.close()
test_x = np.array(test_x)
#parse anser
ans_y = []
n_row = 0
text = open('data/ans.csv', "r")
row = csv.reader(text, delimiter=",")
for r in row:
ans_y.append(r[1])
ans_y = ans_y[1:]
ans_y = np.array(list(map(int, ans_y)))
# add bias
test_x = np.concatenate((np.ones((test_x.shape[0],1)),test_x), axis=1)
trainX = np.concatenate((np.ones((trainX.shape[0],1)), trainX), axis=1)
#train data
w = np.zeros(len(trainX[0]))
w_sgd, cost_list_sgd = SGD(trainX, trainY, w, eta=0.0001, iteration=20000, lambdaL2=0)
# w_sgd50, cost_list_sgd50 = SGD(trainX, trainY, w, eta=0.0001, iteration=20000, lambdaL2=50)
w_ada, cost_list_ada = ada(trainX, trainY, w, eta=1, iteration=20000, lambdaL2=0)
# w_gd, cost_list_gd = SGD(trainX, trainY, w, eta=0.0001, iteration=20000, lambdaL2=0)
#close form
w_cf = inv(trainX.T.dot(trainX)).dot(trainX.T).dot(trainY) #inv矩阵求逆,最小二乘法求w
cost_wcf = np.sum((trainX.dot(w_cf)-trainY)**2) / len(trainX)
hori = [cost_wcf for i in range(20000-3)]
#output testdata
y_ada = np.dot(test_x, w_ada)
y_sgd = np.dot(test_x, w_sgd)
y_cf = np.dot(test_x, w_cf)
#csv format
ans = []
for i in range(len(test_x)):
ans.append(["id_"+str(i)])
a = np.dot(w_ada,test_x[i])
ans[i].append(a)
#将训练结果写入文件
filename = "result/predict.csv"
text = open(filename, "w+")
s = csv.writer(text,delimiter=',',lineterminator='\n')
s.writerow(["id","value"])
for i in range(len(ans)):
s.writerow(ans[i])
text.close()
#plot training data with different gradiant method
plt.plot(np.arange(len(cost_list_ada[3:])), cost_list_ada[3:], 'b', label="ada")#蓝色实线表示Ada训练过程
plt.plot(np.arange(len(cost_list_sgd[3:])), cost_list_sgd[3:], 'g', label='sgd')#绿色实线表示Sgd训练过程
# plt.plot(np.arange(len(cost_list_sgd50[3:])), cost_list_sgd50[3:], 'c', label='sgd50')
# plt.plot(np.arange(len(cost_list_gd[3:])), cost_list_gd[3:], 'r', label='gd')
plt.plot(np.arange(len(cost_list_ada[3:])), hori, 'y--', label='close-form')
plt.title('Train Process')
plt.xlabel('Iteration')
plt.ylabel('Loss Function(Quadratic)')
plt.legend()
plt.savefig(os.path.join(os.path.dirname(__file__), "figures/TrainProcess"))
plt.show()
plt.clf()
#plot fianl answer
plt.figure()
plt.subplot(131)
plt.title('CloseForm')
plt.xlabel('dataset')
plt.ylabel('pm2.5')
plt.plot(np.arange((len(ans_y))), ans_y, 'r,')
plt.plot(np.arange(240), y_cf, 'b')
plt.subplot(132)
plt.title('ada')
plt.xlabel('dataset')
plt.ylabel('pm2.5')
plt.plot(np.arange((len(ans_y))), ans_y, 'r,')
plt.plot(np.arange(240), y_ada, 'g')
plt.subplot(133)
plt.title('sgd')
plt.xlabel('dataset')
plt.ylabel('pm2.5')
plt.plot(np.arange((len(ans_y))), ans_y, 'r,')
plt.plot(np.arange(240), y_sgd, 'b')
plt.tight_layout()
plt.savefig(os.path.join(os.path.dirname(__file__), "figures/Compare"))
plt.show()
PS:有两张图片需要保存:figure1是训练过程,figure2是训练结果。程序运行完以后只看到了一张图片,因为savefig方法保存图片并不会重置画布,所以导致图片的相互叠加。所以保存图片后,使用plt.clf()重置画布。