【机器学习】李宏毅HW1：预测PM2.5

最新推荐文章于 2023-03-26 12:51:24 发布

小河梦

最新推荐文章于 2023-03-26 12:51:24 发布

阅读量2.3k

点赞数 7

分类专栏：机器学习文章标签：机器学习 python

本文链接：https://blog.youkuaiyun.com/weixin_42182906/article/details/94994702

版权

机器学习专栏收录该内容

3 篇文章

订阅专栏

https://github.com/maplezzz/NTU_ML2017_Hung-yi-Lee_HW/tree/master/HW1

题目要求：

预测丰原地区一段时间内的PM2.5值，

train.csv:每个月前20天每个小时的气象资料（每小时有18种测资）共12个月
test.csv: 排除train.csv剩余的资料取连续9小时的资料当作feature 预测第10小时的PM2.5值总共240笔
ans.csv test的answer

线性预测方法：

Gradient Descent:

$L=\sum_{n}^{ }(\hat{y}^{n}-(b+\sum w_{i}x_{i}^{n}))^{2}$

$w^{t+1}\leftarrow w^{t}-\eta g^{t}$

Adagrad:Divide the learn rate of each parameters by the root mean square of its previous derivatives.

$w^{t+1}\leftarrow w^{t}-\frac{\eta ^{t} }{\sqrt{\sum_{0}^{t}(g^{i})^{2}} }g^{t}$

SGD(Stochastic Gradient Descent):Update for each example.

$L^{n} =(\hat{y}^{n}-(b+\sum w_{i}x_{i}^{n}))^{2}$

解题过程：

一、数据处理：

1)train.csv:

导入到data列表（training data）中，纵轴为18种物质，横轴为时间：24小时*20天*12个月=5760小时。

#read data,18种物质*每天每个时刻
n_row = 0
text = open('data/train.csv', 'r', encoding='big5')
#GB码与BIG5是中国人常用的两种编码集。GB码为大陆使用，BIG5为香港与台湾使用。
row = csv.reader(text, delimiter=',')   #将每行读取的值作为列表返回
for r in row:                           #读取每一行
    if n_row != 0:                      #从原表格中第一行开始读取
        for i in range(3,27):           #每一个数据项
            if r[i] != "NR":
                data[(n_row-1)%18].append(float(r[i]))      #向data中添加数据
            else:
                data[(n_row-1)%18].append(float(0))
    n_row = n_row + 1
text.close

将training data分为x和y，x是前九个小时的数据，y是第10个小时的PM2.5含量。注意到时间轴是每个月的前二十天，所以不能连续地分组，即有些时刻不能用于预测。比如x的时间是[470-478]，预测第479小时的PM2.5含量，那么下一组x应该是[480-488]，而不是[471-479]。用双重for循环：

for i in range(12):
    for j in range(471):
        ...

将training data分割为x，y：

#parse data to trainX and trainY，分列
x = []
y = []
for i in range(12):                     #12个月
    for j in range(471):                #20天*24个小时
        x.append([])
        for t in range(18):             #18种物质
            for s in range(9):          #每天的前9个小时用于训练
                x[471*i + j].append(data[t][480*i + j + s])         #480
        y.append(data[9][480*i+j+9])    #第十个小时的PM2.5值
trainX = np.array(x)                    #每一行有9*18个数，每9个代表9天的某一种污染物
trainY = np.array(y)

2)test.csv

与training data中的x格式类似，导入到test_x列表中

#parse test data
test_x = []                             #横轴每行9*18个数字，表示18种物质每小时的含量
n_row = 0
text = open('data/test.csv' ,"r")
row = csv.reader(text , delimiter= ",")

for r in row:
    if n_row %18 == 0:                  #纵轴为天数
        test_x.append([])
        for i in range(2,11):
            test_x[n_row//18].append(float(r[i]) )
    else :
        for i in range(2,11):
            if r[i] !="NR":             #标准化
                test_x[n_row//18].append(float(r[i]))
            else:
                test_x[n_row//18].append(0)
    n_row = n_row+1
text.close()
test_x = np.array(test_x)

3）ans.csv

第9天PM2.5的真实值

#parse anser
ans_y = []
n_row = 0
text = open('data/ans.csv', "r")
row = csv.reader(text, delimiter=",")
'''
自定义分隔符默认的情况下, 读和写使用逗号做分隔符(delimiter)，
用双引号作为引用符(quotechar)，当遇到特殊情况是，可以根据需要手动指定字符
'''
for r in row:
    ans_y.append(r[1])

ans_y = ans_y[1:]                        #截取，去掉第0个元素('value')
ans_y = np.array(list(map(int, ans_y)))  #将元素转换为整型

二、训练模型

def GD(X, Y, w, eta, iteration, lambdaL2):              #Gradient Descent
    #Loss for summation over all training examples
    list_cost = []
    for i in range(iteration):
        hypo = np.dot(X,w)                              #预测值
        loss = hypo - Y                                 #误差
        cost = np.sum(loss**2)/len(X)                   #Loss Function
        list_cost.append(cost)                          #显示Loss Function的变化

        grad = np.dot(X.T, loss)/len(X) + lambdaL2 * w  #求梯度
        w = w - eta*grad
    return w, list_cost

Adagrad

def ada(X, Y, w, eta, iteration, lambdaL2):         #adagrad
    s_grad = np.zeros(len(X[0]))
    #全零矩阵，X为test，len(X[0])=9*18
    list_cost = []
    for i in range(iteration):                      #iteration迭代次数
        hypo = np.dot(X,w)                          #向量乘法
        loss = hypo - Y                             #差值
        cost = np.sum(loss**2)/len(X)               #np.sum求和
        list_cost.append(cost)                      #cost变化趋势减小收敛

        grad = np.dot(X.T, loss)/len(X) + lambdaL2*w
        #.T转置，grad是梯度，梯度越大，步伐越大
        s_grad += grad**2                           #对之前的梯度求和
        ada = np.sqrt(s_grad)                       #约束，使步伐越来越小
        w = w - eta*grad/ada                        #eta学习率
    return w, list_cost

SGD

def SGD(X, Y, w, eta, iteration, lambdaL2):             #Stochastic Gradient Descent
    #Loss for only one example
    list_cost = []
    for i in range(iteration):
        hypo = np.dot(X,w)
        loss = hypo - Y
        cost = np.sum(loss**2)/len(X)
        list_cost.append(cost)

        rand = np.random.randint(0, len(X))             #生成一个随机整数
        grad = X[rand]*loss[rand]/len(X) + lambdaL2*w   #只对某个随机训练样本求梯度
        w = w - eta*grad
    return w, list_cost

三、最小二乘法

w_cf = inv(trainX.T.dot(trainX)).dot(trainX.T).dot(trainY)          #inv矩阵求逆，最小二乘法求w

完整代码：

import csv, os
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import inv
import random
import math
import sys


def ada(X, Y, w, eta, iteration, lambdaL2):
    s_grad = np.zeros(len(X[0]))
    list_cost = []
    for i in range(iteration):
        hypo = np.dot(X,w)
        loss = hypo - Y
        cost = np.sum(loss**2)/len(X)
        list_cost.append(cost)

        grad = np.dot(X.T, loss)/len(X) + lambdaL2*w
        s_grad += grad**2
        ada = np.sqrt(s_grad)
        w = w - eta*grad/ada
    return w, list_cost


def SGD(X, Y, w, eta, iteration, lambdaL2):             #Stochastic Gradient Descent
    #Loss for only one example
    list_cost = []
    for i in range(iteration):
        hypo = np.dot(X,w)
        loss = hypo - Y
        cost = np.sum(loss**2)/len(X)
        list_cost.append(cost)

        rand = np.random.randint(0, len(X))             #生成一个随机整数
        grad = X[rand]*loss[rand]/len(X) + lambdaL2*w   #只对某个随机训练样本求梯度
        w = w - eta*grad
    return w, list_cost

def GD(X, Y, w, eta, iteration, lambdaL2):              #Gradient Descent
    #Loss for summation over all training examples
    list_cost = []
    for i in range(iteration):
        hypo = np.dot(X,w)                              #预测值
        loss = hypo - Y                                 #误差
        cost = np.sum(loss**2)/len(X)                   #Loss Function
        list_cost.append(cost)                          #显示Loss Function的变化

        grad = np.dot(X.T, loss)/len(X) + lambdaL2 * w  #求梯度
        w = w - eta*grad
    return w, list_cost



# 每一个维度储存一种污染物的咨询
data = []
for i in range(18):
    data.append([])


#read data
n_row = 0
text = open('data/train.csv', 'r', encoding='big5')
row = csv.reader(text, delimiter=',')
for r in row:
    if n_row != 0:
        for i in range(3,27):
            if r[i] != "NR":
                data[(n_row-1)%18].append(float(r[i]))
            else:
                data[(n_row-1)%18].append(float(0))
    n_row = n_row + 1
text.close


#parse data to trainX and trainY
x = []
y = []
for i in range(12):
    for j in range(471):
        x.append([])
        for t in range(18):
            for s in range(9):
                x[471*i + j].append(data[t][480*i+j+s])
        y.append(data[9][480*i+j+9])
trainX = np.array(x) #每一行有9*18个数 每9个代表9天的某一种污染物
trainY = np.array(y)

#parse test data
test_x = []
n_row = 0
text = open('data/test.csv' ,"r")
row = csv.reader(text , delimiter= ",")

for r in row:
    if n_row %18 == 0:
        test_x.append([])
        for i in range(2,11):
            test_x[n_row//18].append(float(r[i]) )
    else :
        for i in range(2,11):
            if r[i] !="NR":
                test_x[n_row//18].append(float(r[i]))
            else:
                test_x[n_row//18].append(0)
    n_row = n_row+1
text.close()
test_x = np.array(test_x)

#parse anser
ans_y = []
n_row = 0
text = open('data/ans.csv', "r")
row = csv.reader(text, delimiter=",")

for r in row:
    ans_y.append(r[1])

ans_y = ans_y[1:]
ans_y = np.array(list(map(int, ans_y)))


# add bias
test_x = np.concatenate((np.ones((test_x.shape[0],1)),test_x), axis=1)
trainX = np.concatenate((np.ones((trainX.shape[0],1)), trainX), axis=1)


#train data
w = np.zeros(len(trainX[0]))
w_sgd, cost_list_sgd = SGD(trainX, trainY, w, eta=0.0001, iteration=20000, lambdaL2=0)
# w_sgd50, cost_list_sgd50 = SGD(trainX, trainY, w, eta=0.0001, iteration=20000, lambdaL2=50)
w_ada, cost_list_ada = ada(trainX, trainY, w, eta=1, iteration=20000, lambdaL2=0)
# w_gd, cost_list_gd = SGD(trainX, trainY, w, eta=0.0001, iteration=20000, lambdaL2=0)

#close form
w_cf = inv(trainX.T.dot(trainX)).dot(trainX.T).dot(trainY)          #inv矩阵求逆，最小二乘法求w
cost_wcf = np.sum((trainX.dot(w_cf)-trainY)**2) / len(trainX)       
hori = [cost_wcf for i in range(20000-3)]



#output testdata
y_ada = np.dot(test_x, w_ada)
y_sgd = np.dot(test_x, w_sgd)
y_cf = np.dot(test_x, w_cf)

#csv format
ans = []
for i in range(len(test_x)):
    ans.append(["id_"+str(i)])
    a = np.dot(w_ada,test_x[i])
    ans[i].append(a)

#将训练结果写入文件
filename = "result/predict.csv"
text = open(filename, "w+")
s = csv.writer(text,delimiter=',',lineterminator='\n')
s.writerow(["id","value"])
for i in range(len(ans)):
    s.writerow(ans[i])
text.close()


#plot training data with different gradiant method
plt.plot(np.arange(len(cost_list_ada[3:])), cost_list_ada[3:], 'b', label="ada")#蓝色实线表示Ada训练过程
plt.plot(np.arange(len(cost_list_sgd[3:])), cost_list_sgd[3:], 'g', label='sgd')#绿色实线表示Sgd训练过程
# plt.plot(np.arange(len(cost_list_sgd50[3:])), cost_list_sgd50[3:], 'c', label='sgd50')
# plt.plot(np.arange(len(cost_list_gd[3:])), cost_list_gd[3:], 'r', label='gd')
plt.plot(np.arange(len(cost_list_ada[3:])), hori, 'y--', label='close-form')
plt.title('Train Process')
plt.xlabel('Iteration')
plt.ylabel('Loss Function(Quadratic)')
plt.legend()
plt.savefig(os.path.join(os.path.dirname(__file__), "figures/TrainProcess"))
plt.show()
plt.clf()

#plot fianl answer
plt.figure()
plt.subplot(131)
plt.title('CloseForm')
plt.xlabel('dataset')
plt.ylabel('pm2.5')
plt.plot(np.arange((len(ans_y))), ans_y, 'r,')
plt.plot(np.arange(240), y_cf, 'b')

plt.subplot(132)
plt.title('ada')
plt.xlabel('dataset')
plt.ylabel('pm2.5')
plt.plot(np.arange((len(ans_y))), ans_y, 'r,')
plt.plot(np.arange(240), y_ada, 'g')
plt.subplot(133)
plt.title('sgd')
plt.xlabel('dataset')
plt.ylabel('pm2.5')
plt.plot(np.arange((len(ans_y))), ans_y, 'r,')
plt.plot(np.arange(240), y_sgd, 'b')
plt.tight_layout()

plt.savefig(os.path.join(os.path.dirname(__file__), "figures/Compare"))
plt.show()

PS：有两张图片需要保存：figure1是训练过程，figure2是训练结果。程序运行完以后只看到了一张图片，因为savefig方法保存图片并不会重置画布，所以导致图片的相互叠加。所以保存图片后，使用plt.clf()重置画布。