GBDT滚动预测

该博客介绍了如何使用GBDT进行滚动预测,通过导入相关库并设置参数,使用ShuffleSplit进行交叉验证,利用GridSearchCV寻找最佳参数,并在数据集上训练和更新模型,以实现对ILI指数的预测。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

import matplotlib.pylab as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sys


# Matplotlib global config for this script.
plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so Chinese axis labels render correctly
plt.rcParams['axes.unicode_minus']=False # render the minus sign correctly when a CJK font is active
plt.rcParams['figure.figsize'] = 15, 6


###model3
# coding=utf-8
# statsmodels version should be 0.7 or higher
import requests, pandas as pd, numpy as np
from pandas import DataFrame
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
# from xgboost import XGBRegressor
# import xgboost as xgb
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.svm import SVR, LinearSVR
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn import linear_model
from sklearn.metrics import mean_squared_error,mean_absolute_error,make_scorer
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import ensemble
from sklearn import cross_validation
#from sklearn import grid_search
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import Lasso,LassoCV
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import ShuffleSplit,GridSearchCV,train_test_split
rcParams['figure.figsize'] = 15, 6





def GBDT_train_predict_rolling(dff, roll_begin_index = 101, para_search_step = 20, cv_score = 'average_precision'):
    """Rolling one-step-ahead prediction with a GBDT classifier.

    The first ``roll_begin_index`` rows of ``dff`` form the initial training
    set; each remaining row is predicted one at a time and then appended to
    the training window.  Hyper-parameters are re-searched with GridSearchCV
    every ``para_search_step`` steps.

    Parameters
    ----------
    dff : pandas.DataFrame
        First column is the target y; the remaining columns are features
        x1, x2, x3, ...
    roll_begin_index : int
        Number of leading rows used as the initial training set.
    para_search_step : int
        Re-run the grid search after this many predicted rows.
    cv_score : str
        Scoring name passed to GridSearchCV, see
        http://scikit-learn.org/stable/modules/model_evaluation.html

    Returns
    -------
    df_re : pandas.DataFrame
        'observed' and 'predicted' columns, indexed like the test rows.
    scores_re : pandas.DataFrame
        Best cross-validation score at each rolling step.
    """
    train = dff.iloc[:roll_begin_index,]  # initial training window
    test = dff.iloc[roll_begin_index:,]
    cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)

    clf = GridSearchCV(estimator=GradientBoostingClassifier(),
                       param_grid={
                                   'learning_rate': [0.1, 1],
                                   'n_estimators': [200, 400, 600, 800],
                                   'max_depth': [3, 5, 7, 9]},
                       # BUG FIX: the ShuffleSplit above was created but never
                       # used — the original hard-coded cv=5 instead.
                       cv=cv,
                       scoring=cv_score)
    # BUG FIX: pass y as a 1-D array; the original fed a (n, 1) column vector
    # (DataFrame(...).values), which triggers a DataConversionWarning and is
    # rejected by newer scikit-learn versions.
    clf.fit(train[train.columns[1:]].values, train[train.columns[0]].values.ravel())
    classifier = clf.best_estimator_
    best_scores = []
    output = []
    change_flag = para_search_step
    for i in range(len(test)):
        change_flag = change_flag - 1
        if change_flag == 0:
            change_flag = para_search_step
            print('re-search best para, please wait ...')
            clf.fit(train[train.columns[1:]].values, train[train.columns[0]].values.ravel())
            classifier = clf.best_estimator_
        best_scores.append(clf.best_score_)
        # BUG FIX: predict() returns a length-1 array; store the scalar so the
        # 'predicted' column holds plain values instead of ndarray objects.
        output.append(classifier.predict(test.iloc[[i], 1:].values)[0])
        train = pd.concat([train, test.iloc[[i]]])  # grow the training window
        print('Training Set sizes:', len(train), 'score: ', clf.best_score_)
    pre = pd.DataFrame(output, columns=['predicted'])
    pre.index = test.index
    df_re = pd.concat([test[[test.columns[0]]], pre], axis=1)
    df_re = df_re.rename(columns={test.columns[0]: 'observed'})
    df_re.index = pd.to_datetime(df_re.index)

    scores_re = pd.DataFrame(best_scores, columns=[cv_score])
    scores_re.index = test.index
    return df_re, scores_re




###
## Load the weekly ILI + Baidu index data.
# BUG FIX: the original called pd.ExcelFile(path, encoding='gbk', index_col='date');
# ExcelFile accepts neither keyword in modern pandas (read_excel is the supported
# API), and the 'date' index is set explicitly below anyway.
df = pd.read_excel(r'D:\swx_ts_prediction\ILI_baidu_week2.xlsx', sheet_name='Sheet1')


df.index = df['date']
df.index = pd.to_datetime(df.index)
# Data preparation: column 1 is the target (ILI index), columns 2+ are features.
dx = df[df.columns[2:]]
dfy = df[[df.columns[1]]]
# BUG FIX: np.float was deprecated and removed in NumPy 1.24; the builtin
# float is the documented equivalent.
dx = dx.astype(float)




## Graded (categorical) target data used for classification.
df_fenji = pd.read_excel(r'D:\swx_ts_prediction\ILI_baidu_week2_fenji.xlsx', sheet_name='Sheet1')


df_fenji.index = df_fenji['date']
df_fenji.index = pd.to_datetime(df_fenji.index)
dfy_fenji = df_fenji[[df_fenji.columns[1]]]




# Earlier analysis showed the weather features correlate best when lagged by
# one week, so shift the feature matrix by 1.  Also build lagged copies of the
# target (t-1, t-2, t-27) as autoregressive features.
dfx = dx.shift(1)
dfy1 = dfy.shift(1).dropna()
dfy2 = dfy.shift(2).dropna()
dfy27 = dfy.shift(27).dropna()


dfy1.columns = [u'ILI指数1']
dfy2.columns = [u'ILI指数2']
dfy27.columns = [u'ILI指数27']
xy = dfy.join([dfx, dfy1, dfy2, dfy27]).dropna()


# Feature selection: keep only columns whose Pearson correlation with the
# target (first column) exceeds 0.25 in absolute value.
pcc_tr = xy.corr()
select_index = pcc_tr[abs(pcc_tr[pcc_tr.columns[0]]) > 0.25].index
xy = xy[select_index]


# Attach the graded target to the selected features and drop incomplete rows.
xy_fenji = dfy_fenji.join([xy[xy.columns[1:]]]).dropna()
fenji = DataFrame(xy_fenji[xy_fenji.columns[0]])




df_re, scores_re = GBDT_train_predict_rolling(xy_fenji, roll_begin_index=101, para_search_step=20, cv_score='average_precision')
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值