import matplotlib.pylab as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sys
plt.rcParams['font.sans-serif']=['SimHei'] #so Chinese axis labels render correctly
plt.rcParams['axes.unicode_minus']=False #so the minus sign renders correctly
plt.rcParams['figure.figsize'] = 15, 6
###model3
# coding=utf-8
#statsmodels version should be newer than 0.7
import requests, pandas as pd, numpy as np
from pandas import DataFrame
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
# from xgboost import XGBRegressor
# import xgboost as xgb
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.svm import SVR, LinearSVR
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn import linear_model
from sklearn.metrics import mean_squared_error,mean_absolute_error,make_scorer
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import ensemble
from sklearn import cross_validation  # NOTE(review): removed in scikit-learn 0.20; model_selection (imported below) is its replacement -- confirm installed sklearn version
#from sklearn import grid_search
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import Lasso,LassoCV
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import ShuffleSplit,GridSearchCV,train_test_split
rcParams['figure.figsize'] = 15, 6
def GBDT_train_predict_rolling(dff, roll_begin_index = 101, para_search_step = 20, cv_score = 'average_precision'):
    """Walk-forward (rolling) one-step-ahead GBDT classification.

    Column 0 of ``dff`` is the target y; the remaining columns are the
    features.  The model is grid-search-fitted on the first
    ``roll_begin_index`` rows, then each remaining row is predicted one
    step ahead and appended to the training window.  Every
    ``para_search_step`` steps the hyper-parameter grid search is re-run
    on the grown window.

    Parameters
    ----------
    dff : pandas.DataFrame
        Target in column 0, features in the remaining columns.
    roll_begin_index : int
        Number of initial rows used for the first fit.
    para_search_step : int
        Re-run the grid search every this many rolling steps.
    cv_score : str
        GridSearchCV scoring name
        (see http://scikit-learn.org/stable/modules/model_evaluation.html).

    Returns
    -------
    (df_re, scores_re) : tuple of pandas.DataFrame
        ``df_re`` holds 'observed' and 'predicted' columns on the test
        index (converted to DatetimeIndex); ``scores_re`` holds the CV
        best score in effect at every rolling step.
    """
    train = dff.iloc[:roll_begin_index, ]  # initial training window
    test = dff.iloc[roll_begin_index:, ]
    # NOTE: dropped the unused `cv = ShuffleSplit(...)` local -- the grid
    # search below hard-codes 5-fold CV, exactly as before.
    clf = GridSearchCV(estimator=GradientBoostingClassifier(),
                       param_grid={
                           'learning_rate': [0.1, 1],
                           'n_estimators': [200, 400, 600, 800],
                           'max_depth': [3, 5, 7, 9]},
                       cv=5,
                       scoring=cv_score)
    # BUG FIX: pass y as a 1-D array (ravel) instead of an (n, 1) column
    # DataFrame's .values, which sklearn only accepts with a
    # DataConversionWarning.
    clf.fit(train[train.columns[1:]].values, train[train.columns[0]].values.ravel())
    classifier = clf.best_estimator_
    best_scores = []
    output = []
    change_flag = para_search_step
    for i in range(len(test)):
        change_flag = change_flag - 1
        if change_flag == 0:
            change_flag = para_search_step
            print('re-search best para, please wait ...')
            clf.fit(train[train.columns[1:]].values, train[train.columns[0]].values.ravel())
            classifier = clf.best_estimator_
        # BUG FIX: record the score on EVERY step, not only on re-search
        # steps; otherwise len(best_scores) != len(test) and the
        # `scores_re.index = test.index` assignment below raises
        # "Length mismatch".
        best_scores.append(clf.best_score_)
        row = test.iloc[[i]]  # evaluate the current test row once
        output.append(classifier.predict(row[row.columns[1:]].values)[0])
        train = pd.concat([train, row])  # roll the training window forward
        print('Training Set sizes:', len(train), 'score: ', clf.best_score_)
    pre = pd.DataFrame(output, columns=['predicted'], index=test.index)
    df_re = pd.concat([test[[test.columns[0]]], pre], axis=1)
    df_re = df_re.rename(columns={test.columns[0]: 'observed'})
    df_re.index = pd.to_datetime(df_re.index)
    scores_re = pd.DataFrame(best_scores, columns=[cv_score], index=test.index)
    return df_re, scores_re
###
## Load data.  The 'date' column is set as the index explicitly below, so
## the `encoding`/`index_col` kwargs the original passed to pd.ExcelFile
## (which ExcelFile does not accept in modern pandas) are dropped.
df = pd.ExcelFile(r'D:\swx_ts_prediction\ILI_baidu_week2.xlsx').parse('Sheet1')
df.index = df['date']
df.index = pd.to_datetime(df.index)
# Data preparation: column 1 is the target (ILI index), columns 2+ are features.
dx = df[df.columns[2:]]
dfy = df[[df.columns[1]]]
dx = dx.astype(float)  # BUG FIX: np.float was removed in NumPy 1.24; builtin float is equivalent
## Graded (classification-level) target data
df_fenji = pd.ExcelFile(r'D:\swx_ts_prediction\ILI_baidu_week2_fenji.xlsx').parse('Sheet1')
df_fenji.index = df_fenji['date']
df_fenji.index = pd.to_datetime(df_fenji.index)
dfy_fenji = df_fenji[[df_fenji.columns[1]]]
# Earlier analysis found weather correlates best with ILI at a one-week lag,
# so shift the features by one week.
dfx = dx.shift(1)
dfy1 = dfy.shift(1).dropna()    # 1-week lagged target
dfy2 = dfy.shift(2).dropna()    # 2-week lagged target
dfy27 = dfy.shift(27).dropna()  # 27-week (roughly seasonal) lagged target
dfy1.columns = [u'ILI指数1']
dfy2.columns = [u'ILI指数2']
dfy27.columns = [u'ILI指数27']
xy = dfy.join([dfx, dfy1, dfy2, dfy27]).dropna()
# Feature selection: keep columns whose |Pearson correlation| with y > 0.25.
pcc_tr = xy.corr()
select_index = pcc_tr[abs(pcc_tr[pcc_tr.columns[0]]) > 0.25].index
xy = xy[select_index]
# Join the graded target with the selected features and run the rolling model.
xy_fenji = dfy_fenji.join([xy[xy.columns[1:]]]).dropna()
fenji = DataFrame(xy_fenji[xy_fenji.columns[0]])
df_re, scores_re = GBDT_train_predict_rolling(xy_fenji, roll_begin_index = 101, para_search_step = 20, cv_score = 'average_precision')
import seaborn as sns
import pandas as pd
import numpy as np
import sys
plt.rcParams['font.sans-serif']=['SimHei'] #so Chinese axis labels render correctly
plt.rcParams['axes.unicode_minus']=False #so the minus sign renders correctly
plt.rcParams['figure.figsize'] = 15, 6
###model3
# coding=utf-8
#statsmodels version should be newer than 0.7
import requests, pandas as pd, numpy as np
from pandas import DataFrame
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
# from xgboost import XGBRegressor
# import xgboost as xgb
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.svm import SVR, LinearSVR
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn import linear_model
from sklearn.metrics import mean_squared_error,mean_absolute_error,make_scorer
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import ensemble
from sklearn import cross_validation  # NOTE(review): removed in scikit-learn 0.20; model_selection (imported below) is its replacement -- confirm installed sklearn version
#from sklearn import grid_search
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import Lasso,LassoCV
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import ShuffleSplit,GridSearchCV,train_test_split
rcParams['figure.figsize'] = 15, 6
def GBDT_train_predict_rolling(dff, roll_begin_index = 101, para_search_step = 20, cv_score = 'average_precision'):
    """Walk-forward (rolling) one-step-ahead GBDT classification.

    Column 0 of ``dff`` is the target y; the remaining columns are the
    features.  The model is grid-search-fitted on the first
    ``roll_begin_index`` rows, then each remaining row is predicted one
    step ahead and appended to the training window.  Every
    ``para_search_step`` steps the hyper-parameter grid search is re-run
    on the grown window.

    Parameters
    ----------
    dff : pandas.DataFrame
        Target in column 0, features in the remaining columns.
    roll_begin_index : int
        Number of initial rows used for the first fit.
    para_search_step : int
        Re-run the grid search every this many rolling steps.
    cv_score : str
        GridSearchCV scoring name
        (see http://scikit-learn.org/stable/modules/model_evaluation.html).

    Returns
    -------
    (df_re, scores_re) : tuple of pandas.DataFrame
        ``df_re`` holds 'observed' and 'predicted' columns on the test
        index (converted to DatetimeIndex); ``scores_re`` holds the CV
        best score in effect at every rolling step.
    """
    train = dff.iloc[:roll_begin_index, ]  # initial training window
    test = dff.iloc[roll_begin_index:, ]
    # NOTE: dropped the unused `cv = ShuffleSplit(...)` local -- the grid
    # search below hard-codes 5-fold CV, exactly as before.
    clf = GridSearchCV(estimator=GradientBoostingClassifier(),
                       param_grid={
                           'learning_rate': [0.1, 1],
                           'n_estimators': [200, 400, 600, 800],
                           'max_depth': [3, 5, 7, 9]},
                       cv=5,
                       scoring=cv_score)
    # BUG FIX: pass y as a 1-D array (ravel) instead of an (n, 1) column
    # DataFrame's .values, which sklearn only accepts with a
    # DataConversionWarning.
    clf.fit(train[train.columns[1:]].values, train[train.columns[0]].values.ravel())
    classifier = clf.best_estimator_
    best_scores = []
    output = []
    change_flag = para_search_step
    for i in range(len(test)):
        change_flag = change_flag - 1
        if change_flag == 0:
            change_flag = para_search_step
            print('re-search best para, please wait ...')
            clf.fit(train[train.columns[1:]].values, train[train.columns[0]].values.ravel())
            classifier = clf.best_estimator_
        # BUG FIX: record the score on EVERY step, not only on re-search
        # steps; otherwise len(best_scores) != len(test) and the
        # `scores_re.index = test.index` assignment below raises
        # "Length mismatch".
        best_scores.append(clf.best_score_)
        row = test.iloc[[i]]  # evaluate the current test row once
        output.append(classifier.predict(row[row.columns[1:]].values)[0])
        train = pd.concat([train, row])  # roll the training window forward
        print('Training Set sizes:', len(train), 'score: ', clf.best_score_)
    pre = pd.DataFrame(output, columns=['predicted'], index=test.index)
    df_re = pd.concat([test[[test.columns[0]]], pre], axis=1)
    df_re = df_re.rename(columns={test.columns[0]: 'observed'})
    df_re.index = pd.to_datetime(df_re.index)
    scores_re = pd.DataFrame(best_scores, columns=[cv_score], index=test.index)
    return df_re, scores_re
###
## Load data.  The 'date' column is set as the index explicitly below, so
## the `encoding`/`index_col` kwargs the original passed to pd.ExcelFile
## (which ExcelFile does not accept in modern pandas) are dropped.
df = pd.ExcelFile(r'D:\swx_ts_prediction\ILI_baidu_week2.xlsx').parse('Sheet1')
df.index = df['date']
df.index = pd.to_datetime(df.index)
# Data preparation: column 1 is the target (ILI index), columns 2+ are features.
dx = df[df.columns[2:]]
dfy = df[[df.columns[1]]]
dx = dx.astype(float)  # BUG FIX: np.float was removed in NumPy 1.24; builtin float is equivalent
## Graded (classification-level) target data
df_fenji = pd.ExcelFile(r'D:\swx_ts_prediction\ILI_baidu_week2_fenji.xlsx').parse('Sheet1')
df_fenji.index = df_fenji['date']
df_fenji.index = pd.to_datetime(df_fenji.index)
dfy_fenji = df_fenji[[df_fenji.columns[1]]]
# Earlier analysis found weather correlates best with ILI at a one-week lag,
# so shift the features by one week.
dfx = dx.shift(1)
dfy1 = dfy.shift(1).dropna()    # 1-week lagged target
dfy2 = dfy.shift(2).dropna()    # 2-week lagged target
dfy27 = dfy.shift(27).dropna()  # 27-week (roughly seasonal) lagged target
dfy1.columns = [u'ILI指数1']
dfy2.columns = [u'ILI指数2']
dfy27.columns = [u'ILI指数27']
xy = dfy.join([dfx, dfy1, dfy2, dfy27]).dropna()
# Feature selection: keep columns whose |Pearson correlation| with y > 0.25.
pcc_tr = xy.corr()
select_index = pcc_tr[abs(pcc_tr[pcc_tr.columns[0]]) > 0.25].index
xy = xy[select_index]
# Join the graded target with the selected features and run the rolling model.
xy_fenji = dfy_fenji.join([xy[xy.columns[1:]]]).dropna()
fenji = DataFrame(xy_fenji[xy_fenji.columns[0]])
df_re, scores_re = GBDT_train_predict_rolling(xy_fenji, roll_begin_index = 101, para_search_step = 20, cv_score = 'average_precision')