import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import MinMaxScaler
# Load the dataset and parse the timestamp column into datetimes.
df = pd.read_csv('C.test_data201803.csv')
df.time = pd.to_datetime(df.time)

# Use a fixed train/test split instead of a random one: a random split is
# meaningless for a time-series dataset, because the model must be trained on
# historical data and evaluated on future data.
# NOTE(review): this relies on the rows of the CSV already being in
# chronological order -- confirm, or sort by `time` first.

# Model inputs: 'KWH-hour', 'KWH-dow' and the 24 columns 'KWH-0' .. 'KWH-23'.
featureColumns = ['KWH-hour', 'KWH-dow'] + [f'KWH-{lag}' for lag in range(24)]

# Number of trailing rows held out as the test set.
# NOTE(review): the original comment said "train from Feb to Oct, test on Nov
# and Dec", but 744 rows would be 31 days if the data is hourly -- confirm the
# intended hold-out period.
TEST_ROWS = 744

# Train on everything except the last TEST_ROWS rows and test on exactly those
# last rows. The original sliced the test set as df[744:], which overlaps the
# training rows and leaks training data into the out-of-sample evaluation.
dataset_train = df[:-TEST_ROWS]
dataset_test = df[-TEST_ROWS:]

X_train = dataset_train[featureColumns]
X_test = dataset_test[featureColumns]
y_train = dataset_train['KWH']
y_test = dataset_test['KWH']

# Single predefined validation split for the grid search: entries marked -1
# are never used for validation, entries marked 0 form the one validation fold.
# NOTE(review): as written, the first 20% of the training rows are fit-only
# and the last 80% are the validation fold. An 80/20 train/validation split
# may have been intended -- confirm before changing.
val_size = int(X_train.shape[0] * 0.8)
val_fold = [-1] * (X_train.shape[0] - val_size) + [0] * val_size
ps = PredefinedSplit(val_fold)

# Hyper-parameter grid explored by the search.
param_grid = [{
    'n_estimators': np.arange(50, 200, 50),
    'max_depth': np.arange(10, 50, 10),
    'learning_rate': np.arange(0.1, 0.3, 0.05),
    'colsample_bytree': np.arange(0.1, 0.5, 0.1),
    'subsample': np.arange(0.5, 0.9, 0.1),
    'alpha': [1],
}]

# Exhaustive search over param_grid, scored by R^2 on the predefined fold.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror')
xg_reg_grid_search = GridSearchCV(xg_reg, param_grid, cv=ps, scoring='r2')
xg_reg_grid_search.fit(X_train, y_train)
# Notebook output of the fit call above (repr of the fitted search object),
# kept as a comment so that the file remains valid Python:
#
# GridSearchCV(cv=PredefinedSplit(test_fold=array([-1,-1,...,0,0])),
#              estimator=XGBRegressor(base_score=None, booster=None,
#                                     colsample_bylevel=None,
#                                     colsample_bynode=None,
#                                     colsample_bytree=None, gamma=None,
#                                     gpu_id=None, importance_type='gain',
#                                     interaction_constraints=None,
#                                     learning_rate=None, max_delta_step=None,
#                                     max_depth=None, min_child_weight=None,
#                                     missing=nan, mo...
#                                     reg_alpha=None, reg_lambda=None,
#                                     scale_pos_weight=None, subsample=None,
#                                     tree_method=None, validate_parameters=None,
#                                     verbosity=None),
#              param_grid=[{'alpha':[1],'colsample_bytree': array([0.1,0.2,0.3,0.4]),'learning_rate': array([0.1,0.15,0.2,0.25]),'max_depth': array([10,20,30,40]),'n_estimators': array([50,100,150]),'subsample': array([0.5,0.6,0.7,0.8])}],
#              scoring='r2')

# MAPE can be problematic because it may lead to a division by zero, so 1 is
# added to both the true and the predicted values to prevent infinite results.
# The true values are always passed first: MAPE divides by |y_true|, so
# swapping the two arguments changes the score.

# Predictions on the held-out test features. The name y_pred was used by the
# original evaluation line but never assigned in this file.
y_pred = xg_reg_grid_search.predict(X_test)

print('insample MAPE', mean_absolute_percentage_error(y_train + 1, xg_reg_grid_search.predict(X_train) + 1))
print('outofsample MAPE', mean_absolute_percentage_error(y_test + 1, y_pred + 1))

# Output recorded from the original notebook run. The out-of-sample figure
# was produced with the arguments in (y_pred, y_test) order, so a re-run is
# expected to print a different value.
# insample MAPE 0.017108966492028255
# outofsample MAPE 0.1405509639964891