xgboost 序列数据实战_xgboost长序列数据-CSDN博客

本文链接：https://blog.csdn.net/fang156239305/article/details/121148790

本文详细介绍了如何利用Python的XGBoost库处理序列数据，通过实例展示了在后端开发中如何应用XGBoost进行高效预测和模型训练，帮助提升数据处理能力。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import MinMaxScaler
# Load the raw readings and convert the 'time' column to pandas datetimes so
# that the .dt accessors can be used on it afterwards.
df = pd.read_csv('C.test_data201803.csv')
df['time'] = pd.to_datetime(df['time'])

在这里插入图片描述

# Histogram of the raw KWH readings.
df['KWH'].hist()

# For each calendar key derived from the timestamp, add it as a column and
# plot the total KWH per key value.
df['date'] = df['time'].dt.date
kwh_per_date = df.groupby(['date']).agg({'KWH': 'sum'})
plt.plot(kwh_per_date)

df['hour'] = df['time'].dt.hour
kwh_per_hour = df.groupby(['hour']).agg({'KWH': 'sum'})
plt.plot(kwh_per_hour)

df['dow'] = df['time'].dt.dayofweek
kwh_per_dow = df.groupby(['dow']).agg({'KWH': 'sum'})
plt.plot(kwh_per_dow)

# Dates whose total consumption is exactly zero (shown as the cell output).
tempdf = df.groupby(['date']).agg({'KWH': 'sum'})
tempdf[tempdf['KWH'] == 0]

# Feature 'KWH-hour': for every row, the mean of the 4 previous readings that
# were taken at the same hour of day.
hour_frames = []
for hour in df['hour'].unique():
    same_hour = df.loc[df['hour'] == hour]
    # shift(1) keeps the row's own KWH (the value being predicted) out of its
    # feature; without it the rolling window would end on the current reading
    # and leak the target into the inputs.
    temp = same_hour['KWH'].shift(1).rolling(4).mean().to_frame('KWH-hour')
    temp['date'] = same_hour['date']
    temp['hour'] = hour
    hour_frames.append(temp)
# Concatenate once instead of growing a DataFrame inside the loop.
hourRollingMeandf = pd.concat(hour_frames, axis=0)
# NOTE(review): this merge is one-to-one only if each (date, hour) pair occurs
# once in df — confirm against the dataset.
df = df.merge(hourRollingMeandf, on=['date', 'hour'], how='outer')

# Feature 'KWH-dow': for every row, the mean of the 28 previous readings that
# fall on the same day of the week.
dow_frames = []
for dow in df['dow'].unique():
    same_dow = df.loc[df['dow'] == dow]
    # shift(1) keeps the row's own KWH (the value being predicted) out of its
    # feature; without it the rolling window would end on the current reading
    # and leak the target into the inputs.
    temp = same_dow['KWH'].shift(1).rolling(28).mean().to_frame('KWH-dow')
    temp['date'] = same_dow['date']
    temp['dow'] = dow
    dow_frames.append(temp)
# Concatenate once instead of growing a DataFrame inside the loop.
DOWRollingMeandf = pd.concat(dow_frames, axis=0)
# Attach the feature by row index rather than by merging on ['date', 'dow'].
# Every rolling value still carries the index label of the row it was computed
# for, so an index merge adds exactly one value per row. The (date, dow) pair
# repeats for every reading taken on the same date, so merging on it pairs
# each row of a date with every other row of that date and multiplies rows.
df = df.merge(DOWRollingMeandf[['KWH-dow']],
              left_index=True, right_index=True, how='left')

# Add the KWH series shifted down by 0..23 rows as columns 'KWH-0' .. 'KWH-23'.
# Note that 'KWH-0' is the unshifted series, i.e. identical to 'KWH'.
for lag in range(24):
    df[f'KWH-{lag}'] = df['KWH'].shift(periods=lag)
# Discard every row that has a missing value in any column.
df = df.dropna()
df.head()

# Use a fixed train/test split instead of a random one: a random split is
# meaningless for a time-series dataset, because the model has to be trained
# on historical rows and must work on later ones.
#
# 'KWH-0' is deliberately not a feature: it is KWH shifted by zero rows, i.e.
# the prediction target itself, so including it leaks the answer to the model.
featureColumns = ['KWH-hour', 'KWH-dow'] + [f'KWH-{lag}' for lag in range(1, 24)]

# Hold out the final 744 rows as the test set and train on everything before
# them, so the two sets never overlap.
# (744 = 31 * 24 — presumably one 31-day month of hourly rows; confirm
# against the dataset.)
test_rows = 744
dataset_train = df.iloc[:-test_rows]
dataset_test = df.iloc[-test_rows:]

X_train = dataset_train[featureColumns]
X_test = dataset_test[featureColumns]
y_train = dataset_train['KWH']
y_test = dataset_test['KWH']

# Single predefined validation split for the hyper-parameter search.
# PredefinedSplit semantics: rows marked -1 are never put into a validation
# fold (they are always used for fitting); rows marked 0 form validation
# fold 0.
# The first 80% of the training rows are used for fitting and the remaining
# 20% for validation, so the validation rows come after the fitting rows.
n_train_rows = X_train.shape[0]
fit_size = int(n_train_rows * 0.8)
val_size = n_train_rows - fit_size
val_fold = [-1] * fit_size + [0] * val_size
ps = PredefinedSplit(val_fold)

# Hyper-parameter grid: 3 * 4 * 4 * 4 * 4 * 1 = 768 candidates, each evaluated
# with the predefined split `ps` and ranked by R^2.
# The values are spelled out instead of being generated with np.arange: with
# a fractional step np.arange accumulates rounding error (it yields e.g.
# 0.30000000000000004) and the number of returned elements is not guaranteed.
param_grid = [{
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30, 40],
    'learning_rate': [0.1, 0.15, 0.2, 0.25],
    'colsample_bytree': [0.1, 0.2, 0.3, 0.4],
    'subsample': [0.5, 0.6, 0.7, 0.8],
    'alpha': [1],
}]
xg_reg = xgb.XGBRegressor(objective='reg:squarederror')
xg_reg_grid_search = GridSearchCV(xg_reg, param_grid, cv=ps, scoring='r2')
xg_reg_grid_search.fit(X_train, y_train)


GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, mo...
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             param_grid=[{'alpha': [1],
                          'colsample_bytree': array([0.1, 0.2, 0.3, 0.4]),
                          'learning_rate': array([0.1 , 0.15, 0.2 , 0.25]),
                          'max_depth': array([10, 20, 30, 40]),
                          'n_estimators': array([ 50, 100, 150]),
                          'subsample': array([0.5, 0.6, 0.7, 0.8])}],
             scoring='r2')


# MAPE can be problematic because it divides by the true value, which may be 0.
# Add 1 to both the true and the predicted values to prevent infinite results.
# mean_absolute_percentage_error expects (y_true, y_pred) in that order.
y_train_pred = xg_reg_grid_search.predict(X_train)
y_pred = xg_reg_grid_search.predict(X_test)
print('insample MAPE', mean_absolute_percentage_error(y_train + 1, y_train_pred + 1))
print('outofsample MAPE', mean_absolute_percentage_error(y_test + 1, y_pred + 1))


insample MAPE 0.017108966492028255
outofsample MAPE 0.1405509639964891