df1['date1'] = df1['DATE'].astype(str)  # convert to string first -- important, otherwise to_datetime would treat the integer 20210101 as an epoch timestamp
print(df1.tail(1))
df1['date1'] = pd.to_datetime(df1['date1'])
df1['week1'] = df1['date1'].dt.isocalendar().week
df1['year1'] = df1['date1'].dt.isocalendar().year  # without isocalendar() you just get the plain calendar year, which goes wrong around January 1st
With isocalendar() the output looks like this; without it you would get year 2021 paired with week 53, an inconsistent combination, because dt.year and dt.isocalendar().year are fundamentally not the same function:
TSCODE DATE ROWNUM CLOSE date1 week1 year1
0 SZ000001 20210101 10 14.0 2021-01-01 53 2020
1 SZ000001 20210102 9 11.0 2021-01-02 53 2020
2 SZ000001 20210103 8 13.0 2021-01-03 53 2020
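A quick standalone check of the difference (a sketch; the column names here are illustrative):
demo = pd.DataFrame({'d': pd.to_datetime(['2020-12-31', '2021-01-01', '2021-01-04'])})
demo['iso_year'] = demo['d'].dt.isocalendar().year   # 2020, 2020, 2021
demo['iso_week'] = demo['d'].dt.isocalendar().week   # 53, 53, 1
demo['cal_year'] = demo['d'].dt.year                 # 2020, 2021, 2021
print(demo)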
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import math
from math import sqrt
from datetime import datetime
import copy
np.random.seed(2019)
seed = 2019
def rmse(y_pred, y_true):
    return sqrt(mean_squared_error(y_pred, y_true))
sample = pd.read_csv('sample_500k_0719.csv')
# drop purchase_ar column
data = sample.drop('purchase_ar', axis=1)
data.drop('Unnamed: 0', axis=1, inplace=True)
# find time-series characteristics
from scipy import optimize
data.sort_values(by=['becif', 'dt'], inplace=True)
def calc_shifted_ewm(series, alpha, adjust=True):
    # shift by one step so the EWM at time t only uses observations up to t-1 (no leakage)
    return series.shift().ewm(alpha=alpha, adjust=adjust).mean()
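A tiny check that the shift prevents leakage (sketch): the value at position t is built only from observations up to t-1.
s = pd.Series([1.0, 2.0, 4.0])
print(calc_shifted_ewm(s, alpha=0.5).tolist())   # [nan, 1.0, 1.666...]: the last value never sees 4.0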
def find_best_signal(series, adjust=False, eps=10e-5):
    # search for the alpha that minimises the one-step-ahead squared error of the shifted EWM
    def f(alpha):
        # differential_evolution passes a length-1 array, so unpack and clip to (0, 1)
        a = float(np.clip(alpha[0], 0.0, 1.0))
        shifted_ewm = calc_shifted_ewm(series=series, alpha=a, adjust=adjust)
        # nanmean, because the first shifted value is always NaN and np.mean would return NaN
        mse = np.nanmean(np.power(series - shifted_ewm, 2))
        return mse
    res = optimize.differential_evolution(func=f, bounds=[(0 + eps, 1 - eps)])
    return calc_shifted_ewm(series=series, alpha=res.x[0], adjust=adjust)
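Note that find_best_signal is defined but never called below; alpha is simply fixed at 0.96. If you did want a per-customer optimised alpha, a hypothetical usage would look like this (commented out because differential_evolution per group can be slow on 500k rows):
# roll_opt = data.groupby('becif').apply(lambda g: find_best_signal(g['purchase_ar_next']))
# data['optimized_ewm_by_becif'] = roll_opt.values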
# group by 'becif' alone: data is already sorted by ['becif', 'dt'], so within-customer time
# order is preserved; grouping by ['becif', 'dt'] would create one-row groups and shift()
# would return all NaN
roll = data.groupby('becif').apply(lambda g: calc_shifted_ewm(g['purchase_ar_next'], 0.96))
# groups come back sorted by becif with rows in dt order, matching data's row order
data['optimized_ewm_by_becif'] = roll.values
# get dummies
data = pd.get_dummies(data, columns=['month'])
# train/test split on the dt boundary (dt looks like YYYYMM, so 201900 separates 2018 and earlier from 2019)
data['par_next_log1p'] = np.log1p(data['purchase_ar_next'])
train = data[data['dt'] < 201900]
train = train.dropna()
test = data[data['dt'] > 201900]
# drop the raw target as well as its log transform, otherwise purchase_ar_next leaks into the features
X_train = train.drop(['par_next_log1p', 'purchase_ar_next'], axis='columns')
X_test = test.drop(['par_next_log1p', 'purchase_ar_next'], axis='columns')
y_train = train['par_next_log1p']
# check datasets
assert X_train.isnull().sum().sum() == 0
assert y_train.isnull().sum() == 0
assert len(X_train) == len(y_train)
assert X_test.isnull().sum().sum() == 0
import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection
model = lgb.LGBMRegressor(objective='regression',
                          max_depth=6,
                          num_leaves=31,
                          learning_rate=0.01,
                          n_estimators=1000,
                          min_child_samples=80,
                          subsample=0.8,
                          random_state=np.random.randint(int(10e6)))
n_splits = 6
cv = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=42)
val_scores = [0] * n_splits
sub = X_test[['becif', 'dt']].copy()  # .copy() avoids the SettingWithCopyWarning on the next line
sub['par_pred'] = 0.0
feature_importances = pd.DataFrame(index=X_train.columns)
for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
    X_fit = X_train.iloc[fit_idx]
    y_fit = y_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]
    # note: early_stopping_rounds/verbose as fit() arguments require lightgbm < 4.0;
    # on lightgbm >= 4.0 pass callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)] instead
    model.fit(X_fit, y_fit,
              eval_set=[(X_fit, y_fit), (X_val, y_val)],
              eval_names=('fit', 'val'),
              eval_metric='l2',
              early_stopping_rounds=200,
              feature_name=X_fit.columns.tolist(),
              verbose=False)
    val_scores[i] = np.sqrt(model.best_score_['val']['l2'])
    # average the fold predictions instead of summing them
    sub['par_pred'] += model.predict(X_test, num_iteration=model.best_iteration_) / n_splits
    feature_importances[i] = model.feature_importances_
    print('Fold {} RMSLE: {:.5f}'.format(i + 1, val_scores[i]))
val_mean = np.mean(val_scores)
val_std = np.std(val_scores)
print('Local RMSLE: {:.5f} (±{:.5f})'.format(val_mean, val_std))
# par_pred is on the log1p scale, so invert the transform before saving
sub['par_pred'] = np.expm1(sub['par_pred'])
result = pd.merge(sub, sample, on=['becif', 'dt'], how='left')
res = result[['becif', 'dt', 'par_pred', 'purchase_ar', 'purchase_ar_next']]
res.to_csv('result_0720_cross_validated.csv', index=False)
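As a sanity check, the rmse helper defined at the top can score the saved predictions back on the log scale (a sketch; it assumes purchase_ar_next is actually populated for the test period):
eval_df = res.dropna(subset=['par_pred', 'purchase_ar_next'])
print('test RMSLE: {:.5f}'.format(rmse(np.log1p(eval_df['par_pred']), np.log1p(eval_df['purchase_ar_next']))))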