时间处理:用 pd.to_datetime 转换时,字符串(如 '20210101')会按其字面日期原样解析;而数字则会被当作从 1970 年起算的偏移量。

# Convert DATE to text before parsing (this matters: the accompanying note says
# numeric input is counted from 1970 instead of being read as a date string).
df1['date1'] = df1['DATE'].astype(str)
print(df1.tail(1))
df1['date1'] = pd.to_datetime(df1['date1'])
# Read week and year from a single isocalendar() result so both follow the ISO
# calendar. Pairing the plain calendar year with the ISO week is error-prone
# around January 1st (e.g. 2021-01-01 belongs to ISO week 53 of ISO year 2020).
iso_parts = df1['date1'].dt.isocalendar()
df1['week1'] = iso_parts.week
df1['year1'] = iso_parts.year

 

使用 isocalendar() 后的结果如下;否则会得到 2021 年、第 53 周这样的组合,因为年份和周数本质上不是由同一个函数(同一套规则)计算出来的。

    TSCODE      DATE  ROWNUM  CLOSE      date1  week1  year1
0   SZ000001  20210101      10   14.0 2021-01-01     53   2020
1   SZ000001  20210102       9   11.0 2021-01-02     53   2020
2   SZ000001  20210103       8   13.0 2021-01-03     53   2020

 

 

 

 

 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import math
from math import sqrt
from datetime import datetime
import copy

# Seed NumPy's global random generator so runs are repeatable, and keep the
# same value available under the name `seed`.
seed = 2019
np.random.seed(seed)

def rmse(y_pred,y_true):
    """Return the square root of the mean squared error between the two inputs.

    The inputs are forwarded to sklearn's ``mean_squared_error`` in the order
    (y_pred, y_true), exactly as given.
    """
    mse = mean_squared_error(y_pred, y_true)
    return sqrt(mse)

# Load the sampled dataset; `sample` itself is kept unmodified.
sample = pd.read_csv('sample_500k_0719.csv')

# Build the working frame without the 'purchase_ar' column and without
# 'Unnamed: 0' (presumably a leftover index column from an earlier CSV export).
data = sample.drop(columns=['purchase_ar', 'Unnamed: 0'])

# find time-series characteristics
from scipy import optimize

# Order rows by 'becif', then by 'dt' within each 'becif'.
data = data.sort_values(by=['becif', 'dt'])

def calc_shifted_ewm(series, alpha, adjust=True):
    """Exponentially weighted mean of `series` lagged by one position.

    The series is shifted down by one row first, so the value at each position
    is built only from earlier rows. `alpha` and `adjust` are passed straight
    to ``Series.ewm``.
    """
    lagged = series.shift()
    window = lagged.ewm(alpha=alpha, adjust=adjust)
    return window.mean()

def find_best_signal(series, adjust=False, eps=10e-5):
    """Return the shifted EWM of `series` for the best-fitting smoothing factor.

    The smoothing factor alpha is searched with
    ``scipy.optimize.differential_evolution`` inside the open interval
    (eps, 1 - eps); the objective is the mean squared difference between
    `series` and its one-step-lagged EWM (see ``calc_shifted_ewm``).

    Parameters
    ----------
    series : pandas.Series
        Values to smooth.
    adjust : bool, default False
        Forwarded to ``calc_shifted_ewm`` / ``Series.ewm``.
    eps : float, default 10e-5
        Margin that keeps the searched alpha strictly away from 0 and 1.

    Returns
    -------
    pandas.Series
        ``calc_shifted_ewm(series, best_alpha, adjust)``.
    """

    def f(alpha):
        # The optimizer hands over the candidate as a 1-element array. Unwrap
        # it to a plain float so pandas is not asked to convert an ndarray to
        # a scalar (deprecated since NumPy 1.25).
        alpha = float(np.ravel(alpha)[0])
        alpha = min(max(alpha, 0), 1)
        shifted_ewm = calc_shifted_ewm(series=series, alpha=alpha, adjust=adjust)
        # Mean squared error between the series and its lagged EWM.
        mse = np.mean(np.power(series - shifted_ewm, 2))
        return mse

    res = optimize.differential_evolution(func=f, bounds=[(0 + eps, 1 - eps)])

    best_alpha = float(res['x'][0])
    return calc_shifted_ewm(series=series, alpha=best_alpha, adjust=adjust)

# Lagged EWM of 'purchase_ar_next', computed separately for every 'becif'.
# Grouping by 'becif' alone is required: grouping by ('becif', 'dt') leaves
# single-row groups whenever that pair is unique, and the one-step shift then
# turns every value into NaN. `transform` returns a result aligned to `data`'s
# index, so the assignment does not depend on positional ordering.
# The EWM follows the current row order within each 'becif', so rows are
# expected to already be ordered by 'dt'.
roll = data.groupby('becif')['purchase_ar_next'].transform(
    lambda s: calc_shifted_ewm(s, 0.96)
)
data['optimized_ewm_by_becif'] = roll


# One-hot encode the 'month' column.
data = pd.get_dummies(data, columns=['month'])

# Training target: log1p of 'purchase_ar_next'.
data['par_next_log1p'] = np.log1p(data['purchase_ar_next'])

# Split on 'dt': rows below 201900 train the model, rows above are scored.
# A row with dt == 201900 would fall in neither split.
train = data[data['dt'] < 201900]
train = train.dropna()

test = data[data['dt'] > 201900]

# Remove BOTH target representations from the features. Leaving the raw
# 'purchase_ar_next' in X would leak the target, because 'par_next_log1p'
# is a direct function of it.
target_cols = ['par_next_log1p', 'purchase_ar_next']
X_train = train.drop(columns=target_cols)
X_test = test.drop(columns=target_cols)
y_train = train['par_next_log1p']

# Sanity checks on the prepared datasets.
assert X_train.isnull().sum().sum() == 0
assert y_train.isnull().sum() == 0
assert len(X_train) == len(y_train)
assert X_test.isnull().sum().sum() == 0

import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection


# Gradient-boosted regressor trained on the log1p target.
model = lgb.LGBMRegressor(objective='regression',
                           max_depth=6,
                           num_leaves=31,
                           learning_rate=0.01,
                           n_estimators=1000,
                           min_child_samples=80,
                           subsample=0.8,
                           random_state=np.random.randint(10e6))

n_splits = 6
cv = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=42)

val_scores = [0] * n_splits

# Take an explicit copy: adding a column to a slice of X_test is chained
# assignment (SettingWithCopyWarning) and is not guaranteed to stick.
sub = X_test[['becif', 'dt']].copy()
sub['par_pred'] = 0.0


# One column of feature importances per fold.
feature_importances = pd.DataFrame(index=X_train.columns)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
    X_fit = X_train.iloc[fit_idx]
    y_fit = y_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    # NOTE(review): `early_stopping_rounds` and `verbose` as fit() keywords
    # were removed in LightGBM 4.0 in favour of callbacks; confirm the
    # installed version before upgrading.
    model.fit(X_fit,y_fit,
              eval_set=[(X_fit, y_fit), (X_val, y_val)],
              eval_names=('fit', 'val'),
              eval_metric='l2',
              early_stopping_rounds=200,
              feature_name=X_fit.columns.tolist(),
              verbose=False
             )

    # l2 on the log1p target is the squared log error, so its root is RMSLE.
    val_scores[i] = np.sqrt(model.best_score_['val']['l2'])
    # Average the fold models: each fold contributes 1/n_splits of the final
    # prediction (previously the fold predictions were summed, not averaged).
    fold_pred = model.predict(X_test, num_iteration=model.best_iteration_)
    sub['par_pred'] += fold_pred / n_splits
    feature_importances[i] = model.feature_importances_

    print('Fold {} RMSLE: {:.5f}'.format(i+1, val_scores[i]))

# The model predicts log1p(amount); map the averaged prediction back to the
# original scale so 'par_pred' is comparable with the raw purchase columns.
sub['par_pred'] = np.expm1(sub['par_pred'])

val_mean = np.mean(val_scores)
val_std = np.std(val_scores)

print('Local RMSLE: {:.5f} (±{:.5f})'.format(val_mean, val_std))


# Left-join the predictions with the originally loaded sample on
# ('becif', 'dt') so every prediction row is kept.
result = sub.merge(sample, how='left', on=['becif', 'dt'])

# Keep the keys, the prediction and the two purchase columns, then export.
output_columns = ['becif', 'dt', 'par_pred', 'purchase_ar', 'purchase_ar_next']
res = result[output_columns]

res.to_csv('result_0720_cross_validated.csv')

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值