# Drop only the rows where period and nuni are both -999; a row is kept as
# soon as either column differs from -999 (element-wise OR of the two tests).
# NOTE(review): the original note read "both conditions satisfied at once";
# if both columns must differ from -999 the filter needs `&` instead - confirm.
data = data[~((data['period'] == -999) & (data['nuni'] == -999))]
筛选满足某个条件的行(例如只保留 nuni 等于 1 的记录)
# Keep only the rows whose nuni equals 1.
bid_nuni = bid_nuni[bid_nuni['nuni'] == 1]
填充缺失值
# Replace missing nuni values with the placeholder -999.
data['nuni'] = data['nuni'].fillna(-999)
# Remove duplicate samples: rows that share the same aid, uid, aid_location
# and request_time count as duplicates, and keep='last' retains the last row
# of each such group.
totalExposureLog = totalExposureLog.drop_duplicates(
    subset=['aid', 'uid', 'aid_location', 'request_time'],
    keep='last',
)
# Drop samples whose pctr lies above the dense region (keep pctr <= 1000).
totalExposureLog = totalExposureLog[totalExposureLog['pctr'].le(1000)]
# Keep quality_ecpm inside the dense region [0, 80000], both ends included;
# this removes negative values as well as the high outliers.
totalExposureLog = totalExposureLog[totalExposureLog['quality_ecpm'].between(0, 80000)]
# Drop samples whose totalEcpm lies above the dense region (keep <= 120000).
totalExposureLog = totalExposureLog[totalExposureLog['totalEcpm'].le(120000)]
# Drop samples whose bid lies above the dense region (keep bid <= 15000).
totalExposureLog = totalExposureLog[totalExposureLog['bid'].le(15000)]
将 list 转换为 DataFrame
# Convert crowd_data (a list, per the accompanying note) into a DataFrame.
crowd_feature = pd.DataFrame(data=crowd_data)
时间特征拆分:从 create_order_time 字符串中提取 hour、day、month、year,并构造 date
def get_preprocessing(df_, base_month=7):
    """Split the ``create_order_time`` column into numeric time features.

    Works on a copy, so the frame passed in is left untouched.  The
    timestamp text is cut at fixed character positions, which assumes values
    shaped like ``'YYYY-MM-DD HH:MM:SS'`` (NOTE(review): confirm the exact
    format against the data source).

    Args:
        df_: DataFrame with a string column ``create_order_time``.
        base_month: Month that ``date`` is counted from.  Defaults to 7, the
            offset that used to be hard-coded.

    Returns:
        A new DataFrame without ``create_order_time`` and with integer
        columns ``hour``, ``day``, ``month``, ``year`` and ``date``, where
        ``date = (month - base_month) * 31 + day``.
    """
    df = df_.copy()
    order_time = df['create_order_time']

    # Character span of each field inside the timestamp string; the insertion
    # order fixes the order of the new columns.
    field_spans = {
        'hour': slice(11, 13),
        'day': slice(8, 10),
        'month': slice(5, 7),
        'year': slice(0, 4),
    }
    for column, span in field_spans.items():
        df[column] = order_time.str[span].astype(int)

    # Running day index that treats every month as 31 days long, so values
    # stay ordered across month boundaries (with gaps after shorter months).
    df['date'] = (df['month'] - base_month) * 31 + df['day']

    del df['create_order_time']
    return df
# Run the same time-feature extraction on the `train` and `test` frames;
# each name is rebound to the new frame returned by get_preprocessing.
train = get_preprocessing(train)
test = get_preprocessing(test)
当某个特征的缺失值比例达到 40% 或以上时,建议直接删去该特征,这样反而可以提高模型效果。
在做一场比赛之前,一般都要先找几个相关的比赛熟悉一下套路
时间序列问题一定会用到滑窗(滑动窗口)。