o2o优惠券预测--经验分享(二)

再次感谢大神们的分享
废话不多说了, 直接切入正题
今日份提取商品的特征

def GetMerchantFeature(feature):
    #提取商品的特征
    #对于数据集
    feature3 = feature
    merchant3 = feature3[['merchant_id','coupon_id','distance','date_received','date']]

    t = merchant3[['merchant_id']]
    #删除重复行数据
    t.drop_duplicates(inplace=True)

    #显示卖出的商品
    t1 = merchant3[merchant3.date!='null'][['merchant_id']]
    t1['total_sales'] = 1
    #显示每个商品的销售数量
    t1 = t1.groupby('merchant_id').agg('sum').reset_index()


    #显示使用了优惠券消费的商品,正样本
    t2 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id']]
    t2['sales_use_coupon'] = 1
    t2 = t2.groupby('merchant_id').agg('sum').reset_index()


    #显示了商品的优惠券的总数量
    t3 = merchant3[merchant3.coupon_id != 'null'][['merchant_id']]
    t3 ['total_coupon'] = 1
    t3 = t3.groupby('merchant_id').agg('sum').reset_index()

    #显示商品销量和距离的关系
    t4 = merchant3[(merchant3.date != 'null')&(merchant3.coupon_id != 'null')][['merchant_id','distance']]
    #把数据中的null值全部替换为-1
    t4.replace('null',-1,inplace=True)
    t4.distance = t4.distance.astype('int')
    #再把数据中的-1全部替换为NaN
    t4.replace(-1,np.nan,inplace=True)

    #返回用户离商品的距离最小值
    t5 = t4.groupby('merchant_id').agg('min').reset_index()
    t5.rename(columns={'distance':'merchant_min_distance'},inplace = True)

    #返回用户离商品的距离最大值
    t6 = t4.groupby('merchant_id').agg('max').reset_index()
    t6.rename(columns={'distance':'merchant_max_distance'},inplace = True)
    #print(t6)

    #返回距离的平均值
    t7 = t4.groupby('merchant_id').agg('mean').reset_index()
    t7.rename(columns = {'distance':'merchant_mean_distance'},inplace= True)

    #返回距离的中位值
    t8 = t4.groupby('merchant_id').agg('median').reset_index()
    t8.rename(columns={'distance':'merchant_median_distance'},inplace = True)

    merchant3_feature = pd.merge(t,t1,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t2,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t3,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t5,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t6,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t7,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t8,on='merchant_id',how='left')

    #将数据中的NaN用0来替换
    merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(np.nan,0)
    #即优惠券的使用率
    merchant3_feature['merchant_coupon_transfer_rate'] = merchant3_feature.sales_use_coupon.astype('float')/merchant3_feature.total_coupon
    #即卖出商品中使用优惠券的占比
    merchant3_feature['coupon_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_sales
    #将数据中的NaN用0来替换
    merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(np.nan,0)
    return merchant3_feature

用户的相关特征

def get_user_date_datereceived_gap(s):
    s = s.split(':')
    return (date(int(s[0][0:4]), int(s[0][4:6]), int(s[0][6:8])) - date(int(s[1][0:4]), int(s[1][4:6]), int(s[1][6:8]))).days
def GetUserRelateInfo(feature):
    
    feature3 = feature
    user3 = feature3[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]

    t = user3[['user_id']]
    #去掉数据中重复的用户Id
    t.drop_duplicates(inplace=True)

    #用户购买商品的种类数
    t1 = user3[user3.date!='null'][['user_id','merchant_id']]
    #同样去掉重复用的用户id和商品id
    t1.drop_duplicates(inplace=True)
    t1.merchant_id = 1
    t1 = t1.groupby('user_id').agg('sum').reset_index()
    t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)


    #使用了优惠券购买商品的用户id和距离
    t2 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id','distance']]
    #将null值替换为-1
    t2.replace('null',-1,inplace=True)
    t2.distance = t2.distance.astype('int')#转换数据类型为int
    t2.replace(-1,np.nan,inplace=True)

    #得到使用优惠券购买商品的用户离店铺的最短距离
    t3 = t2.groupby('user_id').agg('min').reset_index()
    t3.rename(columns={'distance':'user_min_distance'},inplace=True)

    #得到最大距离
    t4 = t2.groupby('user_id').agg('max').reset_index()
    t4.rename(columns={'distance':'user_max_distance'},inplace=True)

    #得到平均距离
    t5 = t2.groupby('user_id').agg('mean').reset_index()
    t5.rename(columns={'distance':'user_mean_distance'},inplace=True)

    #得到中间距离
    t6 = t2.groupby('user_id').agg('median').reset_index()
    t6.rename(columns={'distance':'user_median_distance'},inplace=True)

    #每个用户使用优惠券购买的物品数量
    t7 = user3[(user3.date != 'null')&(user3.coupon_id != 'null')][['user_id']]
    t7['buy_use_coupon'] = 1
    t7 = t7.groupby('user_id').agg('sum').reset_index()

    #购买物品的总数
    t8 = user3[user3.date != 'null'][['user_id']]
    t8['buy_total'] = 1
    t8 = t8.groupby('user_id').agg('sum').reset_index()

    #接受的优惠券的总数
    t9 = user3[user3.coupon_id != 'null'][['user_id']]
    t9['coupon_received'] = 1
    t9 = t9.groupby('user_id').agg('sum').reset_index()

    #接受到优惠券的日期和使用之间的间隔
    t10 = user3[(user3.date_received != 'null')&(user3.date != 'null')][['user_id','date_received','date']]
    t10['user_date_datereceived_gap'] = t10.date + ':'+ t10.date_received
    t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
    t10 = t10[['user_id','user_date_datereceived_gap']]

    #将用户优惠券使用时间的间隔取平均数
    t11 = t10.groupby('user_id').agg('mean').reset_index()
    t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)

    #间隔天数的最小值
    t12 = t10.groupby('user_id').agg('min').reset_index()
    t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)

    #间隔天数的最大值
    t13 = t10.groupby('user_id').agg('max').reset_index()
    t13.rename(columns={'user_date_datareceived_gap':'max_user_date_datereceived_gap'},inplace=True)

    #将提取的特征合并
    user3_feature = pd.merge(t,t1,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t3,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t4,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t5,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t6,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t7,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t8,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t9,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t11,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t12,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t13,on='user_id',how='left')

    user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan,0)
    user3_feature.buy_user_coupon = user3_feature.buy_use_coupon.replace(np.nan,0)
    user3_feature['buy_use_coupon_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.buy_total.astype('float')#使用优惠券购买的商品占总数的多少
    user3_feature['user_coupon_transfer_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.coupon_received.astype('float')
    user3_feature.buy_total = user3_feature.buy_total.replace(np.nan,0)#将数据中的NaN值转为0
    user3_feature.coupon_received = user3_feature.coupon_received.replace(np.nan,0)
    return user3_feature
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值