Writing this down so I can remember it later.
Descriptive statistics cheat sheet: first identify the type of each variable. For a single categorical variable, compute summary statistics and frequencies with value_counts and plot the counts as a bar chart. For two categorical variables, build a normalized (standardized) stacked bar chart from crosstab, and use the chi-square test as the statistical test. For one categorical and one continuous variable, use groupby and a per-group boxplot, with a two-sample t-test (or analysis of variance when the categorical variable has more than two levels). For two continuous variables, use a pivot table and a scatter plot, with correlation analysis as the test. Keep in mind that correlation analysis and regression analysis are different: correlation analysis checks whether two variables are related at all, while regression analysis, once a relationship has been confirmed, estimates what functional form that relationship takes. A sketch of all four cases follows below.
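To make the cheat sheet concrete, here is a minimal sketch of the four cases on a small synthetic DataFrame. The column names gender, churn, income and spend and the random numbers are purely illustrative assumptions, not part of the RFM data analyzed below.

#%% Descriptive-statistics sketch (hypothetical data, for illustration only)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'gender': rng.choice(['F', 'M'], size=200),     # categorical
    'churn':  rng.choice(['yes', 'no'], size=200),  # categorical
    'income': rng.normal(5000, 1500, size=200),     # continuous
})
df['spend'] = 0.3 * df['income'] + rng.normal(0, 300, size=200)  # continuous, related to income

# 1) one categorical variable: frequencies + bar chart of the counts
print(df['gender'].value_counts())
df['gender'].value_counts().plot(kind='bar')

# 2) two categorical variables: normalized stacked bar chart + chi-square test
ct = pd.crosstab(df['gender'], df['churn'])
ct.div(ct.sum(axis=1), axis=0).plot(kind='bar', stacked=True)
chi2, p, dof, expected = stats.chi2_contingency(ct)

# 3) one categorical + one continuous: grouped boxplot + two-sample t-test
df.boxplot(column='income', by='gender')
t, p = stats.ttest_ind(df.loc[df['gender'] == 'F', 'income'],
                       df.loc[df['gender'] == 'M', 'income'])
# with more than two groups, use one-way ANOVA instead: stats.f_oneway(g1, g2, g3, ...)

# 4) two continuous variables: scatter plot + correlation analysis
df.plot(kind='scatter', x='income', y='spend')
r, p = stats.pearsonr(df['income'], df['spend'])
plt.show()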
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
os.chdir(r'F:\Download\中文名')
# The path contains Chinese characters and cannot be read directly, so it is done in two steps: chdir into the folder first, then read the file
trad_flow=pd.read_csv("RFM_TRAD_FLOW.csv",encoding='gbk')
trad_flow.head(10)
F=trad_flow.groupby(['cumid','type'])[['transID']].count()  # number of transactions per customer and transaction type
F.head()
#%% Pivot table: two categorical variables (cumid, type) and one continuous variable (the count)
F_trans=pd.pivot_table(F,index='cumid',columns='type',values='transID')
F_trans.head()
#%% Handle missing values
F_trans['Special_offer']=F_trans['Special_offer'].fillna(0)
F_trans["interest"]=F_trans['Special_offer']/(F_trans['Special_offer']+F_trans['Normal'])
F_trans.head()
#%%
M=trad_flow.groupby(['cumid','type'])[['amount']].sum()  # total amount per customer and transaction type
M.head()
#%%
M_trans=pd.pivot_table(M,index='cumid',columns='type',values='amount')
M_trans.head()
#%%
M_trans['Special_offer']=M_trans['Special_offer'].fillna(0)
M_trans['returned_goods']=M_trans['returned_goods'].fillna(0)
M_trans["value"]=M_trans['Normal']+M_trans['Special_offer']+M_trans['returned_goods']
M_trans.head()
#%% Normalize the time strings into numeric timestamps
from datetime import datetime
import time
def to_time(t):
    # parse e.g. '14JUN09:17:58:34' and convert it to seconds since the epoch (local time)
    out_t=time.mktime(time.strptime(t,'%d%b%y:%H:%M:%S'))
    return out_t
a="14JUN09:17:58:34"
print(to_time(a))
#%%
trad_flow["time_new"]=trad_flow.time.apply(to_time)
trad_flow.head()
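As an alternative sketch, pandas can parse this format directly with to_datetime (strptime-style %b matching is case-insensitive, so the uppercase 'JUN' should be accepted). Note that the result is a naive datetime, while time.mktime above interprets the string as local time, so the absolute numbers differ by the UTC offset; the ordering, which is all the recency step below needs, is the same.

#%%
time_dt = pd.to_datetime(trad_flow['time'], format='%d%b%y:%H:%M:%S')
# seconds since the epoch, treating the naive timestamps as UTC
time_sec = (time_dt - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)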
#%% Side note on apply: it also works element-wise on a column, e.g. df['A'] = df['A'].apply(str.upper)
R=trad_flow.groupby(['cumid'])[['time_new']].max()  # most recent transaction timestamp per customer
R.head()
#%% sklearn preprocessing: equal-frequency (equal-depth) binning
from sklearn import preprocessing
# qcut with retbins=True returns both the bin assignments and the array of bin edges;
# [1][1] picks the middle edge (the median), used below as the binarization threshold
threshold=pd.qcut(F_trans['interest'],2,retbins=True)[1][1]
#%%
binarizer=preprocessing.Binarizer(threshold=threshold)
interest_q=pd.DataFrame(binarizer.transform(F_trans['interest'].values.reshape(-1,1)))
interest_q.index=F_trans.index
interest_q.columns=["interest"]
interest_q
#%%
threshold=pd.qcut(M_trans['value'],2,retbins=True)[1][1]
binarizer = preprocessing.Binarizer(threshold=threshold)
value_q = pd.DataFrame(binarizer.transform(M_trans['value'].values.reshape(-1,1)))
value_q.index=M_trans.index
value_q.columns=["value"]
value_q
#%%
threshold = pd.qcut(R["time_new"], 2, retbins=True)[1][1]
binarizer = preprocessing.Binarizer(threshold=threshold)
time_new_q = pd.DataFrame(binarizer.transform(R["time_new"].values.reshape(-1,1)))
time_new_q.index = R.index
time_new_q.columns = ["time"]
time_new_q
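The qcut-plus-Binarizer pattern above can also be collapsed into a single call, since pd.qcut can assign the two bin labels directly. A minimal equivalent sketch: qcut bins are right-closed, so values equal to the median fall into bin 0, matching Binarizer's strictly-greater-than-threshold rule; it assumes the bin edges are unique, which holds here because the qcut calls above already succeed.

#%%
# same 0/1 split as above, but returned as categorical labels rather than floats
interest_q2 = pd.qcut(F_trans['interest'], 2, labels=[0, 1]).to_frame('interest')
value_q2 = pd.qcut(M_trans['value'], 2, labels=[0, 1]).to_frame('value')
time_q2 = pd.qcut(R['time_new'], 2, labels=[0, 1]).to_frame('time')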
#%% Assign segment labels
analysis=pd.concat([interest_q,value_q,time_new_q],axis=1)
analysis.head()
label = {(0,0,0):'no interest - low value - dormant',
         (1,0,0):'interested - low value - dormant',
         (1,0,1):'interested - low value - active',
         (0,0,1):'no interest - low value - active',
         (0,1,0):'no interest - high value - dormant',
         (1,1,0):'interested - high value - dormant',
         (1,1,1):'interested - high value - active',
         (0,1,1):'no interest - high value - active'
         }
analysis['label'] = analysis[['interest','value','time']].apply(lambda x: label[(x['interest'],x['value'],x['time'])],axis=1)
#analysis = analysis[['interest','value','time']]
analysis.head()
#%%
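As a quick check on the resulting segmentation, the number of customers in each label can be tabulated:

analysis['label'].value_counts()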