一、数据探索分析
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
# Data exploration: for every column report the number of missing values
# together with the maximum and minimum.
path='C:\\Users\\11728\\Desktop\\文件\\python\\《Python数据分析与挖掘实战》-数据\\chapter7\\demo\\data\\air_data.csv'
air_data = pd.read_csv(path, encoding='utf-8')
# describe() gives the non-null count per column; transposing puts one
# source column on each row of the summary.
explore = air_data.describe(percentiles=[], include='all').T
# Missing values = total rows minus non-null rows.
explore['null'] = len(air_data) - explore['count']
explore = explore[['null', 'max', 'min']]
# Use a flat list of labels: a nested list ([[...]]) would turn the columns
# into a MultiIndex instead of three plain column names.
explore.columns = [u'空值数', u'最大值', u'最小值']
print(explore)
#结果
空值数 最大值 最小值
MEMBER_NO 0 62988 1
FFP_DATE 0 NaN NaN
FIRST_FLIGHT_DATE 0 NaN NaN
GENDER 3 NaN NaN
FFP_TIER 0 6 4
WORK_CITY 2269 NaN NaN
WORK_PROVINCE 3248 NaN NaN
WORK_COUNTRY 26 NaN NaN
AGE 420 110 6
LOAD_TIME 0 NaN NaN
FLIGHT_COUNT 0 213 2
BP_SUM 0 505308 0
EP_SUM_YR_1 0 0 0
EP_SUM_YR_2 0 74460 0
SUM_YR_1 551 239560 0
SUM_YR_2 138 234188 0
SEG_KM_SUM 0 580717 368
WEIGHTED_SEG_KM 0 558440 0
LAST_FLIGHT_DATE 0 NaN NaN
AVG_FLIGHT_COUNT 0 26.625 0.25
AVG_BP_SUM 0 63163.5 0
BEGIN_TO_FIRST 0 729 0
LAST_TO_END 0 731 1
AVG_INTERVAL 0 728 0
MAX_INTERVAL 0 728 0
ADD_POINTS_SUM_YR_1 0 600000 0
ADD_POINTS_SUM_YR_2 0 728282 0
EXCHANGE_COUNT 0 46 0
avg_discount 0 1.5 0
P1Y_Flight_Count 0 118 0
L1Y_Flight_Count 0 111 0
P1Y_BP_SUM 0 246197 0
L1Y_BP_SUM 0 259111 0
EP_SUM 0 74460 0
ADD_Point_SUM 0 984938 0
Eli_Add_Point_Sum 0 984938 0
L1Y_ELi_Add_Points 0 728282 0
Points_Sum 0 985572 0
L1Y_Points_Sum 0 728282 0
Ration_L1Y_Flight_Count 0 1 0
Ration_P1Y_Flight_Count 0 1 0
Ration_P1Y_BPS 0 0.999989 0
Ration_L1Y_BPS 0 0.999993 0
Point_NotFlight 0 140 0
二、数据预处理
1.数据清洗
# Data cleaning: reload the raw table and filter out unusable fare records.
path='C:\\Users\\11728\\Desktop\\文件\\python\\《Python数据分析与挖掘实战》-数据\\chapter7\\demo\\data\\air_data.csv'
data = pd.read_csv(path, encoding='utf-8')

# Keep only rows where both yearly fare totals are present.
fares_present = data[['SUM_YR_1', 'SUM_YR_2']].notnull().all(axis=1)
data = data[fares_present]

# A row survives when either yearly fare is non-zero ...
condition1 = data['SUM_YR_1'].ne(0)
condition2 = data['SUM_YR_2'].ne(0)
# ... or when total flown kilometres and average discount are both zero.
condition3 = data['SEG_KM_SUM'].eq(0) & data['avg_discount'].eq(0)
data = data[condition1 | condition2 | condition3]
print(data)
2.数据规约
# Data reduction: keep only these six columns, in this order.
kept_columns = [
    'LOAD_TIME',
    'FFP_DATE',
    'LAST_TO_END',
    'FLIGHT_COUNT',
    'SEG_KM_SUM',
    'avg_discount',
]
data = data[kept_columns]
3.数据变换
# Data transformation: derive the membership length L, rename the columns to
# the R/F/M/C/L indicator names, then z-score standardise every column.

# Parse both date columns in one vectorised pass instead of calling
# strptime twice per row through DataFrame.apply.
load_time = pd.to_datetime(data['LOAD_TIME'], format="%Y/%m/%d")
ffp_date = pd.to_datetime(data['FFP_DATE'], format="%Y/%m/%d")

# L = whole days from FFP_DATE to LOAD_TIME (length of membership).
# assign() returns a new frame, so no column is written into a filtered
# slice of the original table.
data = data.assign(L=(load_time - ffp_date).dt.days)
data = data.drop(['LOAD_TIME', 'FFP_DATE'], axis=1)  # raw dates are no longer needed

# Positional rename: expects the remaining columns in the order
# LAST_TO_END, FLIGHT_COUNT, SEG_KM_SUM, avg_discount, L.
data.columns = ['R', 'F', 'M', 'C', 'L']

# Standardise: subtract the column mean and divide by the column standard deviation.
data_zs = (data - data.mean(axis=0)) / data.std(axis=0)
data_zs.columns = ['Z' + i for i in data.columns]
print(data_zs)
#结果
ZR ZF ZM ZC ZL
0 -0.944948 14.034016 26.761154 1.295540 1.435707
1 -0.911894 9.073213 13.126864 2.868176 1.307152
2 -0.889859 8.718869 12.653481 2.880950 1.328381
3 -0.416098 0.781585 12.540622 1.994714 0.658476
4 -0.922912 9.923636 13.898736 1.344335 0.386032
三、模型构建
# Model building: K-Means clustering on the standardised indicators.
from sklearn.cluster import KMeans

k = 5  # number of clusters
# n_jobs is intentionally not passed: KMeans deprecated the parameter in
# scikit-learn 0.23 and removed it in 1.0, where passing it raises TypeError.
# NOTE: no random_state is set, so centres and labels can differ between runs.
kmodel = KMeans(n_clusters=k)
kmodel.fit(data_zs)
cen = kmodel.cluster_centers_  # cluster centres, one row per cluster
kinds = kmodel.labels_  # cluster label assigned to each input row
#聚类中心(print(cen) 的输出;每行最后一列是第一列的重复,用于雷达图闭合):
[[-0.41487223 -0.16115608 -0.16097537 -0.25507251 -0.70022909 -0.41487223]
[-0.79938326 2.4832016 2.42472391 0.30863003 0.48332845 -0.79938326]
[-0.00299761 -0.22662501 -0.23103958 2.19174737 0.05220837 -0.00299761]
[-0.37722119 -0.08691852 -0.09484404 -0.1559046 1.16066672 -0.37722119]
[ 1.68623455 -0.57402693 -0.53683869 -0.17318782 -0.31366659 1.68623455]]
四、雷达图
# Radar chart: draw every cluster centre as one closed polygon on polar axes.
labels = data_zs.columns
n = len(labels)  # one spoke per indicator
cen = np.c_[cen, cen[:, 0]]  # repeat the first column so each polygon closes
angles = np.linspace(0, 2 * np.pi, n, endpoint=False)
angles = np.append(angles, angles[0])  # close the angle sequence the same way
index = ['c1', 'c2', 'c3', 'c4', 'c5']
print(cen)

fig = plt.figure()
su = fig.add_subplot(111, polar=True)  # polar=True is what makes this a radar chart
for name, centre in zip(index, cen):
    su.plot(angles, centre, linewidth=2, label=name)  # one line per cluster

# Label only the n distinct spokes: the duplicated closing angle has no label
# of its own, and matplotlib rejects a tick/label count mismatch.
su.set_thetagrids(angles[:-1] * 180 / np.pi, labels, fontproperties="SimHei")
plt.legend(loc='lower right')  # add the legend
plt.show()