K-Means算法对航空公司客户进行分类
1.数据预处理:
import numpy as np
import pandas as pd
# Load the raw airline customer records (decoded as GB18030, parsed with the
# Python engine) and report the available columns and the table size.
data = pd.read_csv("./air_data.csv", encoding='gb18030', engine='python')
print(data.columns)
print(data.shape)

# Step 1: keep only the rows where both yearly fare totals are present.
mask1 = data["SUM_YR_1"].notnull() & data["SUM_YR_2"].notnull()
data1 = data.loc[mask1, :]
print(data1.shape)

# Step 2: keep rows that have a non-zero fare in at least one of the two
# years, a non-zero average discount and a positive total distance.
# The masks are derived from data1 (the frame they are applied to) instead of
# the unfiltered data, so the selection does not rely on pandas re-aligning a
# longer boolean Series to data1's index.
mask2 = data1["SUM_YR_1"] != 0
mask3 = data1["SUM_YR_2"] != 0
mask4 = data1["avg_discount"] != 0
mask5 = data1["SEG_KM_SUM"] > 0
mask = mask4 & mask5 & (mask2 | mask3)
airline = data1.loc[mask, :]
print(airline.shape)

# Step 3: select the columns needed to build the model features.
airline_selection = airline[["FFP_DATE", "LOAD_TIME", "FLIGHT_COUNT", "LAST_TO_END", "avg_discount", "SEG_KM_SUM"]]

# Step 4: build the L feature as the time elapsed between FFP_DATE and
# LOAD_TIME.
L = pd.to_datetime(airline_selection["LOAD_TIME"]) - pd.to_datetime(airline_selection["FFP_DATE"])
# The model expresses L in months, so the whole-day count is divided by 30 and
# rounded to one decimal. The vectorized .dt.days accessor replaces parsing
# the day count out of each Timedelta's string form; rows whose dates are
# missing now yield NaN instead of raising ValueError.
L = (L.dt.days / 30).round(1)
# 合并特征
airline_features = pd.concat([L, airline_selection.iloc