section8

### The goal of this chapter is to identify the target user group, in order to better serve existing users.

### Key points

### 1. Plotting - displaying Chinese text

plt.rcParams['font.sans-serif'] = ['SimHei'] # step 1: replace the sans-serif font
plt.rcParams['axes.unicode_minus'] = False   # step 2: fix the minus sign for negative axis values

### 2. Database access - a sqlalchemy engine

engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')

### 3. Batch-reading files - os.walk() and os.path.join() usage
for root, dirs, files in os.walk(path):
    for file in files:
        rfile = os.path.join(root, file)
        if rfile.split('.')[-1] == 'tsv':
            rdf = pd.read_csv(rfile, sep='\t')
            df = df.append(rdf)
### 4. Combining groupby() and agg() to apply different functions to different columns

- Monthly aggregation

affc = {'payment':'sum', 'log_date':'count'}
dfm = df.groupby(['log_month', 'user_id']).agg(affc).reset_index()

- Renaming a column

renam = {'log_date':'access_days'}
dfm.rename(columns=renam, inplace=True)

### 5. Using KMeans clustering

- Clustering a single column (format it as one column with .values.reshape(-1, 1))

from sklearn.cluster import KMeans
a47 = action['A47'].values.reshape(-1, 1)
kms = KMeans(n_clusters=3).fit(a47)

- The labels_ attribute holds the cluster labels

cluster = kms.labels_

- Attach the labels to the source data and inspect the groups with groupby()

action['cluster'] = cluster
action.groupby(['cluster'])['user_id'].count()

- Visualizing the groups
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
plt.figure(figsize=(8,5))
snsdf1 = snsdf.reset_index()
snsdf1[snsdf1['cluster']==2]['A47'].plot(color='r',label='2:重度用户')
snsdf1[snsdf1['cluster']==1]['A47'].plot(color='g',label='1:中度用户')
snsdf1[snsdf1['cluster']==0]['A47'].plot(color='b',label='0:轻度用户')
plt.legend()
plt.xlabel('用户分布')
plt.ylabel('排行榜得分')
### 6. Principal component analysis - data preprocessing

- Extract the columns for PCA

paction = acc.iloc[:,3:(len(acc.columns)-1)]

- Drop columns with many zero values

cc = paction[paction==0].count(axis=0)/len(paction)
cc.plot()
dd = cc[cc<.95]

### I. Imports and displaying Chinese in matplotlib
import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os

plt.rcParams['font.sans-serif'] = ['SimHei'] # step 1: replace the sans-serif font
plt.rcParams['axes.unicode_minus'] = False   # step 2: fix the minus sign for negative axis values
%matplotlib inline
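SimHei only renders if that font is installed. A hedged variant (the extra font names are common fallbacks on other systems, not part of the original setup) reduces the chance of missing-glyph boxes:

plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS']  # tried in order
plt.rcParams['axes.unicode_minus'] = False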
Database engine
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')
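The same engine also works for reads. A minimal sketch, assuming the s8_dau table already exists in the datascience database (the commented-out to_sql calls further down would write it):

dau_db = pd.read_sql('SELECT * FROM s8_dau', engine)  # read a table back through the same engine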
### II. Batch-reading files
def read_files(path):
    df = pd.DataFrame()
    for root, dirs, files in os.walk(path):
        for file in files:
            rfile = os.path.join(root,file)
            if rfile.split('.')[-1] == 'tsv':
                rdf = pd.read_csv(rfile, sep='\t')
                df = df.append(rdf)
    return df
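DataFrame.append was removed in pandas 2.0; an equivalent sketch of read_files (the name read_files_concat is mine) that collects the frames and concatenates once, which is also faster than appending in a loop:

def read_files_concat(path):
    frames = []
    for root, dirs, files in os.walk(path):
        for file in files:
            rfile = os.path.join(root, file)
            if rfile.split('.')[-1] == 'tsv':
                frames.append(pd.read_csv(rfile, sep='\t'))
    # a single concat instead of repeated append
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()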
action_path  = 'data/sample-data/section8/daily/action/'
dau_path = 'data/sample-data/section8/daily/dau/'
dpu_path = 'data/sample-data/section8/daily/dpu/'

action = read_files(action_path)
dau = read_files(dau_path)
dpu = read_files(dpu_path)
Check data completeness and the head of each table
print(action.isnull().sum().sum())
print(action.shape)
# print(action.info())
action.head()
0
(2653, 57)
[action.head(): 5 rows × 57 columns — log_date, app_name, user_id, A1–A54 (display truncated)]

print(dau.isnull().sum().sum())
print(dau.shape)
print(dau.info())
dau.head()
0
(509754, 3)
print(dpu.isnull().sum().sum())
print(dpu.shape)
print(dpu.info())
dpu.head()
0
(3532, 4)
# write to the database

# action.to_sql('s8_action', engine, index=False)
# dau.to_sql('s8_dau', engine, index=False)
# dpu.to_sql('s8_dpu', engine, index=False)
## III. Data preprocessing

### 1. Merge DAU and DPU
df = pd.merge(dau, dpu[['log_date','user_id','payment']], how='left', on=['user_id','log_date'])
df.head()
     log_date app_name  user_id  payment
0  2013-05-01  game-01   608801      NaN
1  2013-05-01  game-01   712453      NaN
2  2013-05-01  game-01   776853      NaN
3  2013-05-01  game-01   823486      NaN
4  2013-05-01  game-01   113600      NaN
# set payment to 0 for users with no purchase record
print(df.payment.isnull().sum())
df['payment'].fillna(0, inplace=True)
print(df.payment.isnull().sum())
507151
0
# add a payment flag column
df['is_pay'] = df['payment'].apply( lambda x: 1 if x>0 else 0 )
df.head()
     log_date app_name  user_id  payment  is_pay
0  2013-05-01  game-01   608801      0.0       0
1  2013-05-01  game-01   712453      0.0       0
2  2013-05-01  game-01   776853      0.0       0
3  2013-05-01  game-01   823486      0.0       0
4  2013-05-01  game-01   113600      0.0       0
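The same flag can be computed without apply, as a vectorized comparison over the whole column:

df['is_pay'] = (df['payment'] > 0).astype(int)  # vectorized equivalent of the lambda above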
### 2. Monthly aggregation
# add a month column
df['log_month'] = df['log_date'].apply(lambda x: x[0:7])
df.head()
     log_date app_name  user_id  payment  is_pay log_month
0  2013-05-01  game-01   608801      0.0       0   2013-05
1  2013-05-01  game-01   712453      0.0       0   2013-05
2  2013-05-01  game-01   776853      0.0       0   2013-05
3  2013-05-01  game-01   823486      0.0       0   2013-05
4  2013-05-01  game-01   113600      0.0       0   2013-05
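The slice x[0:7] relies on log_date being an ISO 'YYYY-MM-DD' string, as the head() above shows; an equivalent datetime-based version:

df['log_month'] = pd.to_datetime(df['log_date']).dt.strftime('%Y-%m')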

By combining groupby with agg, we can compute each user's monthly payment total and number of login days in one pass.

# aggregate by month
affc = {'payment':'sum', 'log_date':'count'}
dfm = df.groupby(['log_month', 'user_id']).agg(affc).reset_index()
# rename the column
renam = {'log_date':'access_days'}
dfm.rename(columns=renam, inplace=True)
dfm.head()
  log_month  user_id  payment  access_days
0   2013-05       65      0.0            1
1   2013-05      115      0.0            1
2   2013-05      194      0.0            1
3   2013-05      426      0.0            4
4   2013-05      539      0.0            1
### 4. Using KMeans to find the top-ranked users: heavy / moderate / light tiers

Column A47 is the ranking score. The distribution plot shows that most users have very low scores, consistent with a power-law curve.
action['A47'].hist(bins=50, figsize=(6,4))
sns.distplot(action['A47'],bins=50,kde=True)
#### Cluster column A47 into 3 groups
from sklearn.cluster import KMeans

a47 = action['A47'].values.reshape(-1, 1)

kms = KMeans(n_clusters=3).fit(a47)
cluster = kms.labels_
kms.cluster_centers_
array([[  9359.84787792],
       [ 69386.11297071],
       [185857.17948718]])
action['cluster'] = cluster
action.head()
[action.head(): 5 rows × 58 columns — the same columns plus the new cluster column]

action.groupby(['cluster'])['user_id'].count()
cluster
0    2096
1     479
2      78
Name: user_id, dtype: int64

Clustering splits the users into 3 groups: 0 are light users with the lowest ranking scores; 1 are moderate users with mid-range scores; 2 are heavy users with high scores and the smallest headcount, which matches reality.
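Note that KMeans numbers its clusters arbitrarily, so 0/1/2 lining up with light/moderate/heavy here is a property of this particular run. A sketch that fixes the seed and re-labels clusters by ascending center value (random_state=0 is my addition, not in the original):

kms = KMeans(n_clusters=3, random_state=0).fit(a47)
order = kms.cluster_centers_.ravel().argsort()         # cluster ids sorted by center, small to large
relabel = {old: new for new, old in enumerate(order)}  # lowest center -> 0 (light users)
action['cluster'] = np.array([relabel[l] for l in kms.labels_])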
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
snsdf['user'] = range(len(snsdf))
sns.scatterplot(x='user',y='A47',hue='cluster',data=snsdf, palette='rainbow', alpha=.2)
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
snsdf['user'] = range(len(snsdf))

plt.figure(figsize=(8,5))
snsdf1 = snsdf.reset_index()
snsdf1[snsdf1['cluster']==2]['A47'].plot(color='r',label='2:重度用户')
snsdf1[snsdf1['cluster']==1]['A47'].plot(color='g',label='1:中度用户')
snsdf1[snsdf1['cluster']==0]['A47'].plot(color='b',label='0:轻度用户')
plt.legend()
plt.xlabel('用户分布')
plt.ylabel('排行榜得分')
Text(0,0.5,'排行榜得分')

![png](output_33_1.png)

#### Restrict to the top-ranked users (the higher-scoring heavy and moderate tiers) for the analysis that follows
acc = action[action['cluster']>=1]
acc.head()
[acc.head(): 5 rows × 58 columns]

## 5. Principal component analysis: extract the key features
paction = acc.iloc[:,3:(len(acc.columns)-1)]
paction.index=acc.user_id
paction.head()
[paction.head(): 5 rows × 54 columns (A1–A54), indexed by user_id]

#### 1. Drop columns with many zero values
cc = paction[paction==0].count(axis=0)/len(paction)
print(cc.head())
cc.plot()
A1    1.000000
A2    0.926391
A3    1.000000
A4    0.994614
A5    0.055655
dtype: float64
# cc[cc>.8]
dd = cc[cc<.95]
paction = paction[dd.index]
paction.head()
[paction.head(): 5 rows × 32 columns after dropping the zero-heavy columns]

#### 2. Drop highly correlated columns
corp = paction.corr()
plt.figure(figsize=(15,8))
sns.heatmap(corp)
mask = np.array(corp)
mask[np.tril_indices_from(mask)] = False
fig,ax = plt.subplots()
fig.set_size_inches(15,8)
sns.heatmap(corp,mask=mask)
coll = corp.columns
corp = pd.DataFrame(np.tril(corp, -1))
corp.columns = coll
corp.head()
[corp.head(): 5 rows × 32 columns — the lower triangle of the correlation matrix, zeros on and above the diagonal]

pac2 = paction.loc[:,(corp.abs()<.7).all()]      # keep only columns whose correlations are all below 0.7
pac2.head()
[pac2.head(): 5 rows × 16 columns — A2, A11, A12, A13, A20, A23, A24, A43, A44, A46, A48, A49, A50, A51, A53, A54]
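The same idea, keeping one column from each highly correlated pair, is often written with an upper-triangle mask instead; a sketch (it keeps the first column of each pair, so the selected set can differ slightly from the tril version above):

corr = paction.corr()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # each pair counted once
to_drop = [c for c in upper.columns if (upper[c].abs() >= 0.7).any()]
pac2_alt = paction.drop(columns=to_drop)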
### Run the PCA
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(pac2)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False)
redio = pca.explained_variance_ratio_
print(redio) 
print(pca.singular_values_)  
[9.97843804e-01 1.92024564e-03 1.20120771e-04 5.57014208e-05
 2.67905481e-05 1.54533752e-05 9.31262940e-06 4.38846214e-06
 3.02317261e-06 8.36725295e-07 1.31874979e-07 9.78197162e-08
 3.86464536e-08 2.94647596e-08 1.82272465e-08 7.54580333e-09]
[3.96183910e+04 1.73797668e+03 4.34684952e+02 2.96004755e+02
 2.05284590e+02 1.55911168e+02 1.21032418e+02 8.30848288e+01
 6.89599635e+01 3.62791414e+01 1.44027941e+01 1.24044853e+01
 7.79687146e+00 6.80796010e+00 5.35458829e+00 3.44523057e+00]
recu = redio.cumsum()
print(recu)
x = np.arange(len(recu))
plt.plot(recu, color='r')
[0.9978438  0.99976405 0.99988417 0.99993987 0.99996666 0.99998212
 0.99999143 0.99999582 0.99999884 0.99999968 0.99999981 0.99999991
 0.99999994 0.99999997 0.99999999 1.        ]

#### The data after dimensionality reduction
pca.set_params(n_components=10)
pac3 = pd.DataFrame(pca.fit_transform(pac2))
pacsse = pac3.copy()
pac3.head()
             0           1          2          3          4          5         6         7         8         9
0  2706.266005 -100.824346  -1.874787  -1.577536  12.481591  -2.394320  9.770878  7.807535  0.021273 -2.169596
1  2373.811140  147.314930 -16.386795  -8.428655  10.019577  -3.004725  6.009771  0.961469 -1.598531  2.144615
2 -1171.733361   -5.493081   0.744995   0.542033  -0.785251  -5.756412 -1.012336 -1.778067  7.256884  0.343277
3 -2738.903900  -50.468487   2.328491   2.965415  -5.794347  11.891289  2.965366 -1.182413  0.065619  1.245358
4 -1493.642618   58.686385 -10.807612  11.777973   7.664692   9.312968  4.376429  1.994214 -1.568050  0.426246
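Instead of reading n_components off the cumulative curve, PCA also accepts a fraction: it then keeps just enough components to reach that share of variance (0.999 is an arbitrary threshold for this sketch):

pca_auto = PCA(n_components=0.999)
pac3_auto = pd.DataFrame(pca_auto.fit_transform(pac2))
print(pca_auto.n_components_)  # number of components actually kept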
## 6. Clustering with KMeans
from sklearn.cluster import KMeans

km = KMeans(n_clusters=5)
km.fit(pac3)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
clu = km.labels_
pac3['clu'] = clu
pac3.head()
             0           1          2          3          4          5         6         7         8         9  clu
0  2706.266005 -100.824346  -1.874787  -1.577536  12.481591  -2.394320  9.770878  7.807535  0.021273 -2.169596    0
1  2373.811140  147.314930 -16.386795  -8.428655  10.019577  -3.004725  6.009771  0.961469 -1.598531  2.144615    0
2 -1171.733361   -5.493081   0.744995   0.542033  -0.785251  -5.756412 -1.012336 -1.778067  7.256884  0.343277    1
3 -2738.903900  -50.468487   2.328491   2.965415  -5.794347  11.891289  2.965366 -1.182413  0.065619  1.245358    4
4 -1493.642618   58.686385 -10.807612  11.777973   7.664692   9.312968  4.376429  1.994214 -1.568050  0.426246    1
pac3.groupby('clu')[2].count()
clu
0     90
1    113
2    122
3    109
4    123
Name: 2, dtype: int64

#### Color styles accepted by palette:

Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Vega10, Vega10_r, Vega20, Vega20_r, Vega20b, Vega20b_r, Vega20c, Vega20c_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spectral, spectral_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r
plt.figure(figsize=(13,7))
sns.scatterplot(x=0, y=1, data=pac3,style='clu',hue='clu', palette='autumn')
### Attach the cluster labels back to the original data
pac4 = pac2.copy()
pac4['cluster'] = list(pac3.clu)
pac4.head()
[pac4.head(): 5 rows × 17 columns — pac2's 16 columns plus cluster, indexed by user_id]
# compute the per-cluster means
clu5 = pac4.groupby('cluster').mean()
# drop a highly correlated column
clu5.drop(columns='A53',inplace=True)
c5cor = clu5.corr()
plt.figure(figsize=(15,8))
sns.heatmap(c5cor,annot=True)
ccrp = pd.DataFrame(np.tril(c5cor,-1))
ccrp.columns = clu5.columns
cccc = clu5.loc[:,(ccrp.abs()<.95).all()]
cccc
               A2       A20       A23       A24       A44        A46        A50       A51         A54
cluster
0        0.022222  0.322222  0.655556  0.167691  0.858193  27.600000  10.666667  2.011111  166.711111
1        0.079646  0.274336  0.362832  0.095231  0.844027  20.159292   3.008850  1.469027  102.106195
2        0.073770  0.377049  0.336066  0.070628  0.849343  24.737705   4.286885  1.844262  121.909836
3        0.018349  0.229358  0.284404  0.098252  0.845981  24.119266   5.266055  1.733945  146.871560
4        0.203252  0.292683  0.243902  0.063686  0.775076  18.983740   2.130081  0.975610   84.032520
from sklearn.preprocessing import scale

ccccc = pd.DataFrame(scale(cccc))
ccccc.columns = cccc.columns
ccccc
         A2       A20       A23       A24       A44       A46       A50       A51       A54
0 -0.855590  0.468859  1.918400  1.862020  0.785882  1.422970  1.867773  1.118457  1.424282
1  0.002962 -0.503392 -0.094337 -0.104961  0.315530 -0.940402 -0.688647 -0.381093 -0.746672
2 -0.084884  1.582038 -0.278379 -0.772826  0.492038  0.513827 -0.261998  0.656909 -0.081200
3 -0.913505 -1.416613 -0.633601 -0.022944  0.380387  0.317394  0.064879  0.351742  0.757602
4  1.851016 -0.130892 -0.912083 -0.961289 -1.973837 -1.313789 -0.982007 -1.746015 -1.354012
plt.figure(figsize=(8,8))
# number of axes on the polar plot
N = ccccc.shape[1]
# angles that split the circle evenly into N sectors
angles = np.linspace(0, 2*np.pi, N, endpoint=False)
# repeat the first angle so the outline closes
angles = np.concatenate((angles,[angles[0]]))
for i in range(len(ccccc)):
    # values for one cluster
    values = ccccc.loc[i,:]
    # repeat the first value so the polygon closes
    values = np.concatenate((values,[values[0]]))
    # draw one cluster's outline
    plt.polar(angles, values, 'o-', linewidth=2)
plt.legend(ccccc.index, loc='lower right')
# label the polar axes (drop the duplicated closing angle to match the column count)
plt.thetagrids(angles[:-1] * 180/np.pi, labels=list(ccccc.columns))
plt.title('重要指标雷达图呈现')
Text(0.5,1.05,'重要指标雷达图呈现')

![png](output_70_1.png)

## Dimensionality reduction without preprocessing
dfp = acc.iloc[:,3:(len(acc.columns)-1)]
dfp.index=acc.user_id
dfp.head()
[dfp.head(): 5 rows × 54 columns (A1–A54), indexed by user_id]

from sklearn.decomposition import PCA

pca = PCA(whiten=False)
pca.fit(dfp)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False)
retio = pca.explained_variance_ratio_
# print(retio) 
# print(pca.singular_values_)  

rec = retio.cumsum()
print(rec)
x = np.arange(len(rec))
plt.plot(rec, color='r')
[0.9996008  0.99995245 0.99997489 0.99999016 0.9999933  0.99999564
 0.99999759 0.99999838 0.99999897 0.9999995  0.99999962 0.99999972
 0.99999979 0.99999986 0.9999999  0.99999993 0.99999996 0.99999997
 0.99999997 0.99999998 0.99999998 0.99999999 0.99999999 0.99999999
 0.99999999 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.        ]
pca.set_params(n_components=10)
pacsse = pd.DataFrame(pca.fit_transform(dfp))
pacsse.head()
              0            1           2           3          4          5           6           7          8          9
0  94938.293061  -342.891655 -161.442878 -199.616210   1.830692  73.107938  153.124982  124.440657 -34.371612  46.548951
1  56613.313155  -960.580156  -38.560364  -45.836571  13.670166  90.767620 -145.846645  -40.255134  10.508203  16.287863
2 -31060.195159   388.005529   -6.932692   -0.948812  -5.332728  18.237293   11.393467   14.689011  -7.994909  32.398532
3 -45806.252443  1579.357883  -81.812845  -96.488345 -18.477649 -90.059217   31.377291  -22.865193 -19.724837  16.293640
4 -34963.135693   611.858506  -18.187490  -16.454233  -5.597209  -9.722257  -63.112236   -3.943266   7.222725 -10.889839
## Finding the optimal K with the elbow method
from sklearn.cluster import KMeans

df_features = pacsse # the features to cluster
# choose k by SSE
SSE = []  # sum of squared errors for each k
for k in range(1,9):
    estimator = KMeans(n_clusters=k)  # build the clusterer
    estimator.fit(df_features)
    SSE.append(estimator.inertia_)
X = range(1,9)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X,SSE,'o-')
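Besides the SSE elbow, the silhouette score offers a second opinion on k; a minimal sketch on the same features (silhouette is only defined for k >= 2):

from sklearn.metrics import silhouette_score

for k in range(2, 9):
    labels = KMeans(n_clusters=k).fit_predict(df_features)
    print(k, silhouette_score(df_features, labels))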
#### Clearly, standardizing the data first is not appropriate
# clearly, standardizing the data first is not appropriate

df_features = pd.DataFrame(scale(pacsse)) 

SSE = []  
for k in range(1,9):
    estimator = KMeans(n_clusters=k) 
    estimator.fit(df_features)
    SSE.append(estimator.inertia_)
X = range(1,9)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X,SSE,'o-')
km = KMeans(n_clusters=4)
km.fit(pacsse)
clu = km.labels_
pacsse['clu'] = clu
pacsse.head()
              0            1           2           3          4          5           6           7          8          9  clu
0  94938.293061  -342.891655 -161.442878 -199.616210   1.830692  73.107938  153.124982  124.440657 -34.371612  46.548951    2
1  56613.313155  -960.580156  -38.560364  -45.836571  13.670166  90.767620 -145.846645  -40.255134  10.508203  16.287863    0
2 -31060.195159   388.005529   -6.932692   -0.948812  -5.332728  18.237293   11.393467   14.689011  -7.994909  32.398532    1
3 -45806.252443  1579.357883  -81.812845  -96.488345 -18.477649 -90.059217   31.377291  -22.865193 -19.724837  16.293640    1
4 -34963.135693   611.858506  -18.187490  -16.454233  -5.597209  -9.722257  -63.112236   -3.943266   7.222725 -10.889839    1
pacsse.groupby('clu')[2].count()
clu
0    153
1    344
2     54
3      6
Name: 2, dtype: int64
plt.figure(figsize=(13,7))
sns.scatterplot(x=0, y=1, data=pacsse,style='clu',hue='clu', palette='autumn')
### Clearly, clustering without preprocessing is problematic: the first and second principal components are obviously correlated
pac4 = pac2.copy()
pac4['cluster'] = list(pacsse.clu)
pac4.head()

clu5 = pac4.groupby('cluster').mean()
clu5.drop(columns='A53',inplace=True)
c5cor = clu5.corr()
plt.figure(figsize=(15,8))
sns.heatmap(c5cor,annot=True)
ccrp = pd.DataFrame(np.tril(c5cor,-1))
ccrp.columns = clu5.columns
cccc = clu5.loc[:,(ccrp.abs()<.95).all()]
cccc
              A12       A20       A51         A54
cluster
0        3.398693  0.228758  1.810458  146.287582
1        1.938953  0.316860  1.433140  101.531977
2        4.592593  0.407407  1.870370  169.777778
3        2.166667  0.166667  1.666667  213.833333
from sklearn.preprocessing import scale

ccccc = pd.DataFrame(scale(cccc))

ccccc.columns = cccc.columns
ccccc
        A12       A20       A51       A54
0  0.352533 -0.562784  0.684599 -0.285229
1 -1.021705  0.406288 -1.555764 -1.388557
2  1.476502  1.402249  1.040338  0.293858
3 -0.807330 -1.245753 -0.169173  1.379928
plt.figure(figsize=(8,8))
# number of axes on the polar plot
N = ccccc.shape[1]
# angles that split the circle evenly into N sectors
angles = np.linspace(0, 2*np.pi, N, endpoint=False)
# repeat the first angle so the outline closes
angles = np.concatenate((angles,[angles[0]]))
for i in range(len(ccccc)):
    # values for one cluster
    values = ccccc.loc[i,:]
    # repeat the first value so the polygon closes
    values = np.concatenate((values,[values[0]]))
    # draw one cluster's outline
    plt.polar(angles, values, 'o-', linewidth=2)
plt.legend(ccccc.index, loc='lower right')
# label the polar axes (drop the duplicated closing angle to match the column count)
plt.thetagrids(angles[:-1] * 180/np.pi, labels=list(ccccc.columns))
plt.title('重要指标雷达图呈现')
Text(0.5,1.05,'重要指标雷达图呈现')

