APP聚类分析

本文通过Python实现,从数据库中提取日历史数据,使用K-means算法对用户行为特征进行聚类分析,探讨最佳聚类数量并解读各群体特性,有助于业务洞察和用户画像构建。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

#-*- coding: utf-8 -*-

#提取日历史数据
import pandas as pd
import numpy as np
import pymysql
import time
import datetime
from sqlalchemy import create_engine
import sklearn
from sklearn.tree import DecisionTreeClassifier as  DT
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import graphviz
import pandas_profiling as pp
import seaborn as sns
import matplotlib.pyplot  as plt

# MySQL connectivity: a raw pymysql connection for pd.read_sql reads below,
# plus an SQLAlchemy engine (connt) for potential DataFrame.to_sql writes.
# NOTE(review): credentials are hard-coded in source — move them to a config
# file or environment variables before sharing/deploying.
dbconn = pymysql.connect(
        host="127.0.0.1",
        database="work",
        user="root",
        password="111111",
        port=3306,
        charset='utf8')
connt = create_engine('mysql+mysqldb://root:111111@localhost:3306/work?charset=utf8')

 

# Load the app-behavior snapshot from the database (first segment of the
# specified date range). `mpg` is the working frame; `df1` is kept as an
# untouched reference copy.
sqlcmd = "select * from work.appbehavior"
mpg = pd.read_sql(sqlcmd, dbconn)
df1 = mpg.copy()  # was a second identical query — one round-trip is enough

# Pairwise correlations between the variables, exported for offline review.
# Written as .xlsx: the legacy .xls writer (xlwt) is removed in modern pandas.
t2 = pd.DataFrame(mpg.corr())
t2.to_excel("corr.xlsx")

#指定数据字段

# Feature matrix for clustering: usage counters, behavior ratios and
# demographics. Column names come from the DB schema (typos such as
# 'souuo_c' and 'licai_cishu_ration' are preserved deliberately — they must
# match the table).
x = mpg[[
    'm_count',
    'chezhu_c',
    'souuo_c',
    'jiange_max',
    'time_average',
    'licai_cishu_ration',
    'daikuan_cishu_ratio',
    'xiaofei_cishu_ratio',
    'fuwu_c_cishu_ratio',
    'fuwu_d_cishu_ratio',
    'quanyi_cishu_ratio',
    'banli_c',
    'yewubanli_cishu_ratio',
    'age',
    'car',
    'production_licai_amount',
    'lum_flag',
    'lum_c_flag',
]]

# K-means is distance based, so standardize to zero mean / unit variance.
# Fix: bare pd.DataFrame(scale(x)) renamed the columns to 0..n-1; keep the
# original column names and index so later inspection stays readable.
from sklearn.preprocessing import scale
x = pd.DataFrame(scale(x), columns=x.columns, index=x.index)

 

#找最佳聚类个数
best = []
for i in range(1,10):
        kmeans = cluster.KMeans(n_clusters=i,init='k-means++',random_state=42)
        kmeans.fit(x)
        best.append(kmeans.inertia_)

plt.plot(range(1,10),best)
plt.title('The best cluster')
plt.xlabel('cluster number')
plt.ylabel('sse')
plt.show()

 

# Final K-means clustering with k=5 on the standardized features.
from sklearn import cluster
model = cluster.KMeans(n_clusters=5, random_state=10)
# Fix: the original called fit() and then fit_predict(), running the whole
# clustering twice; fit_predict() alone fits the model and returns labels.
x_cluster = model.fit_predict(x)

# Quick quality check: silhouette score of the resulting partition.
from sklearn import metrics
score = metrics.silhouette_score(x, x_cluster)
print('聚类个数为5时,轮廓函数:', score)

# Attach the cluster label to each row and show each cluster's share.
mpg['cluster'] = model.labels_
# Fix: the bare value_counts expression was silently discarded in a script.
print(mpg['cluster'].value_counts(1))

 

# Cluster centroids (in scaled feature space), exported for interpretation.
# Fix: the DataFrame was never assigned, so `centers` raised NameError, and
# the output filename had no extension pandas could map to an Excel writer.
centers = pd.DataFrame(model.cluster_centers_)
centers.to_excel("center.xlsx")

 

#修改后的完整版

#!/usr/bin/env python
# coding: utf-8

# In[1]:


#-*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import pymysql
import time
import datetime
from sqlalchemy import create_engine
import sklearn
from sklearn.tree import DecisionTreeClassifier as  DT
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import graphviz
import pandas_profiling as pp
import seaborn as sns
import matplotlib.pyplot  as plt

# Connect to the A_HISDATA database: pymysql for reads, SQLAlchemy engine
# for potential to_sql writes.
# NOTE(review): credentials are hard-coded — move to config/env in production.
dbconn = pymysql.connect(
        host="127.0.0.1",
        database="A_HISDATA",
        user="root",
        password="111111",
        port=3306,
        charset='utf8')
connt = create_engine('mysql+mysqldb://root:111111@localhost:3306/A_HISDATA?charset=utf8')

# Pull the clustering input table (first segment of the specified dates).
sqlcmd = "select * from work.appcluster"
mpg = pd.read_sql(sqlcmd, dbconn)
df1 = mpg.copy()  # was a second identical query — copy avoids the round-trip
#mpg = mpg.drop('level_0',axis=1)
mpg.head()

# Share of missing values per column, as a quick data-quality check.
print(mpg.isnull().sum()/len(mpg))


# In[3]:


# Restrict the frame to the variables the clustering model will use:
# usage counters, behavior ratios and demographic attributes.
feature_cols = [
    'm_count',
    'jiange_max',
    'licai_cishu_ration',
    'daikuan_cishu_ratio',
    'xiaofei_cishu_ratio',
    'yewubanli_cishu_ratio',
    'gender',
    'daifa',
    'age',
    'production_licai_amount',
    'lum_flag',
    'lum_c_flag',
    'card_category',
]
x = mpg[feature_cols]


# In[4]:


# K-means is distance based, so standardization is mandatory here.
from sklearn.preprocessing import scale
# Fix: bare pd.DataFrame(scale(x)) replaced the column names with 0..n-1;
# keeping names and index makes later centroid inspection readable.
x = pd.DataFrame(scale(x), columns=x.columns, index=x.index)


# In[5]:


# Fit K-means with 7 clusters. The elbow analysis says 5 is best, but 7 is
# used deliberately so finer groups can be inspected and merged afterwards.
from sklearn import cluster

# KMeans.fit returns the fitted estimator, so construction and fitting
# chain into one statement.
model = cluster.KMeans(n_clusters=7, random_state=10).fit(x)


# In[7]:


# Tag every row of the original frame with its cluster id, then peek at
# the first rows to confirm the new column.
labels = model.labels_
mpg['cluster'] = labels

preview = mpg.head()
print(preview)


# In[ ]:


# Silhouette score as a sanity check of the clustering (not critical).
from sklearn import metrics
# Fix: fit_predict() here re-ran the entire clustering on an already-fitted
# model; the labels from the fit above are the same partition.
x_cluster = model.labels_
score = metrics.silhouette_score(x, x_cluster)
# Fix: the message hard-coded "5" while this model uses 7 clusters —
# report the actual cluster count.
print('聚类个数为%d时,轮廓函数:' % model.n_clusters, score)

# Share of rows per cluster. Fix: the bare expression was silently
# discarded when run as a script (it only displayed inside a notebook).
print(mpg['cluster'].value_counts(1))


# In[ ]:


# Keep only the clusters selected for detailed analysis.
# Fix: the original line could never run — `|` binds tighter than `==`
# (TypeError on Series | str), a `]` was misplaced, and integer cluster
# labels were compared against ''. isin() expresses the intended
# "cluster is one of ..." filter.
clusters_to_keep = [0, 1, 2]  # TODO(review): fill in the actual cluster ids
mpg = mpg.loc[mpg['cluster'].isin(clusters_to_keep)]

# Per-cluster distribution of one variable (gender) after filtering;
# printed so the result is visible outside a notebook.
print(mpg.groupby(['cluster']).gender.describe())


# In[ ]:


#求中心点
# centers = pd.DataFrame(model.cluster_centers_)
# centers.to_excel('zhongxindian.xls')
# mpg['cluster'].value_counts(1)


# In[ ]:


#找最佳聚类个数
# best = []
# for i in range(1,10):
#         kmeans = cluster.KMeans(n_clusters=i,init='k-means++',random_state=42)
#         kmeans.fit(x)
#         best.append(kmeans.inertia_)

# plt.plot(range(1,10),best)
# plt.title('The best cluster')
# plt.xlabel('cluster number')
# plt.ylabel('sse')
# plt.show()


# In[ ]:


# #结果存储到数据库
# mpg.to_sql(name='app_c12',schema=dbconn)

 

 

 

 

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值