#-*- coding: utf-8 -*-
#提取日历史数据
import pandas as pd
import numpy as np
import pymysql
import time
import datetime
from sqlalchemy import create_engine
import sklearn
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import graphviz
import pandas_profiling as pp
import seaborn as sns
import matplotlib.pyplot as plt
dbconn = pymysql.connect(
host="127.0.0.1",
database="work",
user="root",
password="111111",
port=3306,
charset='utf8')
connt = create_engine('mysql+mysqldb://root:111111@localhost:3306/work?charset=utf8')
# 数据,来自数据库,指定日期第一段
sqlcmd = "select * from work.appbehavior"
df1 = pd.read_sql(sqlcmd, dbconn)
mpg = pd.read_sql(sqlcmd, dbconn)
#变量之间的相关性
t2 = pd.DataFrame(mpg.corr())
t2.to_excel("corr.xls")
#指定数据字段
x=mpg[[
'm_count',
'chezhu_c',
'souuo_c',
'jiange_max',
'time_average',
'licai_cishu_ration',
'daikuan_cishu_ratio',
'xiaofei_cishu_ratio',
'fuwu_c_cishu_ratio',
'fuwu_d_cishu_ratio',
'quanyi_cishu_ratio',
'banli_c',
'yewubanli_cishu_ratio',
'age',
'car',
'production_licai_amount',
'lum_flag',
'lum_c_flag'
]]
#标准化
from sklearn.preprocessing import scale
x=pd.DataFrame(scale(x))#scale为标准化数据语句
#找最佳聚类个数
best = []
for i in range(1,10):
kmeans = cluster.KMeans(n_clusters=i,init='k-means++',random_state=42)
kmeans.fit(x)
best.append(kmeans.inertia_)
plt.plot(range(1,10),best)
plt.title('The best cluster')
plt.xlabel('cluster number')
plt.ylabel('sse')
plt.show()
#kmeans聚类
from sklearn import cluster
model = cluster.KMeans(n_clusters=5,random_state=10)
model.fit(x)
#评估一下聚类
from sklearn import metrics
x_cluster=model.fit_predict(x)
score=metrics.silhouette_score(x,x_cluster)
print('聚类个数为5时,轮廓函数:',score)
#各个类的百分比
mpg['cluster'] = model.labels_
mpg['cluster'].value_counts(1)
#求中心点
pd.DataFrame(model.cluster_centers_)
centers.to_excel("center")
#修改后的完整版
#!/usr/bin/env python
# coding: utf-8
# In[1]:
#-*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import pymysql
import time
import datetime
from sqlalchemy import create_engine
import sklearn
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import graphviz
import pandas_profiling as pp
import seaborn as sns
import matplotlib.pyplot as plt
dbconn = pymysql.connect(
host="127.0.0.1",
database="A_HISDATA",
user="root",
password="111111",
port=3306,
charset='utf8')
connt = create_engine('mysql+mysqldb://root:111111@localhost:3306/A_HISDATA?charset=utf8')
# 数据,来自数据库,指定日期第一段
sqlcmd = "select * from work.appcluster"
df1 = pd.read_sql(sqlcmd, dbconn)
mpg = pd.read_sql(sqlcmd, dbconn)
#mpg = mpg.drop('level_0',axis=1)
mpg.head()
print(mpg.isnull().sum()/len(mpg))
# In[3]:
#选取有用变量,建立模型
x=mpg[[
'm_count',
'jiange_max',
'licai_cishu_ration',
'daikuan_cishu_ratio',
'xiaofei_cishu_ratio',
'yewubanli_cishu_ratio',
'gender',
'daifa',
'age',
'production_licai_amount',
'lum_flag',
'lum_c_flag',
'card_category',
]
]
# In[4]:
#聚类必须标准化
from sklearn.preprocessing import scale
x=pd.DataFrame(scale(x))
# In[5]:
#建立模型,best是5,但是为了拆分,先建立7个,以便观察及合并
from sklearn import cluster
model = cluster.KMeans(n_clusters=7,random_state=10)
# In[6]:
#应用
model.fit(x)
# In[7]:
#给原来的数据加上分类的标签
mpg['cluster'] = model.labels_
# In[8]:
print(mpg.head())
# In[ ]:
#评估一下聚类的轮廓系数,这个不太重要
from sklearn import metrics
x_cluster=model.fit_predict(x)
score=metrics.silhouette_score(x,x_cluster)
print('聚类个数为5时,轮廓函数:',score)
# In[ ]:
#每个类的百分比
mpg['cluster'].value_counts(1)
# In[ ]:
#提取指定的几个类进行分析
mpg = mpg.loc[mpg['cluster'] =='' |mpg['cluster'] =='' |mpg['cluster']]==''
# In[ ]:
#分类后某个类的某个变量分析
mpg.groupby(['cluster']).gender.describe()#聚类后的变量分析
# In[ ]:
#求中心点
# centers = pd.DataFrame(model.cluster_centers_)
# centers.to_excel('zhongxindian.xls')
# mpg['cluster'].value_counts(1)
# In[ ]:
#找最佳聚类个数
# best = []
# for i in range(1,10):
# kmeans = cluster.KMeans(n_clusters=i,init='k-means++',random_state=42)
# kmeans.fit(x)
# best.append(kmeans.inertia_)
# plt.plot(range(1,10),best)
# plt.title('The best cluster')
# plt.xlabel('cluster number')
# plt.ylabel('sse')
# plt.show()
# In[ ]:
# #结果存储到数据库
# mpg.to_sql(name='app_c12',schema=dbconn)
本文通过Python实现,从数据库中提取日历史数据,使用K-means算法对用户行为特征进行聚类分析,探讨最佳聚类数量并解读各群体特性,有助于业务洞察和用户画像构建。
606

被折叠的 条评论
为什么被折叠?



