#-*- coding: utf-8 -*-
#提取日历史数据
import pandas as pd
import numpy as np
import pymysql
import time
import datetime
from sqlalchemy import create_engine
import sklearn
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import graphviz
import pandas_profiling as pp
import seaborn as sns
import matplotlib.pyplot as plt
dbconn = pymysql.connect(
host="127.0.0.1",
database="work",
user="root",
password="111111",
port=3306,
charset='utf8')
connt = create_engine('mysql+mysqldb://root:111111@localhost:3306/work?charset=utf8')
# 数据,来自数据库,指定日期第一段
sqlcmd = "select * from work.appbehavior"
df1 = pd.read_sql(sqlcmd, dbconn)
mpg = pd.read_sql(sqlcmd, dbconn)
#变量之间的相关性
t2 = pd.DataFrame(mpg.corr())
t2.to_excel("corr.xls")
#指定数据字段
x=mpg[[
'm_count',
'chezhu_c',
'souuo_c',
'jiange_max',
'time_average',
'licai_cishu_ration',
'daikuan_cishu_ratio',
'xiaofei_cishu_ratio',
'fuwu_c_cishu_ratio',
'fuwu_d_cishu_ratio',
'quanyi_cishu_ratio',
'banli_c',
'yewubanli_cishu_ratio',
'age',
'car',
'production_licai_amount',
'lum_flag',
'lum_c_flag'
]]
#标准化
from sklearn.preprocessing import scale
x=pd.DataFrame(scale(x))#scale为标准化数据语句
#找最佳聚类个数
best = []
for i in range(1,10):
kmeans = cluster.KMeans(n_clusters=i,init='k-means++',random_state=42)
kmeans.fit(x)
best.append(kmeans.inertia_)
plt.plot(range(1,10),best)
plt.title('The best cluster')
plt.xlabel('cluster number')
plt.ylabel('sse')
plt.show()
#kmeans聚类
from sklearn import cluster
model = cluster.KMeans(n_clusters=5,random_state=10)
model.fit(x)
#评估一下聚类
from sklearn import metrics
x_cluster=model.fit_predict(x)
score=metrics.silhouette_score(x,x_cluster)
print('聚类个数为5时,轮廓函数:',score)
#各个类的百分比
mpg['cluster'] = model.labels_
mpg['cluster'].value_counts(1)
#求中心点
pd.DataFrame(model.cluster_centers_)
centers.to_excel("center")
#修改后的完整版
#!/usr/bin/env python # coding: utf-8 # In[1]: #-*- coding: utf-8 -*- import pandas as pd import numpy as np import pymysql import time import datetime from sqlalchemy import create_engine import sklearn from sklearn.tree import DecisionTreeClassifier as DT from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from sklearn import tree import graphviz import pandas_profiling as pp import seaborn as sns import matplotlib.pyplot as plt dbconn = pymysql.connect( host="127.0.0.1", database="A_HISDATA", user="root", password="111111", port=3306, charset='utf8') connt = create_engine('mysql+mysqldb://root:111111@localhost:3306/A_HISDATA?charset=utf8') # 数据,来自数据库,指定日期第一段 sqlcmd = "select * from work.appcluster" df1 = pd.read_sql(sqlcmd, dbconn) mpg = pd.read_sql(sqlcmd, dbconn) #mpg = mpg.drop('level_0',axis=1) mpg.head() print(mpg.isnull().sum()/len(mpg)) # In[3]: #选取有用变量,建立模型 x=mpg[[ 'm_count', 'jiange_max', 'licai_cishu_ration', 'daikuan_cishu_ratio', 'xiaofei_cishu_ratio', 'yewubanli_cishu_ratio', 'gender', 'daifa', 'age', 'production_licai_amount', 'lum_flag', 'lum_c_flag', 'card_category', ] ] # In[4]: #聚类必须标准化 from sklearn.preprocessing import scale x=pd.DataFrame(scale(x)) # In[5]: #建立模型,best是5,但是为了拆分,先建立7个,以便观察及合并 from sklearn import cluster model = cluster.KMeans(n_clusters=7,random_state=10) # In[6]: #应用 model.fit(x) # In[7]: #给原来的数据加上分类的标签 mpg['cluster'] = model.labels_ # In[8]: print(mpg.head()) # In[ ]: #评估一下聚类的轮廓系数,这个不太重要 from sklearn import metrics x_cluster=model.fit_predict(x) score=metrics.silhouette_score(x,x_cluster) print('聚类个数为5时,轮廓函数:',score) # In[ ]: #每个类的百分比 mpg['cluster'].value_counts(1) # In[ ]: #提取指定的几个类进行分析 mpg = mpg.loc[mpg['cluster'] =='' |mpg['cluster'] =='' |mpg['cluster']]=='' # In[ ]: #分类后某个类的某个变量分析 mpg.groupby(['cluster']).gender.describe()#聚类后的变量分析 # In[ ]: #求中心点 # centers = pd.DataFrame(model.cluster_centers_) # centers.to_excel('zhongxindian.xls') # mpg['cluster'].value_counts(1) # In[ ]: #找最佳聚类个数 # best = [] # for i in range(1,10): # kmeans = cluster.KMeans(n_clusters=i,init='k-means++',random_state=42) # kmeans.fit(x) # best.append(kmeans.inertia_) # plt.plot(range(1,10),best) # plt.title('The best cluster') # plt.xlabel('cluster number') # plt.ylabel('sse') # plt.show() # In[ ]: # #结果存储到数据库 # mpg.to_sql(name='app_c12',schema=dbconn)