聚类算法,并计算轮廓系数得分
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 23 09:33:11 2017
@author: Administrator
"""
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import MySQLdb
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics
#from sqlalchemy import create_engine
#import matplotlib.pyplot as plt
#import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei'] #指定默认字体
plt.rcParams['axes.unicode_minus'] = False #解决保存图像是负号'-'显示为方块的问题
# 连接本地服务器
conn = MySQLdb.connect(host="192.168.1.77", port=3306, user="root", passwd="", db="jr_project", charset='utf8')
cursor = conn.cursor()
#获取整个sql的数据,并存入df中
#方案一,在这里写sql代码,获取的数据存入df中
sql1=('SELECT * from jr_analysis_new')
#方案二
#test=open(r'C:\\Users\\Administrator.WIN7U-20160229G\\Desktop\\jr_project\\jr_project.txt') #通过txt获取sql代码
#sql=test.read().decode('gbk')
#sql.replace('\n',' ')
df = pd.read_sql(sql1,conn,index_col=None) #因为不确定有没有加索引,所以设置none
#df=df.loc[:, ['main_id','id','活期总额','定期金额加权求和','total_benifit','最大投资金额']]
dataframe=df[['gap','current_amount_new','benefit','investment','date']].fillna(0).apply(lambda x: (x-np.mean(x))/(np.std(x))) #用于对参考维度的指标进行标准差标准化
kmodel=KMeans(n_clusters=10, random_state=0).fit(dataframe)#使用内置的kmeans算法,获取kmodel
centerpoint=kmodel.cluster_centers_#计算聚类中心点
typelist=kmodel.labels_#获取所有项目的分类标签
fig = plt.figure() #用于绘制3d图形
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df[[5]], df[[6]], df[[7]], c=typelist)#取3个维度取描绘图形
ax.set_xlabel(u'类别1')
ax.set_ylabel(u'类别2')
ax.set_zlabel(u'类别3')
plt.title(u"'鲸鱼项目'")
plt.show()
#用于计算轮廓系数,获取聚类的得分
#file = open("C:\\Users\\Administrator.WIN7U-20160229G\\Desktop\\jr_project\\jr_project.txt", 'w+')
#file.write(metrics.silhouette_score(dataframe, typelist, metric='euclidean'))
#file.close()
file = open("C:\\Users\\Administrator.WIN7U-20160229G\\Desktop\\jr_project\\jr_project.txt", 'w+')
score=metrics.silhouette_score(dataframe, typelist, metric='euclidean')
file.write(str(score))
file.close()
#df1=pd.concat([df, dataframe], axis=1)
#df1=df[df.cumsum()<=38473208.0]
#用于将数据写入mysql数据库
#yconnect = create_engine('mysql+mysqldb://root:@localhost:3306/jr_project?charset=utf8')
#pd.io.sql.to_sql(df,'1652_python_all', yconnect, schema='jr_project', if_exists='append', index=False)
#df=df.set_index('id') #去掉首列数字
#对df进行操作
#df1 = df.describe()
typelist=pd.DataFrame([typelist])
typelist=typelist.T
df[r'类别']=typelist #把结果链接到原df中,因为我希望存储的时候左边是数据,右边是分类标签
centerpoint=pd.DataFrame(centerpoint)
#将结果存入excel中
outputfile="C:\\Users\\Administrator.WIN7U-20160229G\\Desktop\\jr_overcome.xlsx"
df.to_excel(outputfile)
outputfile1="C:\\Users\\Administrator.WIN7U-20160229G\\Desktop\\jr_centerpoint.xlsx"
centerpoint.to_excel(outputfile1)
# 关闭连接
cursor.close()
conn.close()
##绘制df的图形
#sns.set(style="white", palette="muted", color_codes=True)
#ax1=sns.kdeplot(df['年龄'],color='r')