# -*- coding: utf-8 -*-
"""
Created on Wed Jul 18 11:01:41 2018
@author: wenyun.wxw
"""
# Adapted from code created on 2016-01-06 by Eastmount
import time
import re
import os
import sys
import codecs
import shutil
import numpy as np
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import silhouette_score
#if __name__ == "__main__":
#########################################################################
# Step 1: compute TF-IDF
# Corpus: each line of the input file is one document whose tokens are joined by spaces
corpus = []
# Read the corpus, one document per line
# (the original opened the file with encoding='ansi', which is not a valid Python codec;
#  'gbk' is assumed here for the Chinese Windows ANSI code page -- change it if the
#  file is actually UTF-8)
for line in open('分词结果.csv', 'r', encoding='gbk').readlines():
    #print(line)
    corpus.append(line.strip())
#print(corpus)
#time.sleep(1)
# Convert the documents into a term-frequency matrix; element a[i][j] is the
# frequency of term j in document i
vectorizer = CountVectorizer()
# TfidfTransformer computes the tf-idf weight of every term
transformer = TfidfTransformer()
# The outer fit_transform computes tf-idf; the inner fit_transform builds the
# term-frequency matrix
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
# All terms in the bag-of-words model
# (on scikit-learn >= 1.0 use get_feature_names_out() instead)
words = vectorizer.get_feature_names()
# Extract the dense tf-idf matrix; element w[i][j] is the tf-idf weight of term j
# in document i
weight = tfidf.toarray()
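# (Sketch added for illustration, not in the original script) A quick sanity check on
# the tf-idf matrix: print the highest-weighted terms of the first document, using only
# the `words`, `weight` and numpy objects defined above.
if len(weight) > 0:
    top_idx = np.argsort(weight[0])[::-1][:10]   # indices of the 10 largest weights
    print('Top terms of document 0:',
          [(words[i], round(float(weight[0][i]), 3)) for i in top_idx])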
# from sklearn.feature_extraction import DictVectorizer
# DV = DictVectorizer(sparse=False)
# weight = DV.fit_transform(tfidf)
# weight = tfidf
# # Print the feature (vocabulary) terms to a file
# print('Features length: ' + str(len(words)))
# resName = "分词结果2.csv"
# result = codecs.open(resName, 'w', 'utf-8')
# for j in range(len(words)):
# result.write(words[j] + ' ')
# result.write('\r\n\r\n')
#
# # Print the tf-idf weights for every document: the outer loop iterates over documents,
# # the inner loop over the terms of each document
# for i in range(len(weight)):
# # print(u"-------这里输出第",i,u"类文本的词语tf-idf权重------")
# for j in range(len(words)):
# #print weight[i][j],
# result.write(str(weight[i][j]) + ' ')
# result.write('\r\n\r\n')
#
# result.close()
########################################################################
# Step 2: KMeans clustering
#---------Choosing K------------------
# Elbow method: plot SSE against K; the best K lies at the "elbow" of the curve
SSE = []  # sum of squared errors for each K
# Upper bound for K
kmax = 20
for k in range(1, kmax):
    estimator = KMeans(n_clusters=k, init='k-means++')  # build the clusterer
    estimator.fit(weight)
    SSE.append(estimator.inertia_)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(range(1, kmax), SSE, 'o-')
plt.show()
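# (Optional sketch, not in the original script) When no elbow is visible by eye, the
# knee point can also be estimated programmatically, e.g. with the third-party `kneed`
# package (assumed installed via `pip install kneed`); treat this as a convenience
# check rather than part of the pipeline.
#from kneed import KneeLocator
#kl = KneeLocator(list(range(1, kmax)), SSE, curve='convex', direction='decreasing')
#print('Elbow suggested by kneed:', kl.elbow)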
# The curve is inconclusive here: no clear elbow is visible
# Silhouette method: the best K is the one with the largest silhouette coefficient
Scores = []  # silhouette coefficient for each K
for k in range(2, kmax):
    estimator = KMeans(n_clusters=k, init='k-means++')  # build the clusterer
    estimator.fit(weight)
    Scores.append(silhouette_score(weight, estimator.labels_, metric='euclidean'))
plt.xlabel('k')
plt.ylabel('silhouette coefficient')
plt.plot(range(2, kmax), Scores, 'o-')
plt.show()
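# (Sketch added for illustration) Following the rule stated above -- pick the K with
# the largest silhouette coefficient -- the choice can also be made programmatically:
best_k = list(range(2, kmax))[int(np.argmax(Scores))]
print('K with the largest silhouette coefficient:', best_k)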
# k = 10 gives the (relatively) largest silhouette coefficient here
#---------Clustering------------------
#print('Start Kmeans:')
clf = KMeans(n_clusters=6, init='k-means++')
s = clf.fit(weight)
print(s)
# The 6 cluster centers
centers = clf.cluster_centers_
# Clustering result: the cluster each document belongs to
#print(clf.labels_)
label = clf.labels_
# i = 1
# while i <= len(clf.labels_):
# print(i, clf.labels_[i-1])
# i = i + 1
# SSE (inertia) of this KMeans run; used to judge whether the number of clusters is
# reasonable -- the smaller the value, the tighter the clusters
print(clf.inertia_)
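# (Sketch added for illustration, not in the original script) Two quick checks on the
# clustering: how many documents fall into each cluster, and which terms carry the
# largest weight in each cluster centroid (uses only `label`, `centers` and `words`).
print('Documents per cluster:', np.bincount(label))
for c in range(len(centers)):
    top_idx = centers[c].argsort()[::-1][:10]   # 10 heaviest terms of centroid c
    print('Cluster', c, ':', [words[i] for i in top_idx])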
# Save each document's cluster label together with the document text
# (the original wrote an undefined variable `ratecontent`; writing the corpus line
#  itself is assumed here)
file = open('分类结果k=6.csv', 'w')
for i in range(len(label)):
    file.write(','.join((str(label[i]), str(corpus[i]))) + '\n')
file.close()
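# (Sketch added for illustration, not in the original script) The same assignments can
# also be written with the already-imported pandas; the output file name
# '分类结果k=6_pandas.csv' is chosen here purely for illustration.
pd.DataFrame({'cluster': label, 'document': corpus}).to_csv(
    '分类结果k=6_pandas.csv', index=False, encoding='utf-8-sig')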
# Split the documents by cluster, k = 0, 1, ..., 5
def divide(word, k):
    # Collect the token lists of the documents assigned to cluster k
    word_part = []
    for i in range(len(label)):
        if label[i] == k:
            word_part.append(word[i])
    # Merge the tokens of all those documents into one list
    wordall = []
    for w in word_part:
        wordall.extend(w)
    # Word-frequency counting
    wordcount = {}  # dictionary: token -> count
    for item in wordall:
        if item not in wordcount:
            wordcount[item] = 1
        else:
            wordcount[item] += 1
    # Sort by frequency, highest first
    wordcount_sort = sorted(wordcount.items(), key=lambda item: item[1], reverse=True)
    # Word-cloud visualisation: font size is driven by word frequency.
    # fit_words expects a dict, so a jieba-style frequency dict can be passed directly.
    # Note: font_path must be a plain file path (the original used a file:// URL,
    # which WordCloud cannot open); a Chinese font is needed to render the words.
    wordcloud = WordCloud(background_color="white", font_path="C:/Windows/Fonts/MSYH.TTC",
                          width=1000, height=860, margin=2).fit_words(wordcount)
    # In Spyder, switch the graphics backend from "inline" to "Qt5" in Preferences
    # to show the figure in its own window
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig('分类(k=6)-' + str(k) + '.png')
    file = open('词频统计_分类(k=6)' + str(k) + '.csv', 'w')
    for i in range(len(wordcount_sort)):
        file.write(','.join((wordcount_sort[i][0], str(wordcount_sort[i][1]))) + '\n')
    file.close()
    return 0
# `word` is assumed here to be the per-document token lists obtained by splitting each
# space-joined line of the segmented corpus (the original never defined this variable)
word = [line.split() for line in corpus]
for k in range(6):
    divide(word, k)
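# (Note added for illustration) On recent versions of the wordcloud package,
# fit_words(frequencies) is an alias of generate_from_frequencies(frequencies), so the
# call inside divide() could equivalently be written as:
#wordcloud = WordCloud(background_color="white", font_path="C:/Windows/Fonts/MSYH.TTC",
#                      width=1000, height=860, margin=2).generate_from_frequencies(wordcount)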