聚类算法相关:
原理
谱聚类是一种基于图论的聚类方法,通过对样本数据的拉普拉斯矩阵的特征向量进行聚类,从而达到对样本数据聚类的目的。谱聚类可以理解为将高维空间的数据映射到低维,然后在低维空间用其它聚类算法(如KMeans)进行聚类。
代码
'''
谱聚类方法
参考github代码:https://github.com/leekeiling/Cluster/blob/master/spectral.py
原理介绍:https://blog.youkuaiyun.com/qq_24519677/article/details/82291867
'''
from sklearn.cluster import KMeans
import numpy as np
import math as m
import matplotlib.pyplot as plt
import pandas as pd
from collections import OrderedDict
from cluster_v2 import Embedding, load_keywords
# import evaluate as eval
# Available demo datasets (tab-separated point files, one point per line):
#   flame.txt
#   Jain_cluster=2.txt
#   Aggregation_cluster=7.txt
#   Spiral_cluster=3.txt
#   Pathbased_cluster=3.txt
data_path = "flame.txt"  # dataset file read by load_data()
def load_data(path=None):
    """Load tab-separated sample points from disk.

    :param path: file to read; defaults to the module-level ``data_path``
        so existing ``load_data()`` callers are unaffected.
    :return: ndarray of sample points, one row per point
    """
    if path is None:
        path = data_path  # legacy behaviour: module-level default file
    return np.loadtxt(path, delimiter='\t')
def get_dis_matrix(data):
    """Build the pairwise Euclidean distance matrix.

    :param data: sample set, shape (n, d)
    :return: symmetric (n, n) ndarray of distances with a zero diagonal
    """
    n = len(data)
    dist = np.zeros((n, n))
    for row in range(n):
        # upper triangle only; mirror each value to keep the matrix symmetric
        for col in range(row + 1, n):
            d = m.sqrt(np.power(data[row] - data[col], 2).sum())
            dist[row][col] = d
            dist[col][row] = d
    return dist
def getW(data, k):
    """Construct a symmetric KNN similarity (adjacency) matrix.

    Each sample is linked with weight 1 to its k nearest neighbours
    (itself excluded); averaging with the transpose then symmetrises
    the matrix, so a one-sided link ends up with weight 0.5.

    :param data: sample set, shape (n, d)
    :param k: number of nearest neighbours per sample
    :return: (n, n) symmetric similarity matrix
    """
    n_samples = len(data)
    # pairwise Euclidean distances (distance helper inlined for locality)
    dist = np.zeros((n_samples, n_samples))
    for a in range(n_samples):
        for b in range(a + 1, n_samples):
            dist[a][b] = dist[b][a] = m.sqrt(np.power(data[a] - data[b], 2).sum())
    sim = np.zeros((n_samples, n_samples))
    for row, row_dists in enumerate(dist):
        order = np.argsort(row_dists)
        # order[0] is the point itself (distance 0); take the next k indices
        sim[row][order[1:k + 1]] = 1
    return (sim + np.transpose(sim)) / 2
def getD(W):
    """Build the degree matrix of the similarity graph.

    :param W: similarity matrix
    :return: diagonal matrix whose entries are each node's summed edge weights
    """
    degrees = W.sum(axis=0)  # same as builtin sum over rows: per-column totals
    return np.diag(degrees)
def getL(D, W):
    """Build the (unnormalised) graph Laplacian L = D - W.

    :param D: degree matrix
    :param W: similarity matrix
    :return: Laplacian matrix
    """
    laplacian = np.subtract(D, W)
    return laplacian
def getEigen(L, num_clusters=None):
    """Extract the spectral embedding from the Laplacian.

    Returns the eigenvectors belonging to the smallest eigenvalues; the
    rows of the result are the low-dimensional points fed to KMeans.

    :param L: graph Laplacian (real symmetric, since L = D - W with W symmetric)
    :param num_clusters: how many of the smallest eigenvectors to keep;
        defaults to the module-level ``cluster_num`` so existing callers
        keep their behaviour.
    :return: (n, num_clusters) matrix of eigenvector columns
    """
    if num_clusters is None:
        num_clusters = cluster_num  # legacy global fallback
    # eigh (not eig): L is symmetric, so eigh is valid and guarantees real
    # eigenvalues/eigenvectors in ascending order. np.linalg.eig may return
    # complex output, which would break the downstream KMeans fit.
    eigval, eigvec = np.linalg.eigh(L)
    return eigvec[:, :num_clusters]
def plotRes(data, clusterResult, clusterNum):
    """Visualise the clustering result as a 2-D scatter plot.

    :param data: sample set, shape (n, 2)
    :param clusterResult: per-sample cluster label
    :param clusterNum: number of clusters to draw
    :return: None (displays the figure)
    """
    palette = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange']
    total = len(data)
    for label in range(clusterNum):
        # colours cycle when there are more clusters than palette entries
        color = palette[label % len(palette)]
        xs = [data[idx, 0] for idx in range(total) if clusterResult[idx] == label]
        ys = [data[idx, 1] for idx in range(total) if clusterResult[idx] == label]
        plt.scatter(xs, ys, c=color, alpha=1, marker='+')
    plt.show()
if __name__ == '__main__':
    cluster_num = 10
    KNN_k = 5
    # Words to cluster. For sentences, use fasttext vectors or a
    # TfidfVectorizer instead (see the other clustering articles).
    keywords = []
    embedding_file2 = ''
    # FIX: result_file was never defined, so the final to_excel call
    # raised NameError. Define the output path explicitly.
    result_file = 'spectral_cluster_result.xlsx'
    embedding_model2 = Embedding(embedding_file2, 200, type='w2v')
    X = [embedding_model2.get_word_embedding(keyword) for keyword in keywords]
    data = X
    # data = load_data()  # alternative: cluster the 2-D demo datasets
    data = np.asarray(data)
    # Spectral clustering pipeline: similarity -> degree -> Laplacian ->
    # eigen-embedding -> KMeans in the embedded space.
    W = getW(data, KNN_k)
    D = getD(W)
    L = getL(D, W)
    eigvec = getEigen(L)
    clf = KMeans(n_clusters=cluster_num)
    s = clf.fit(eigvec)
    labels = s.labels_
    # plotRes(data, np.asarray(C), 7)  # optional 2-D visualisation
    print(labels)
    # Group keywords by assigned label, then flatten into rows of
    # [keyword, class_index] with an empty separator row between clusters.
    data = []
    dic = OrderedDict()
    for keyword, label in zip(keywords, labels):
        dic.setdefault(label, [])
        dic[label].append(keyword)
    for item in dic:
        data.extend([[x, item] for x in dic[item]])
        data.append([])
    df = pd.DataFrame(data=data, columns=['keyword', 'class_index'])
    df.to_excel(result_file, index=False)
参考链接:
https://blog.youkuaiyun.com/qq_24519677/article/details/82291867
https://github.com/leekeiling/Cluster/blob/master/spectral.py