In spectral clustering, the "spectrum" is the set of all eigenvalues of a matrix, and the matrix in question is the Laplacian matrix of the graph formed from the data. Spectral clustering therefore computes the eigenvectors of the data's Laplacian matrix and then runs KMeans on a subset of those eigenvectors.
But why the Laplacian matrix? And why not run KMeans directly on the raw data? This is probably why spectral clustering is simple to implement while its underlying theory is harder to grasp.
1. Why the Laplacian matrix?
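The Laplacian encodes the similarity graph in a form whose eigenvectors expose cluster structure. With adjacency matrix W and degree matrix D, the unnormalized Laplacian is L = D - W, and the symmetric normalized variant is L_sym = I - D^(-1/2) W D^(-1/2). Its key property is the quadratic form

    f^T L f = (1/2) * sum_{i,j} w_ij * (f_i - f_j)^2

so eigenvectors with small eigenvalues change little between strongly connected (high-similarity) samples. In the ideal case of a graph with c disconnected components, the eigenvalue 0 has multiplicity exactly c and its eigenspace is spanned by the indicator vectors of the components; the first few eigenvectors therefore embed each sample so that samples in the same cluster land close together, which is exactly the kind of structure KMeans can pick up.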
2. Why not run KMeans directly on the raw data?
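KMeans assigns each point to its nearest centroid, so its clusters are convex (Voronoi) cells; it cannot represent non-convex clusters such as two concentric rings, which share the same center. The spectral embedding "unfolds" such structure first. A minimal sketch of the failure mode, using the same make_circles settings as the scripts below (this snippet is an editorial illustration, not part of the original code):

from sklearn import datasets
from sklearn.cluster import KMeans

# two concentric rings; the true clusters are not linearly separable
train_x, _ = datasets.make_circles(n_samples=500, shuffle=True, noise=0.03,
                                   random_state=1, factor=0.618)
# KMeans directly on the raw coordinates: each resulting cluster mixes
# points from both rings instead of separating inner from outer
raw_labels = KMeans(n_clusters=2, n_init=10, random_state=1).fit_predict(train_x)
print(raw_labels[:20])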
3. Implementation
Algorithm steps:
1) Build a graph from the data, with each node representing one sample;
2) Compute the pairwise similarity between samples to form the adjacency matrix;
3) Compute the degree matrix, the diagonal matrix formed from the column sums of the adjacency matrix;
4) Compute the Laplacian matrix;
5) Compute the eigenvalues and eigenvectors of the Laplacian matrix;
6) Take the matrix formed by the eigenvectors of the k smallest eigenvalues and cluster its rows with KMeans.
The script below implements these steps from scratch:
# -*- coding: utf-8 -*-
"""
谱聚类 - spectral clustering
1. 将数据图化, 每个节点代表一个样本;
2. 计算样本间的相似度, 形成邻接矩阵;
3. 计算度矩阵, 邻接矩阵的每一列的和生成的对角矩阵;
4. 计算Laplacian矩阵;
5. 计算Laplacian矩阵的特征向量;
6. 取前k个特征向量形成的矩阵,利用Kmeans聚类;
"""
import numpy as np
from sklearn.cluster import KMeans
from sklearn import datasets
import matplotlib.pyplot as plt
class Spectral:
    def __init__(self):
        pass

    def _cal_adjacentMatrix(self, train_x, sigma=0.1):
        """Compute the graph adjacency matrix, i.e. the similarity matrix.
        Pairwise Euclidean distances are turned into similarities with a
        Gaussian kernel; sigma is the kernel bandwidth (the value 0.1 is an
        editorial assumption, tune it for other data).
        """
        sampleCnt = len(train_x)
        distance = np.zeros((sampleCnt, sampleCnt))
        for i in range(sampleCnt):
            distance[i, :] = np.sqrt(np.sum((train_x - train_x[i, :])**2, axis=1))
        # larger distance -> smaller similarity
        return np.exp(-distance**2 / (2 * sigma**2))
    def _cal_degreeMatrix(self, adjacentMat):
        """Compute the degree matrix
        adjacentMat: graph adjacency matrix
        return: diagonal matrix of the adjacency matrix's column sums
        """
        return np.diag(np.sum(adjacentMat, axis=0))
    def _cal_LaplacianMat(self, adjacentMat, degreeMat):
        """Compute the normalized Laplacian matrix
        adjacentMat: graph adjacency matrix W
        degreeMat: degree matrix D
        return: L_sym = I - D^(-1/2) * W * D^(-1/2)
        """
        # the unnormalized Laplacian would be degreeMat - adjacentMat;
        # the symmetric normalized form is used here, built without
        # mutating the caller's degree matrix
        dInvSqrt = np.diag(np.diag(degreeMat)**(-0.5))
        return np.eye(len(degreeMat)) - np.dot(np.dot(dInvSqrt, adjacentMat), dInvSqrt)
    def _eig_for_LaplacianMat(self, LaplacianMatrix):
        """Eigendecompose the Laplacian matrix.
        np.linalg.eigh is used because L_sym is symmetric; it also returns
        the eigenvalues (and matching eigenvectors) sorted in ascending order.
        """
        featureVal, featureVector = np.linalg.eigh(LaplacianMatrix)
        return featureVal, featureVector
    def clustering(self, train_x, k, **params):
        adjacentMat = self._cal_adjacentMatrix(train_x)
        degreeMat = self._cal_degreeMatrix(adjacentMat)
        laplacianMat = self._cal_LaplacianMat(adjacentMat, degreeMat)
        featureVal, featureVector = self._eig_for_LaplacianMat(laplacianMat)
        # take the eigenvectors of the k smallest eigenvalues
        # (eigh returns them first) and run KMeans on the rows
        clf = KMeans(**params)
        clf.fit(featureVector[:, 0:k])
        return clf.labels_
def load_data():
    """Generate the circles dataset with sklearn."""
    params = {
        "n_samples": 500,
        "shuffle": True,
        "noise": 0.03,
        "random_state": 1,
        "factor": 0.618
    }
    circles = datasets.make_circles(**params)
    return circles[0]
def draw_result(train_x, clusters, n_clusters):
    plt.figure()
    colors = ["red", "orange", "purple", "yellow", "blue"]
    for i in range(n_clusters):
        data = train_x[clusters == i]
        plt.scatter(data[:, 0], data[:, 1], color=colors[i], s=20)
    plt.show()
if __name__ == "__main__":
    train_x = load_data()
    obj = Spectral()
    params = {
        "n_clusters": 2,
        "init": "k-means++",
        "n_init": 10,
        "max_iter": 300,
        "tol": 1e-6,
        "random_state": 1,
    }
    # the choice of k is very important
    clusters = obj.clustering(train_x, k=6, **params)
    draw_result(train_x, clusters, params["n_clusters"])
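As the comment says, the choice of k matters. A common guide is the eigengap heuristic: plot the smallest eigenvalues of the Laplacian and pick k at the first large jump. Below is a minimal sketch built on the class above (plot_eigengap is a hypothetical helper added here, not part of the original code; featureVal is the eigenvalue array returned by _eig_for_LaplacianMat):

import numpy as np
import matplotlib.pyplot as plt

def plot_eigengap(featureVal, m=10):
    """Plot the m smallest Laplacian eigenvalues; well-separated cluster
    structure shows up as a large gap after the k-th eigenvalue."""
    plt.figure()
    plt.plot(range(1, m + 1), np.sort(featureVal)[:m], "o-")
    plt.xlabel("eigenvalue index")
    plt.ylabel("eigenvalue")
    plt.show()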
The same pipeline is also available off the shelf as scikit-learn's SpectralClustering, used in the script below.
# -*- coding: utf-8 -*-
from sklearn.cluster import SpectralClustering
from sklearn import datasets
import matplotlib.pyplot as plt
# load the data
train = datasets.make_circles(n_samples=500, shuffle=True, noise=0.03, random_state=1, factor=0.618)
train_x = train[0]
# SpectralClustering model
params = {
"n_clusters": 2,
"eigen_solver": "arpack",
"affinity": "nearest_neighbors",
"random_state": 1,
"n_init": 10
}
clf = SpectralClustering(**params)
clusters = clf.fit_predict(train_x)
# plot the clustering result
plt.figure()
colors = ["red", "orange", "purple", "yellow", "blue"]
for i in range(params["n_clusters"]):
    data = train_x[clusters == i]
    plt.scatter(data[:, 0], data[:, 1], color=colors[i], s=20)
plt.show()
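Since make_circles also returns the ground-truth ring labels in train[1], the clustering can be scored quantitatively. A short check appended to the script above (an editorial addition; the adjusted Rand index is 1.0 for a perfect match):

from sklearn.metrics import adjusted_rand_score

train_y = train[1]  # ground-truth ring membership from make_circles
print("adjusted Rand index:", adjusted_rand_score(train_y, clusters))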