1. Density-Based Clustering Methods
2. DBSCAN
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a representative density-based clustering algorithm. Unlike partitioning and hierarchical methods, it defines a cluster as a maximal set of density-connected points, so it can carve regions of sufficiently high density into clusters and discover clusters of arbitrary shape in noisy data.
2.1 Key Concepts of the DBSCAN Algorithm
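DBSCAN rests on a handful of definitions, stated here in the usual textbook form:
- ε-neighborhood: all points within distance ε (the eps parameter below) of a given point.
- Core object: a point whose ε-neighborhood contains at least MinPts points (min_samples below).
- Directly density-reachable: q is directly density-reachable from p if p is a core object and q lies in p's ε-neighborhood.
- Density-reachable: q is density-reachable from p if there is a chain p = p1, p2, ..., pn = q in which each point is directly density-reachable from the previous one.
- Density-connected: p and q are density-connected if both are density-reachable from some common point o.
A cluster is then a maximal set of density-connected points, and any point that belongs to no cluster is treated as noise.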
2.2 DBSCAN Algorithm Flow
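At a high level, the implementation below proceeds as follows:
1. Compute each point's ε-neighborhood, i.e. all points within eps of it.
2. Mark every point whose neighborhood is large enough (more than min_samples points here) as a core point.
3. Union each core point with all points in its neighborhood, so mutually reachable points end up in one connected component.
4. Report each component that contains a core point as a cluster; points outside every such component are labeled 0 and treated as noise.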
The code below relies on a union-find (disjoint-set) structure to merge neighborhoods into connected components; for a detailed walkthrough of union-find, see mjiansun's CSDN post 并查集详解——图文解说,简单易懂(转).
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
from sklearn.preprocessing import StandardScaler
import math

def expand(a, b):
    # Widen the interval [a, b] by 10% on each side (used for plot limits).
    d = (b - a) * 0.1
    return a - d, b + d

def euler_distance(point1: np.ndarray, point2: list) -> float:
    """
    Compute the Euclidean distance between two points; supports any dimension.
    """
    distance = 0.0
    for a, b in zip(point1, point2):
        distance += math.pow(a - b, 2)
    return math.sqrt(distance)

class UnionFindSet(object):
    def __init__(self, data_num=0):
        self.data_num = data_num
        self.prev = {}
        self.init_uf()

    def init_uf(self):
        # Initially every element is its own root.
        for i in range(self.data_num):
            self.prev[i] = i

    def union_data(self, x, y):
        # Merge the trees that contain x and y.
        x_root = self.find_data(x)
        y_root = self.find_data(y)
        if x_root != y_root:
            self.prev[y_root] = x_root

    def find_data(self, x):
        x_root = self.prev[x]
        while self.prev[x_root] != x_root:
            x_root = self.prev[x_root]
        # Path compression: optional, it only speeds up later lookups.
        x_copy = x
        while x_copy != x_root:
            temp = self.prev[x_copy]
            self.prev[x_copy] = x_root
            x_copy = temp
        return x_root

    def get_uf_set(self):
        return self.prev
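# A quick, illustrative sanity check for UnionFindSet (my own example, not
# from the original post):
#   uf = UnionFindSet(4)
#   uf.union_data(0, 1)
#   uf.union_data(1, 2)
#   assert uf.find_data(2) == uf.find_data(0)  # 0, 1, 2 share one root
#   assert uf.find_data(3) != uf.find_data(0)  # 3 remains on its own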
class DBSCAN(object):
    def __init__(self, eps, min_samples):
        self.eps = eps
        self.min_samples = min_samples

    def get_cluster(self, data):
        eps = self.eps
        min_samples = self.min_samples
        data_num, feature_num = data.shape
        # sim[i] lists the indices of all points within eps of point i
        # (including i itself).
        sim = [[] for i in range(data_num)]
        for i in range(data_num):
            for j in range(data_num):
                d = euler_distance(data[i, :], data[j, :])
                if d < eps:
                    sim[i].append(j)
        # Union every core point (neighborhood larger than min_samples)
        # with all points in its neighborhood, so each cluster becomes one
        # connected component. Note that a border point shared by two core
        # points can merge their components, a slight deviation from
        # textbook DBSCAN.
        uf = UnionFindSet(data_num)
        for i in range(data_num):
            single_data = sim[i]
            if len(single_data) <= min_samples:
                continue
            for j in single_data:
                uf.union_data(i, j)
        # Assign a cluster id to the root of every component that contains
        # a core point.
        cluster_cls = {}
        cls_num = 0
        for i in range(data_num):
            r = uf.find_data(i)
            if len(sim[i]) <= min_samples:
                continue
            if r not in cluster_cls:
                cls_num += 1
                cluster_cls[r] = cls_num
        # Label 0 is reserved for noise: points that are neither core
        # points nor absorbed into a core point's component.
        data_cls = [0] * data_num
        for i in range(data_num):
            r = uf.find_data(i)
            if r != i or len(sim[i]) > min_samples:
                data_cls[i] = cluster_cls[r]
        return data_cls
if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
    data = StandardScaler().fit_transform(data)
    # Candidate (eps, min_samples) pairs; only the first pair is used here.
    params = ((0.19, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))
    eps, min_samples = params[0]
    dbscan = DBSCAN(eps, min_samples)
    data_cls = dbscan.get_cluster(data)
    y_hat = np.array(data_cls)
    # Mask of all clustered (non-noise) points; label 0 means noise.
    core_indices = np.zeros_like(y_hat, dtype=bool)
    core_indices[np.where(y_hat >= 1)] = True
    y_unique = np.unique(y_hat)
    n_clusters = y_unique.size - (1 if 0 in y_hat else 0)
    print(y_unique, 'number of clusters:', n_clusters)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
    for k, clr in zip(y_unique, clrs):
        cur = (y_hat == k)
        if k == 0:
            # Noise points are drawn small and in black.
            plt.scatter(data[cur, 0], data[cur, 1], s=10, c='k')
            continue
        plt.scatter(data[cur, 0], data[cur, 1], s=15, color=clr, edgecolors='k')
        plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1], s=30, color=clr,
                    marker='o', edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True, ls=':', color='#606060')
    plt.title(r'$\epsilon$ = %.2f, m = %d, clusters: %d' % (eps, min_samples, n_clusters), fontsize=12)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
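To sanity-check the from-scratch version, the same data can be fed to scikit-learn's built-in DBSCAN. A minimal sketch (my addition, reusing the data array from the script above); note that scikit-learn labels noise as -1 rather than 0:

from sklearn.cluster import DBSCAN as SklearnDBSCAN  # avoid clashing with the class above

sk_labels = SklearnDBSCAN(eps=0.19, min_samples=5).fit(data).labels_
print('sklearn labels:', np.unique(sk_labels))  # noise is -1; cluster ids start at 0

The two label sets will not match exactly (the from-scratch version counts a point as core only when its neighborhood exceeds min_samples, and border points can merge adjacent components), but the overall partition should look similar.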
3. MDCA
MDCA (Maximum Density Clustering Application) brings the density-based idea into partitioning clustering: it uses density, rather than initial centroids, as the criterion for deciding cluster membership, can determine the number of clusters automatically, and can discover clusters of arbitrary shape. MDCA generally keeps no noise set, which also avoids discarding large numbers of objects because of a poorly chosen threshold.
My reproduction follows the material in reference [2]. I am not sure whether it is correct, because I am not satisfied with the clustering results; they differ from what I expected.
3.1 Core Idea
1. Maximum-density point: the density of a point can be represented by the reciprocal of the sum of its K-nearest-neighbor distances (a standalone sketch of steps 1 and 2 follows this list).
2. Density curve: re-sort the dataset by the Euclidean distance of every object to the maximum-density point x.
3. Assign all data before the first valley of the density curve to one cluster, then remove those points from consideration.
4. Repeat steps 1-3 until the densities of all remaining points fall below the threshold ρ0.
5. For two clusters Ci and Cj, use the distance between their nearest samples as the inter-cluster distance.
6. Merge two clusters when their inter-cluster distance is below the threshold d0.
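A minimal, self-contained sketch of steps 1 and 2 (my own illustration, using the same squared-Euclidean distances and k / (sum of k-NN distances) density as the full script in 3.2; the helper names are hypothetical):

import numpy as np

def sq_dist_matrix(data):
    # Pairwise squared Euclidean distances, shape (n, n).
    diff = data[:, None, :] - data[None, :, :]
    return np.sum(diff ** 2, axis=2)

def knn_density(dist, k=3):
    # Step 1: density = k / (sum of distances to the k nearest neighbors).
    # Column 0 of each sorted row is the point itself (distance 0).
    knn = np.sort(dist, axis=1)[:, 1:k + 1]
    return k / np.sum(knn, axis=1)

def density_curve(p, dist):
    # Step 2: order all samples by distance to the maximum-density point;
    # p[order] traced in this order is the density curve used in step 3.
    center = np.argmax(p)
    order = np.argsort(dist[center])
    return p[order], order

# Usage on random toy data:
data = np.random.rand(100, 2)
dist = sq_dist_matrix(data)
curve, order = density_curve(knn_density(dist), dist)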
3.2 Code in Practice
import numpy as np
import sklearn.datasets as ds
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

def expand(a, b):
    # Widen the interval [a, b] by 10% on each side (used for plot limits).
    d = (b - a) * 0.1
    return a - d, b + d

def find_cls(p, y_pred, distance, density_min, cls_num):
    # Points still labeled 0 have not been assigned to any cluster yet.
    candidate_indices = np.where(y_pred == 0)[0]
    if len(candidate_indices) == 0:
        return y_pred, cls_num
    cur_p = p[candidate_indices]
    # Map the local argmax back to a global index before indexing the rows
    # of the distance matrix.
    max_idx = candidate_indices[np.argmax(cur_p)]
    cur_distance = distance[max_idx, candidate_indices]
    # Sort by distance to the maximum-density point, nearest first.
    cur_distance_sort = np.argsort(cur_distance)
    cur_curve = cur_p[cur_distance_sort]  # densities ordered by that distance
    ori_indices = candidate_indices[cur_distance_sort]  # map back to the original indices
    # Uncomment to inspect the density curve:
    # plt.plot(cur_distance[cur_distance_sort], cur_curve)
    # plt.show()
    if np.max(cur_curve) > density_min:
        loc = np.where(cur_curve <= density_min)[0]
        loc_length = len(loc)
        if loc_length >= 2:
            # Find the first "valley": two below-threshold positions close
            # together; everything before it becomes one cluster.
            for i in range(loc_length - 1):
                if loc[i + 1] - loc[i] <= 2:
                    break
            y_pred[ori_indices[:loc[i + 1]]] = cls_num
            y_pred, cls_num = find_cls(p, y_pred, distance, density_min, cls_num + 1)
    return y_pred, cls_num
if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.25, 0.25, 0.25, 0.25], random_state=0)
    # data = StandardScaler().fit_transform(data)
    # Min-max normalize all features with one global scale.
    max_data = np.max(data)
    min_data = np.min(data)
    data = (data - min_data) / (max_data - min_data)
    # Number of neighbors used when estimating density.
    k = 3
    # Minimum density threshold. Density is the reciprocal of the sum of
    # k-NN distances; how to choose this value well is still unclear to me.
    density_min = 10000
    # Minimum inter-cluster distance threshold.
    distance_min = 0.008
    # Numbers of samples and of features.
    sample_num, feature_num = data.shape
    # Predicted labels; 0 means "not assigned yet".
    y_pred = np.zeros(sample_num)
    # p[i] is the density of sample i.
    p = np.zeros(sample_num)
    # distance[i, j] is the squared Euclidean distance from sample i to sample j.
    distance = np.zeros((sample_num, sample_num))
    # The k-NN definition of density works well here: it rarely yields many
    # points with identical density, whereas counting points inside a fixed
    # radius can split a single region into many clusters.
    for i in range(sample_num):
        distance[i, :] = np.sum((data[i, :] - data) ** 2, axis=1)
        temp_sort = np.argsort(distance[i, :])
        # temp_sort[0] is the point itself (distance 0), so this sums the
        # distances to the k nearest neighbors.
        p[i] = k / np.sum(distance[i, temp_sort[:k + 1]])
    cls_num = 1
    # Find the initial clusters.
    y_pred, cls_num = find_cls(p, y_pred, distance, density_min, cls_num)
    # Merge clusters whose nearest samples are within distance_min, and
    # repeat until a full pass makes no merge.
    merged = True
    while merged:
        merged = False
        for i in range(1, cls_num):
            for j in range(i + 1, cls_num):
                a = np.where(y_pred == i)[0]
                b = np.where(y_pred == j)[0]
                temp = distance[a, :][:, b]
                if np.min(temp) <= distance_min:
                    # Merge cluster j into cluster i and compact the labels.
                    y_pred[y_pred == j] = i
                    y_pred[y_pred > j] = y_pred[y_pred > j] - 1
                    cls_num = cls_num - 1
                    merged = True
                    break  # restart scanning with the updated labels
            if merged:
                break
    # Attach the remaining scattered points to the nearest cluster when they
    # are within distance_min; otherwise they keep label 0.
    scatter_indices = np.where(y_pred == 0)[0]
    loc_indices = np.where(y_pred > 0)[0]
    scatter_cls_distance = distance[scatter_indices, :][:, loc_indices]
    scatter_cls_min_distance = np.min(scatter_cls_distance, axis=1)
    scatter_cls_min_indices = np.argmin(scatter_cls_distance, axis=1)
    y_pred[scatter_indices] = y_pred[loc_indices[scatter_cls_min_indices]] * (scatter_cls_min_distance <= distance_min)
    # Plot the result.
    y_hat = np.array(y_pred)
    core_indices = np.zeros_like(y_hat, dtype=bool)
    core_indices[np.where(y_hat >= 1)] = True
    y_unique = np.unique(y_hat)
    n_clusters = y_unique.size - (1 if 0 in y_hat else 0)
    print(y_unique, 'number of clusters:', n_clusters)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
    for c, clr in zip(y_unique, clrs):
        cur = (y_hat == c)
        if c == 0:
            # Unassigned points are drawn small and in black.
            plt.scatter(data[cur, 0], data[cur, 1], s=10, c='k')
            continue
        plt.scatter(data[cur, 0], data[cur, 1], s=15, color=clr, edgecolors='k')
        plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1], s=30, color=clr,
                    marker='o', edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True, ls=':', color='#606060')
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
    print("end")
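The comment on density_min above admits that it is unclear how to choose the value. One hypothetical heuristic (my own suggestion, not from reference [2]) is to put the threshold at a low percentile of the observed densities, so that only genuinely sparse points fall below it; plotting np.sort(p) and looking for an elbow, in the spirit of the k-distance plot often used to pick DBSCAN's eps, is another option:

import numpy as np

def suggest_density_min(p, pct=10):
    # Hypothetical heuristic: the pct-th percentile of the computed
    # densities; tune pct by inspecting a plot of np.sort(p).
    return np.percentile(p, pct)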
3.3 Experimental Results
3.4 Performance Comparison (personally I do not find it very practical; the parameters are too hard to tune)
- Advantages:
  - Insensitive to noisy data
  - Does not depend on the choice of initial points
  - Can produce clusters of arbitrary shape
- Disadvantages:
  - The algorithm is complex and clustering is relatively slow
  - The density threshold must be fixed before running
  - Distance metrics do not work well on high-dimensional data
  - Unsuitable when densities vary greatly across the dataset, or when the overall density is essentially uniform