1. Density-Based Clustering Methods
2. DBSCAN
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a representative density-based clustering algorithm. Unlike partitioning and hierarchical methods, it defines a cluster as a maximal set of density-connected points, so it can carve regions of sufficiently high density into clusters and discover clusters of arbitrary shape in noisy data.
2.1 Key Concepts of the DBSCAN Algorithm
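DBSCAN rests on a handful of definitions, stated here in the usual textbook form:
- ε-neighborhood: all points within distance ε (the eps parameter below) of a given point.
- Core object: a point whose ε-neighborhood contains at least MinPts points (min_samples below).
- Directly density-reachable: q is directly density-reachable from p if p is a core object and q lies in p's ε-neighborhood.
- Density-reachable: q is density-reachable from p if there is a chain p = p1, p2, ..., pn = q in which each point is directly density-reachable from the previous one.
- Density-connected: p and q are density-connected if both are density-reachable from some common point o.
A cluster is then a maximal set of density-connected points, and any point that belongs to no cluster is treated as noise.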
2.2 DBSCAN Algorithm Flow
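At a high level, the implementation below proceeds as follows:
1. Compute each point's ε-neighborhood, i.e. all points within eps of it.
2. Mark every point whose neighborhood is large enough (more than min_samples points here) as a core point.
3. Union each core point with all points in its neighborhood, so mutually reachable points end up in one connected component.
4. Report each component that contains a core point as a cluster; points outside every such component are labeled 0 and treated as noise.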
The code below relies on a union-find (disjoint-set) structure to merge neighborhoods into connected components; for a detailed walkthrough of union-find, see mjiansun's CSDN post 并查集详解——图文解说,简单易懂(转).
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
from sklearn.preprocessing import StandardScaler
import math

def expand(a, b):
    # Widen the interval [a, b] by 10% on each side (used for plot limits).
    d = (b - a) * 0.1
    return a - d, b + d

def euler_distance(point1: np.ndarray, point2: list) -> float:
    """
    Compute the Euclidean distance between two points; supports any dimension.
    """
    distance = 0.0
    for a, b in zip(point1, point2):
        distance += math.pow(a - b, 2)
    return math.sqrt(distance)

class UnionFindSet(object):
    def __init__(self, data_num=0):
        self.data_num = data_num
        self.prev = {}
        self.init_uf()

    def init_uf(self):
        # Initially every element is its own root.
        for i in range(self.data_num):
            self.prev[i] = i

    def union_data(self, x, y):
        # Merge the trees that contain x and y.
        x_root = self.find_data(x)
        y_root = self.find_data(y)
        if x_root != y_root:
            self.prev[y_root] = x_root

    def find_data(self, x):
        x_root = self.prev[x]
        while self.prev[x_root] != x_root:
            x_root = self.prev[x_root]
        # Path compression: optional, it only speeds up later lookups.
        x_copy = x
        while x_copy != x_root:
            temp = self.prev[x_copy]
            self.prev[x_copy] = x_root
            x_copy = temp
        return x_root

    def get_uf_set(self):
        return self.prev
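# A quick, illustrative sanity check for UnionFindSet (my own example, not
# from the original post):
#   uf = UnionFindSet(4)
#   uf.union_data(0, 1)
#   uf.union_data(1, 2)
#   assert uf.find_data(2) == uf.find_data(0)  # 0, 1, 2 share one root
#   assert uf.find_data(3) != uf.find_data(0)  # 3 remains on its own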
class DBSCAN(object):
    def __init__(self, eps, min_samples):
        self.eps = eps
        self.min_samples = min_samples

    def get_cluster(self, data):
        eps = self.eps
        min_samples = self.min_samples
        data_num, feature_num = data.shape
        # sim[i] lists the indices of all points within eps of point i
        # (including i itself).
        sim = [[] for i in range(data_num)]
        for i in range(data_num):
            for j in range(data_num):
                d = euler_distance(data[i, :], data[j, :])
                if d < eps:
                    sim[i].append(j)
        # Union every core point (neighborhood larger than min_samples)
        # with all points in its neighborhood, so each cluster becomes one
        # connected component. Note that a border point shared by two core
        # points can merge their components, a slight deviation from
        # textbook DBSCAN.
        uf = UnionFindSet(data_num)
        for i in range(data_num):
            single_data = sim[i]
            if len(single_data) <= min_samples:
                continue
            for j in single_data:
                uf.union_data(i, j)
        # Assign a cluster id to the root of every component that contains
        # a core point.
        cluster_cls = {}
        cls_num = 0
        for i in range(data_num):
            r = uf.find_data(i)
            if len(sim[i]) <= min_samples:
                continue
            if r not in cluster_cls:
                cls_num += 1
                cluster_cls[r] = cls_num
        # Label 0 is reserved for noise: points that are neither core
        # points nor absorbed into a core point's component.
        data_cls = [0] * data_num
        for i in range(data_num):
            r = uf.find_data(i)
            if r != i or len(sim[i]) > min_samples:
                data_cls[i] = cluster_cls[r]
        return data_cls
if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
    data = StandardScaler().fit_transform(data)
    # Candidate (eps, min_samples) pairs; only the first pair is used here.
    params = ((0.19, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))
    eps, min_samples = params[0]
    dbscan = DBSCAN(eps, min_samples)
    data_cls = dbscan.get_cluster(data)
    y_hat = np.array(data_cls)
    # Mask of all clustered (non-noise) points; label 0 means noise.
    core_indices = np.zeros_like(y_hat, dtype=bool)
    core_indices[np.where(y_hat >= 1)] = True
    y_unique = np.unique(y_hat)
    n_clusters = y_unique.size - (1 if 0 in y_hat else 0)
    print(y_unique, 'number of clusters:', n_clusters)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
    for k, clr in zip(y_unique, clrs):
        cur = (y_hat == k)
        if k == 0:
            # Noise points are drawn small and in black.
            plt.scatter(data[cur, 0], data[cur, 1], s=10, c='k')
            continue
        plt.scatter(data[cur, 0], data[cur, 1], s=15, color=clr, edgecolors='k')
        plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1], s=30, color=clr,
                    marker='o', edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True, ls=':', color='#606060')
    plt.title(r'$\epsilon$ = %.2f, m = %d, clusters: %d' % (eps, min_samples, n_clusters), fontsize=12)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
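To sanity-check the from-scratch version, the same data can be fed to scikit-learn's built-in DBSCAN. A minimal sketch (my addition, reusing the data array from the script above); note that scikit-learn labels noise as -1 rather than 0:

from sklearn.cluster import DBSCAN as SklearnDBSCAN  # avoid clashing with the class above

sk_labels = SklearnDBSCAN(eps=0.19, min_samples=5).fit(data).labels_
print('sklearn labels:', np.unique(sk_labels))  # noise is -1; cluster ids start at 0

The two label sets will not match exactly (the from-scratch version counts a point as core only when its neighborhood exceeds min_samples, and border points can merge adjacent components), but the overall partition should look similar.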
3. MDCA
MDCA (Maximum Density Clustering Application) brings the density-based idea into partitioning clustering: it uses density, rather than initial centroids, as the criterion for deciding cluster membership, can determine the number of clusters automatically, and can discover clusters of arbitrary shape. MDCA generally keeps no noise set, which also avoids discarding large numbers of objects because of a poorly chosen threshold.
My reproduction follows the material in reference [2]. I am not sure whether it is correct, because I am not satisfied with the clustering results; they differ from what I expected.
3.1 Core Idea
1. Maximum-density point: the density of a point can be represented by the reciprocal of the sum of its K-nearest-neighbor distances (a standalone sketch of steps 1 and 2 follows this list).
2. Density curve: re-sort the dataset by the Euclidean distance of every object to the maximum-density point x.
3. Assign all data before the first valley of the density curve to one cluster, then remove those points from consideration.
4. Repeat steps 1-3 until the densities of all remaining points fall below the threshold ρ0.
5. For two clusters Ci and Cj, use the distance between their nearest samples as the inter-cluster distance.
6. Merge two clusters when their inter-cluster distance is below the threshold d0.
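A minimal, self-contained sketch of steps 1 and 2 (my own illustration, using the same squared-Euclidean distances and k / (sum of k-NN distances) density as the full script in 3.2; the helper names are hypothetical):

import numpy as np

def sq_dist_matrix(data):
    # Pairwise squared Euclidean distances, shape (n, n).
    diff = data[:, None, :] - data[None, :, :]
    return np.sum(diff ** 2, axis=2)

def knn_density(dist, k=3):
    # Step 1: density = k / (sum of distances to the k nearest neighbors).
    # Column 0 of each sorted row is the point itself (distance 0).
    knn = np.sort(dist, axis=1)[:, 1:k + 1]
    return k / np.sum(knn, axis=1)

def density_curve(p, dist):
    # Step 2: order all samples by distance to the maximum-density point;
    # p[order] traced in this order is the density curve used in step 3.
    center = np.argmax(p)
    order = np.argsort(dist[center])
    return p[order], order

# Usage on random toy data:
data = np.random.rand(100, 2)
dist = sq_dist_matrix(data)
curve, order = density_curve(knn_density(dist), dist)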
3.2 Code in Practice
import numpy as np
import sklearn.datasets as ds
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

def expand(a, b):
    # Widen the interval [a, b] by 10% on each side (used for plot limits).
    d = (b - a) * 0.1
    return a - d, b + d

def find_cls(p, y_pred, distance, density_min, cls_num):
    # Points still labeled 0 have not been assigned to any cluster yet.
    candidate_indices = np.where(y_pred == 0)[0]
    if len(candidate_indices) == 0:
        return y_pred, cls_num
    cur_p = p[candidate_indices]
    # Map the local argmax back to a global index before indexing the rows
    # of the distance matrix.
    max_idx = candidate_indices[np.argmax(cur_p)]
    cur_distance = distance[max_idx, candidate_indices]
    # Sort by distance to the maximum-density point, nearest first.
    cur_distance_sort = np.argsort(cur_distance)
    cur_curve = cur_p[cur_distance_sort]  # densities ordered by that distance
    ori_indices = candidate_indices[cur_distance_sort]  # map back to the original indices
    # Uncomment to inspect the density curve:
    # plt.plot(cur_distance[cur_distance_sort], cur_curve)
    # plt.show()
    if np.max(cur_curve) > density_min:
        loc = np.where(cur_curve <= density_min)[0]
        loc_length = len(loc)
        if loc_length >= 2:
            # Find the first "valley": two below-threshold positions close
            # together; everything before it becomes one cluster.
            for i in range(loc_length - 1):
                if loc[i + 1] - loc[i] <= 2:
                    break
            y_pred[ori_indices[:loc[i + 1]]] = cls_num
            y_pred, cls_num = find_cls(p, y_pred, distance, density_min, cls_num + 1)
    return y_pred, cls_num
if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.25, 0.25, 0.25, 0.25], random_state=0)
    # data = StandardScaler().fit_transform(data)
    # Min-max normalize all features with one global scale.
    max_data = np.max(data)
    min_data = np.min(data)
    data = (data - min_data) / (max_data - min_data)
    # Number of neighbors used when estimating density.
    k = 3
    # Minimum density threshold. Density is the reciprocal of the sum of
    # k-NN distances; how to choose this value well is still unclear to me.
    density_min = 10000
    # Minimum inter-cluster distance threshold.
    distance_min = 0.008
    # Numbers of samples and of features.
    sample_num, feature_num = data.shape
    # Predicted labels; 0 means "not assigned yet".
    y_pred = np.zeros(sample_num)
    # p[i] is the density of sample i.
    p = np.zeros(sample_num)
    # distance[i, j] is the squared Euclidean distance from sample i to sample j.
    distance = np.zeros((sample_num, sample_num))
    # The k-NN definition of density works well here: it rarely yields many
    # points with identical density, whereas counting points inside a fixed
    # radius can split a single region into many clusters.
    for i in range(sample_num):
        distance[i, :] = np.sum((data[i, :] - data) ** 2, axis=1)
        temp_sort = np.argsort(distance[i, :])
        # temp_sort[0] is the point itself (distance 0), so this sums the
        # distances to the k nearest neighbors.
        p[i] = k / np.sum(distance[i, temp_sort[:k + 1]])
    cls_num = 1
    # Find the initial clusters.
    y_pred, cls_num = find_cls(p, y_pred, distance, density_min, cls_num)
    # Merge clusters whose nearest samples are within distance_min, and
    # repeat until a full pass makes no merge.
    merged = True
    while merged:
        merged = False
        for i in range(1, cls_num):
            for j in range(i + 1, cls_num):
                a = np.where(y_pred == i)[0]
                b = np.where(y_pred == j)[0]
                temp = distance[a, :][:, b]
                if np.min(temp) <= distance_min:
                    # Merge cluster j into cluster i and compact the labels.
                    y_pred[y_pred == j] = i
                    y_pred[y_pred > j] = y_pred[y_pred > j] - 1
                    cls_num = cls_num - 1
                    merged = True
                    break  # restart scanning with the updated labels
            if merged:
                break
    # Attach the remaining scattered points to the nearest cluster when they
    # are within distance_min; otherwise they keep label 0.
    scatter_indices = np.where(y_pred == 0)[0]
    loc_indices = np.where(y_pred > 0)[0]
    scatter_cls_distance = distance[scatter_indices, :][:, loc_indices]
    scatter_cls_min_distance = np.min(scatter_cls_distance, axis=1)
    scatter_cls_min_indices = np.argmin(scatter_cls_distance, axis=1)
    y_pred[scatter_indices] = y_pred[loc_indices[scatter_cls_min_indices]] * (scatter_cls_min_distance <= distance_min)
    # Plot the result.
    y_hat = np.array(y_pred)
    core_indices = np.zeros_like(y_hat, dtype=bool)
    core_indices[np.where(y_hat >= 1)] = True
    y_unique = np.unique(y_hat)
    n_clusters = y_unique.size - (1 if 0 in y_hat else 0)
    print(y_unique, 'number of clusters:', n_clusters)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
    for c, clr in zip(y_unique, clrs):
        cur = (y_hat == c)
        if c == 0:
            # Unassigned points are drawn small and in black.
            plt.scatter(data[cur, 0], data[cur, 1], s=10, c='k')
            continue
        plt.scatter(data[cur, 0], data[cur, 1], s=15, color=clr, edgecolors='k')
        plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1], s=30, color=clr,
                    marker='o', edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True, ls=':', color='#606060')
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
    print("end")
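The comment on density_min above admits that it is unclear how to choose the value. One hypothetical heuristic (my own suggestion, not from reference [2]) is to put the threshold at a low percentile of the observed densities, so that only genuinely sparse points fall below it; plotting np.sort(p) and looking for an elbow, in the spirit of the k-distance plot often used to pick DBSCAN's eps, is another option:

import numpy as np

def suggest_density_min(p, pct=10):
    # Hypothetical heuristic: the pct-th percentile of the computed
    # densities; tune pct by inspecting a plot of np.sort(p).
    return np.percentile(p, pct)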
3.3 Experimental Results
3.4 Performance Comparison (personally I do not find it very practical; the parameters are too hard to tune)
- Advantages:
  - Insensitive to noisy data
  - Does not depend on the choice of initial points
  - Can produce clusters of arbitrary shape
- Disadvantages:
  - The algorithm is complex and clustering is relatively slow
  - The density threshold must be fixed before running
  - Distance metrics do not work well on high-dimensional data
  - Unsuitable when densities vary greatly across the dataset, or when the overall density is essentially uniform