Hierarchical clustering（层次聚类，complete linkage）

最新推荐文章于 2024-10-21 06:15:55 发布

原创最新推荐文章于 2024-10-21 06:15:55 发布 · 4.2k 阅读

4 ·

CC 4.0 BY-SA版权

机器学习同时被 3 个专栏收录

6 篇文章

订阅专栏

数据挖掘

6 篇文章

订阅专栏

机器/深度学习

5 篇文章

订阅专栏

本文介绍了complete linkage分层聚类，它以簇间结点最长距离为簇间距，每次合并所有簇间距最短的两簇。给出了具体train流程，包括建立结点两两距离字典并排序，循环进行分层聚类，维护以簇间最长距离为簇间距的字典等，还指出其时间复杂度比single的要高。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

上一篇博客介绍了single linkage：以两簇结点之间的最短距离作为簇间距，并且每一次合并簇间距最短的那两个簇。

而complete linkage则是以两簇结点之间的最长距离作为簇间距，并且每一次合并簇间距最短的两个簇。因此实际上并不是始终找最长距离，而是先对每一对簇取最长的结点距离，再在所有簇对中取最短的那一个。

因此，在上一篇实现方式的基础上，complete linkage需要在每一次循环中维护一个字典：value为两簇之间的最长结点距离（即簇间距），key为这两个簇（根结点）的id。

具体train流程如下：

1. 建立字典distance_list：key为两个结点的id，value为这两个结点之间的距离；再按距离从小到大排序，得到列表rank_list。

2. 循环进行分层聚类：完整的聚类树需要合并（结点的个数-1）次，每一次合并两个簇；下面的代码在剩下两个簇时停止。

2.1 每一次大循环都维护一个字典：value为两簇之间的最长结点距离（即簇间距），key为这两个簇的id。取该字典中value最小的一项作为本次合并的linkage，合并对应的两个簇，并更新结点集。

2.2 上述字典在内循环中生成：每次都遍历rank_list中的所有结点对，如果该结点对所属的两个簇对应的key尚不在字典中，或者字典中该key的value小于此次遍历到的距离，则用此次的距离进行替换。

时间复杂度似乎比single linkage的要高。

具体代码如下：

# -*- coding: utf-8 -*
from __future__ import division
import numpy as np
import math

def euler_distance(a,b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    Despite the name, this is the Euclidean distance: the square root of
    the sum of squared element-wise differences.

    ``a`` and ``b`` must support ``a - b`` (for example numpy arrays of the
    same shape).
    """
    offset = a - b
    squared_total = np.sum(np.square(offset))
    return np.sqrt(squared_total)

class ClusterNode(object):
    """One node of the cluster tree.

    A node created with only ``id`` and ``data`` is a leaf (a single data
    point); a node created with ``left`` and ``right`` represents the merge
    of those two child nodes.
    """

    def __init__(self,left=None,right=None,distance=-1,count=1,id=None,father=None,data=None):
        """Store the given values as attributes, unchanged.

        left, right -- child nodes (None by default)
        distance    -- linkage distance recorded for this node (default -1)
        count       -- number of points recorded for this node (default 1)
        id          -- identifier of the node
        father      -- parent node, or None while the node has no parent
        data        -- payload of the node (None by default)
        """
        # identity and payload
        self.id = id
        self.data = data
        self.count = count
        self.distance = distance
        # tree links
        self.father = father
        self.left = left
        self.right = right



class Hierarchical(object):
    """Agglomerative (bottom-up) clustering with complete linkage.

    The distance between two clusters is the largest distance between any
    point of one and any point of the other; every step merges the two
    clusters whose complete-linkage distance is smallest.

    Attributes:
        k      -- value passed to the constructor. It is stored but
                  ``train`` does not read it: merging always stops when
                  two clusters remain.
        labels -- after ``train``, ``labels[i]`` is the label of point i.
        nodes  -- after ``train``, the leaf nodes followed by the merged
                  nodes in creation order; a node's id equals its index.
    """

    def __init__(self, k=1):
        assert k > 0
        self.k = k
        self.labels = None
        self.nodes = None

    @staticmethod
    def _find_root(node):
        """Follow ``father`` links up to the node that has no parent."""
        while node.father is not None:
            node = node.father
        return node

    def train(self, x):
        """Build the cluster tree for the points in ``x`` and label them.

        x -- sequence of points (e.g. a 2-D numpy array, one row per
             point); ``x[i]`` must be accepted by ``euler_distance``.

        Merging stops when two clusters remain, so ``len(x) - 2`` merges
        are performed (none when there are two points or fewer).  Each
        merge prints its linkage distance.  Results are stored in
        ``self.nodes`` and ``self.labels``; nothing is returned.
        """
        points_num = len(x)
        nodes = [ClusterNode(id=i, data=x[i]) for i in range(points_num)]
        self.labels = [-1] * points_num

        # All pairwise leaf distances, keyed by (smaller id, larger id).
        distance_list = {}
        for i in range(points_num - 1):
            for j in range(i + 1, points_num):
                distance_list[(i, j)] = euler_distance(nodes[i].data, nodes[j].data)
        # Sorted once, after the table is complete; the order only decides
        # ties between equally distant cluster pairs.
        rank_list = sorted(distance_list.items(), key=lambda item: item[1])

        # New nodes continue the numbering of the leaves so that a node's
        # id is always its index in ``nodes``.
        newnode_id_num = points_num
        for i in range(max(points_num - 2, 0)):
            # Current cluster (root node) of every leaf, resolved once per
            # merge instead of once per leaf pair.
            roots = [self._find_root(nodes[leaf]) for leaf in range(points_num)]

            complete_distance = {}
            for (leaf_a, leaf_b), dist in rank_list:
                root_a = roots[leaf_a].id
                root_b = roots[leaf_b].id
                if root_a == root_b:
                    # Both leaves already belong to the same cluster.
                    continue
                # Canonical key order: otherwise the same pair of clusters
                # can appear as (A, B) and (B, A), each holding the maximum
                # of only part of the leaf pairs.
                if root_a < root_b:
                    m_key = (root_a, root_b)
                else:
                    m_key = (root_b, root_a)
                if m_key not in complete_distance or complete_distance[m_key] < dist:
                    complete_distance[m_key] = dist

            # The pair of clusters with the smallest complete-linkage
            # distance is merged.
            (id1, id2), linkage = min(complete_distance.items(),
                                      key=lambda item: item[1])
            Node1 = nodes[id1]
            Node2 = nodes[id2]
            new_node = ClusterNode(left=Node1, right=Node2, distance=linkage,
                                   count=Node1.count + Node2.count,
                                   id=newnode_id_num)
            Node1.father = new_node
            Node2.father = new_node
            newnode_id_num += 1
            # Single-argument form prints the same text under Python 2 and 3.
            print('In loop %s the complete linkage is: %s' % (i, new_node.distance))
            nodes.append(new_node)

        self.nodes = nodes
        self.Label()

    def Label(self):
        """Assign labels by walking the nodes from newest to oldest.

        The newest node gets label 0, the next one 1, and so on; a leaf
        keeps the first label it receives.
        """
        for label, node in enumerate(reversed(self.nodes)):
            self.leaf_traversal(node, label)

    def leaf_traversal(self, node, label):
        """Give ``label`` to every still-unlabelled leaf below ``node``."""
        if node.left is None and node.right is None:
            if self.labels[node.id] == -1:
                self.labels[node.id] = label
        if node.left is not None:
            self.leaf_traversal(node.left, label)
        if node.right is not None:
            self.leaf_traversal(node.right, label)
def Label_Normalization(labels):
    """Rewrite ``labels`` in place as a 0/1 indicator of "label == 0".

    Every entry equal to 0 becomes 1.0 and every other entry becomes 0.0.
    The same (mutated) object is returned.
    """
    for position, value in enumerate(labels):
        labels[position] = 0.0 if value != 0 else 1.0
    return labels
def loadDataSet(fileName):
    """Read a whitespace-separated numeric data file.

    Every non-blank line holds the feature values followed by the label
    in the last column.

    fileName -- path of the text file to read.

    Returns ``(xArr, yArr)``: ``xArr`` is a list with one list of float
    features per line, ``yArr`` the list of float labels.

    Raises ValueError if a field cannot be converted to float.
    """
    xArr = []
    yArr = []
    # ``with`` closes the file even if a line fails to parse.
    with open(fileName) as data_file:
        for line in data_file:
            curLine = line.strip().split()
            if not curLine:
                # Blank lines carry no sample; skip them instead of
                # failing on the missing label column.
                continue
            xArr.append([float(value) for value in curLine[:-1]])
            yArr.append(float(curLine[-1]))
    return xArr, yArr

if __name__ == "__main__":
    # Demo: cluster the points of 'Hierarchical.txt' and compare the
    # resulting two clusters with the labels stored in the file.
    train_x, train_y = loadDataSet('Hierarchical.txt')
    samples = np.array(train_x)
    # NOTE(review): k is stored on the model but train() never reads it.
    Hierarchy = Hierarchical(k=26)
    print(samples.shape[0])

    Hierarchy.train(samples)
    labels = Label_Normalization(np.array(Hierarchy.labels))
    # Single-argument print calls produce the same output under Python 2
    # and Python 3.
    print('Final two clusters are:\n%s' % (labels,))
    print('Real labels are:\n%s' % (train_y,))

    # Accuracy: fraction of points whose normalized label equals the
    # label read from the file.
    matches = 0
    for predicted, actual in zip(labels, train_y):
        if predicted == actual:
            matches += 1
    print('While the clusters stop at 2 cluster like the real labels of these data, the accuracy is:\n%s'
          % (float(matches) / len(labels),))