层次聚类(自下而上的凝聚式)算法的思路很简单:
聚类结果是一棵类似二叉树的结构,
这棵树自下而上构建:
初始时每个样本各自为一个簇,每次合并距离最近的两个簇,直到 n 个簇合并为 1 个簇(树根)。
具体实现见下方代码。
另外增加了几个树的操作函数(求树的深度、统计叶子节点数)。
目前暂时没有实际应用示例,
后续会更新。
# -*- coding:utf-8 _*_
import numpy
def getMax(x, y):
    """Return the larger of ``x`` and ``y``.

    ``x`` is returned only when ``x > y`` holds; in every other case
    (including ties) ``y`` is returned.
    """
    return x if x > y else y
def getMin(x, y):
    """Return the smaller of ``x`` and ``y``.

    ``y`` is returned only when ``x > y`` holds; in every other case
    (including ties) ``x`` is returned.
    """
    return y if x > y else x
def loadData(fileName):
    """Read a whitespace-delimited numeric text file into a list of rows.

    Args:
        fileName: Path of the text file. Each non-blank line holds one
            sample written as whitespace-separated numbers.

    Returns:
        A list containing one ``list`` of ``float`` per non-blank line,
        in file order.

    Raises:
        IOError / OSError: If the file cannot be opened.
        ValueError: If a token cannot be parsed as a float.
    """
    data = []
    with open(fileName) as txtFile:
        # Iterate the file object directly rather than materialising every
        # line up front with readlines().
        for line in txtFile:
            tokens = line.split()
            if not tokens:
                # A blank line would otherwise be stored as an empty sample.
                continue
            # Build a real list: on Python 3 ``map`` returns a one-shot
            # iterator, which is not a usable row of coordinates.
            data.append([float(tok) for tok in tokens])
    return data
# Node of the cluster tree.
class clusterNode:
    """One node of the binary tree produced by the clustering.

    The constructor only stores its arguments on the instance.
    """

    def __init__(self, vec, left=None, right=None, id=None, distance=0):
        # id: caller-supplied identifier (None when not given).
        self.id = id
        # vec: coordinates represented by this node.
        self.vec = vec
        # distance: value supplied by the caller, 0 by default.
        self.distance = distance
        # left / right: child nodes; both None for a leaf.
        self.left, self.right = left, right
def L2Distance(v1, v2):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        v1: First vector; a numpy array or any array-like (list, tuple).
        v2: Second vector, with a shape compatible with ``v1``.

    Returns:
        For 1-D inputs, the distance as a numpy floating-point scalar.
        For higher-dimensional inputs the squared differences are summed
        along the first axis only, which is what the earlier builtin
        ``sum`` based implementation did.
    """
    # asarray lets plain lists/tuples be subtracted element-wise and is a
    # no-op for inputs that are already ndarrays.
    delta = numpy.asarray(v1) - numpy.asarray(v2)
    # numpy.sum reduces in native code instead of looping in Python.
    return numpy.sqrt(numpy.sum(delta ** 2, axis=0))
deep = 0  # deepest leaf level recorded so far by getDeep()


def getDeep(node, high):
    """Record the depth of the tree rooted at ``node`` in the global ``deep``.

    The subtree is walked recursively. Whenever a leaf (a node with neither
    a left nor a right child) is reached, ``deep`` is raised to that leaf's
    level if the level is larger. ``deep`` is never reset here, so it keeps
    the maximum over every call made so far.

    Side effect: the ``vec`` of every non-leaf node visited is printed.

    Args:
        node: Tree node exposing ``vec``, ``left`` and ``right`` attributes.
        high: Level of ``node``; pass 0 for the root.

    Returns:
        None. The result is read from the module-level ``deep``.
    """
    global deep
    if node.left is None and node.right is None:
        if high > deep:
            deep = high
        return
    # Single-argument call form prints the same text on Python 2 and 3.
    print(node.vec)
    for child in (node.left, node.right):
        # A node may have only one child; never recurse into None.
        if child is not None:
            getDeep(child, high + 1)
leavesNum = 0  # running count of leaves seen by getLeaves()


def getLeaves(node):
    """Add the number of leaves under ``node`` to the global ``leavesNum``.

    A leaf is a node whose ``left`` and ``right`` are both None. The counter
    is not reset here, so repeated calls keep accumulating.

    Args:
        node: Tree node exposing ``left`` and ``right`` attributes.

    Returns:
        None. The result is read from the module-level ``leavesNum``.
    """
    global leavesNum
    children = [kid for kid in (node.left, node.right) if kid is not None]
    if not children:
        leavesNum += 1
    for kid in children:
        getLeaves(kid)
def hiCluster(dataSet, calDistance=L2Distance):
    """Build a bottom-up (agglomerative) cluster tree over ``dataSet``.

    Every sample starts as its own leaf cluster. While more than one cluster
    remains, the pair whose vectors are closest under ``calDistance`` is
    merged into a new node. The new node's vector is the element-wise
    midpoint of the two merged vectors.

    Args:
        dataSet: Indexable collection of samples; ``dataSet[i]`` is turned
            into a numpy array and becomes leaf ``i``.
        calDistance: Callable taking two vectors and returning their
            distance. Defaults to ``L2Distance``.

    Returns:
        The root ``clusterNode``. Leaves carry ``id`` equal to their index
        in ``dataSet``. Merged nodes have ``id`` None, and ``distance`` set
        to the distance between their two children at merge time.

    Raises:
        IndexError: If ``dataSet`` is empty.
    """
    # One leaf cluster per sample.
    clust = [clusterNode(numpy.array(dataSet[i]), id=i)
             for i in range(len(dataSet))]
    if not clust:
        # Same exception type the bare ``clust[0]`` lookup used to raise,
        # but with a message that names the actual problem.
        raise IndexError("hiCluster() requires at least one sample")

    while len(clust) > 1:  # keep merging until a single root is left
        # Seed the search with the first pair, then scan every pair.
        lowestpair = (0, 1)
        closest = calDistance(clust[0].vec, clust[1].vec)
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                temp = calDistance(clust[i].vec, clust[j].vec)
                if closest > temp:
                    closest = temp
                    lowestpair = (i, j)

        # The pair is always stored as (i, j) with i < j.
        lo, hi = lowestpair
        first, second = clust[lo], clust[hi]

        # Midpoint of the two merged vectors, computed element-wise by numpy.
        newcluster = clusterNode((first.vec + second.vec) / 2.0,
                                 left=first, right=second, distance=closest)

        # Delete the higher index first so the lower one stays valid.
        del clust[hi]
        del clust[lo]
        clust.append(newcluster)

    # The only remaining entry is the root of the tree.
    return clust[0]
if __name__ == "__main__":
    # Demo run: cluster the samples in testSet.txt, then report the depth
    # and the number of leaves of the resulting tree.
    dataSet = loadData("testSet.txt")
    root = hiCluster(dataSet)
    # Single-argument print(...) calls emit the same text on Python 2 and 3,
    # whereas the print statement form is a SyntaxError on Python 3.
    print("node is")
    getDeep(root, 0)  # prints each internal node's vec and updates ``deep``
    print("deep is %s" % deep)
    getLeaves(root)  # updates ``leavesNum``
    print("leavesNum is %s" % leavesNum)