# K-Means Algorithm - Python Implementation

A basic K-Means clustering implementation using NumPy. It loads a tab-delimited data file, initializes random centroids within the bounds of each dimension, and then alternates between assigning points to the nearest centroid and recomputing centroids until no assignment changes.

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: TG
from numpy import *


def loadDataSet(fileName):
    """Load a tab-delimited data file; every column is assumed to be numeric."""
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))  # map all elements to float
        dataMat.append(fltLine)
    return dataMat


def distEclud(vecA, vecB):
    """Euclidean distance between two vectors."""
    return sqrt(sum(power(vecA - vecB, 2)))


def randCent(dataSet, k):
    """Build k random centroids within the bounds of each dimension."""
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))  # create centroid matrix
    for j in range(n):
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Standard K-Means clustering."""
    m = shape(dataSet)[0]
    # Column 0: index of the assigned centroid; column 1: squared error of that point
    clusterAssment = mat(zeros((m, 2)))
    # Create k points as the initial centroids
    centroids = createCent(dataSet, k)
    clusterChanged = True
    # Repeat while any point's cluster assignment changes
    while clusterChanged:
        clusterChanged = False
        # For each data point, assign it to the closest centroid
        for i in range(m):
            minDist = inf
            minIndex = -1
            # For each centroid, compute the distance to the point
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        # Recalculate the centroids
        for cent in range(k):
            # Get all the points assigned to this cluster
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # Move the centroid to the mean of those points
            centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
```
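The functions above can be exercised end to end on a small synthetic dataset. Below is a minimal sketch; the two-blob toy data and the choice of k=2 are illustrative assumptions, not part of the original code:

```python
from numpy import *

# Hypothetical toy data: two Gaussian blobs in 2-D (illustrative, not from the original post)
random.seed(0)
blob1 = random.randn(50, 2) + array([3.0, 3.0])
blob2 = random.randn(50, 2) + array([-3.0, -3.0])
dataMat = mat(vstack((blob1, blob2)))

# Run K-Means with k=2
centroids, clusterAssment = kMeans(dataMat, 2)
print("final centroids:\n", centroids)
# Column 1 of clusterAssment holds each point's squared distance to its centroid,
# so summing it gives the total within-cluster SSE
print("SSE:", sum(clusterAssment[:, 1]))
```

Because the initial centroids are chosen at random, repeated runs can converge to different local minima; the centroids printed after each iteration make the convergence behavior visible.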