#coding=utf8
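'''
Algorithm outline:
Input: k, data[n];
(1) Choose k initial centers, e.g. c[0]=data[0], ..., c[k-1]=data[k-1];
(2) For each of data[0] ... data[n-1], compare it with c[0] ... c[k-1]; if it is closest to c[i], label it i;
(3) For all points labelled i, recompute c[i] = {sum of all data[j] labelled i} / (number of points labelled i);
(4) Repeat (2) and (3) until the change in every c[i] is smaller than the given threshold.
Notes:
Distances are Euclidean.
The distance formula matters: different distance formulas make the iteration converge in different ways.
'''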
import time
import random
import math
import sys
class Kmeans:
    """K-means clustering of numeric points using Euclidean distance.

    The clustering runs inside the constructor; the final centers are
    stored in ``self.result``.

    Parameters:
        data: sequence of points; every point is a sequence of numbers with
            the same length as ``data[0]``.  The input is never modified.
        k: number of clusters; must be at least 1.
        error: convergence threshold.  Iteration stops once no center moved
            farther than this between two consecutive passes.
        verbose: when true (the default) print one trace line per point
            assignment, plus the old/new centers and the largest center
            movement after every pass.

    Attributes:
        result: list of ``k`` centers, each a list of coordinates.

    Raises:
        ValueError: if ``k`` is smaller than 1.
        SystemExit: if ``data`` holds fewer than ``k`` distinct points; a
            message is printed and the process exits after two seconds.
    """

    def __init__(self, data, k, error, verbose=True):
        self.data = data
        self.k = k
        self.error = error
        self.verbose = verbose
        self.result = self.__kmeans()

    def __kmeans(self):
        """Run the assign/update iteration and return the final centers."""
        if self.k < 1:
            raise ValueError("k must be at least 1")
        centers = self.__initCenters()
        dims = len(self.data[0])
        # Start above any finite threshold so at least one pass is made.
        shift = float("inf")
        while shift > self.error:
            previous = centers
            labels = [self.__getMin(previous, point) for point in self.data]
            counts = self.__getListNum(labels, self.k)

            # Accumulate per-cluster coordinate sums, then divide once.
            sums = [[0.0] * dims for _ in range(self.k)]
            for label, point in zip(labels, self.data):
                row = sums[label]
                for t in range(dims):
                    row[t] += point[t]

            centers = []
            for i in range(self.k):
                if counts[i] > 0:
                    centers.append([total / counts[i] for total in sums[i]])
                else:
                    # An empty cluster keeps its previous center instead of
                    # being dragged to the origin.
                    centers.append(list(previous[i]))

            # Converged only when *every* center moved at most `error`,
            # i.e. when the largest movement is within the threshold.
            shift = max(self.__distance(new, old)
                        for new, old in zip(centers, previous))
            if self.verbose:
                print("%s %s" % (previous, centers))
                print(shift)
        return centers

    def __initCenters(self):
        """Pick k distinct data points at random as the initial centers."""
        # Check up front that k distinct points exist, so the random draw
        # below is guaranteed to finish and never gives up by bad luck.
        distinct = []
        for point in self.data:
            if point not in distinct:
                distinct.append(point)
                if len(distinct) >= self.k:
                    break
        if len(distinct) < self.k:
            print("No more classes , The Program will end in 2 Seconds")
            time.sleep(2)
            sys.exit()

        centers = []
        while len(centers) < self.k:
            candidate = random.choice(self.data)
            if candidate not in centers:
                centers.append(candidate)
        return centers

    def __distance(self, first, second):
        """Return the Euclidean distance between two points."""
        return math.sqrt(sum((a - b) ** 2 for a, b in zip(first, second)))

    def __getMin(self, res, dat):
        """Return the index of the center in `res` that is closest to `dat`.

        Ties go to the lowest index.
        """
        index = 0
        minC = float("inf")  # no finite distance can exceed the sentinel
        for i, center in enumerate(res):
            distance = self.__distance(dat, center)
            if distance < minC:
                index = i
                minC = distance
        if self.verbose:
            print("%s %s %s %s" % (res, dat, minC, index))
        return index

    def __getListNum(self, lis1, k):
        """Count how many labels in `lis1` fall into each of the k clusters."""
        res = [0] * k
        for label in lis1:
            res[label] += 1
        return res
# Demo driver: cluster a small one-dimensional sample and print the elapsed
# time and the resulting centers.
# Every point is a one-element list; the sample is mostly repeats of 7 and
# 3.7765, plus single occurrences of 8, 9, 10 and 2.
data=[[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],[7],\
[8],[9],[10],[2],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],\
[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],[3.7765],\
[3.7765],[3.7765],[3.7765],[3.7765]]
# NOTE(review): time.clock() exists in Python 2 (which the print statements
# below require) but was removed in Python 3.8.
time1=time.clock()
# Request k=7 clusters with a convergence threshold of 0.
# NOTE(review): the sample holds only six distinct values (7, 8, 9, 10, 2,
# 3.7765), fewer than k, so Kmeans cannot collect 7 distinct initial centers
# and appears to end the program from its "No more classes" branch before the
# lines below run -- confirm whether that is intended.
k1=Kmeans(data,7,0)
time2=time.clock()
# Report the measured time and the final cluster centers.
print "time cost is :" ,time2-time1
print k1.result
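# Analysis:
# 1. Strong dependence on the initial values
#    (1) evenly distributed selection of the initial values
#    (2) clustering in advance (pre-clustering)
# 2. Choice of the number of iterations
#    (1) intra-class measure (distance)
#    (2) inter-class measure (distance)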