C++实现:
#include <bits/stdc++.h>
using namespace std;
const int MAX = 1e6+10;      // capacity for points and clusters
struct hh{                   // a 2-D point
    double x,y;
}a[MAX],b[MAX],c[MAX];       // a: input points, b: current cluster centres, c: freshly recomputed means
int k,n;                     // k: number of clusters (user input), n: number of points read
vector<int> vv[MAX];         // vv[i]: indices (into a[]) of points currently assigned to cluster i
// Read one "x y" pair per line from texts.txt into a[1..n] and set n.
// Fixes vs. the original: `index` was used uninitialized when a line had
// no space (undefined behaviour), and the manual digit-by-digit parse
// only handled non-negative integers even though hh stores doubles.
void init(){
    ifstream in("texts.txt");
    string s;
    int cnt = 0;
    while(getline(in, s)){   // read line by line until EOF
        istringstream iss(s);
        double x, y;
        if(iss >> x >> y){   // skip blank or malformed lines instead of UB
            cnt++;
            a[cnt].x = x;
            a[cnt].y = y;
        }
    }
    n = cnt;
}
bool f = false;   // set to true once the clustering has converged
// Recompute every cluster's mean into c[] from the current assignment vv[],
// and report whether each mean is exactly equal to the centre stored in b[]
// (exact comparison is the convergence test: identical assignments produce
// bitwise-identical means).
bool judge(){
    bool converged = true;
    for(int i = 1; i <= k; i++){
        double sx = 0;
        double sy = 0;
        for(int idx : vv[i]){
            sx += a[idx].x;
            sy += a[idx].y;
        }
        if(!vv[i].empty()){
            sx /= vv[i].size();
            sy /= vv[i].size();
            c[i].x = sx;
            c[i].y = sy;
        }
        if(sx != b[i].x || sy != b[i].y)
            converged = false;
    }
    return converged;
}
// One round of Lloyd's k-means over points a[1..n]; x == 0 is the initial
// seeding pass, x > 0 are refinement passes. Recurses until judge() reports
// that the centres stopped moving (flag f).
void kms(int x){
    if(f) return;   // already converged
    if(x==0){
        // Seeding pass.
        // NOTE(review): i runs over k+1..n but writes b[i-k] and vv[i-k],
        // i.e. indices 1..n-k. When n > 2k this spills past the k centres
        // b[1..k], and points pushed into vv[>k] are never used or cleared
        // again — this looks like a bug; confirm the intended seeding.
        for (int i = k+1; i <= n;i++){
            b[i-k].x=a[i-k].x;        // centre i-k starts at point i-k
            b[i-k].y=a[i-k].y;
            vv[i-k].push_back(i-k);   // seed point belongs to its own cluster
            double minn=1e9;          // assumes squared distances < 1e9 — fine for the bundled data, verify for larger inputs
            int index=0;
            // Assign point i to the nearest of the first k points
            // (squared Euclidean distance; sqrt unnecessary for an argmin).
            for (int j = 1; j <= k;j++){
                double ww=(a[i].x-a[j].x)*(a[i].x-a[j].x)+(a[i].y-a[j].y)*(a[i].y-a[j].y);
                if(ww<minn){
                    minn=ww;
                    index=j;
                }
            }
            vv[index].push_back(i);
        }
        f=judge();   // recompute means into c[] and test convergence
        if(f) return;
        else{
            // Not converged: adopt the new means as centres and iterate.
            for (int i = 1; i <= k;i++){
                vv[i].clear();
                b[i].x=c[i].x;
                b[i].y=c[i].y;
            }
            kms(x+1);
        }
    }
    else{
        // Refinement pass: assign every point to its nearest centre b[1..k].
        for (int i = 1; i <= n;i++){
            double minn=1e9;
            int index=0;
            for (int j = 1; j <= k;j++){
                double ww=(a[i].x-b[j].x)*(a[i].x-b[j].x)+(a[i].y-b[j].y)*(a[i].y-b[j].y);
                if(ww<minn){
                    minn=ww;
                    index=j;
                }
            }
            vv[index].push_back(i);
        }
        f=judge();
        if(f) return;
        else{
            // Same convergence handling as the seeding branch.
            for (int i = 1; i <= k;i++){
                vv[i].clear();
                b[i].x=c[i].x;
                b[i].y=c[i].y;
            }
            kms(x+1);
        }
    }
}
// Entry point: load the data, read k from the user, run k-means, then print
// each cluster's centre followed by the indices of its member points.
int main(){
    init();
    printf("请输入k:");
    scanf("%d",&k);
    kms(0);
    for(int cluster = 1; cluster <= k; cluster++){
        printf("%f %f\n", c[cluster].x, c[cluster].y);
        for(int idx : vv[cluster])
            printf("%d ", idx);
        puts("");
    }
    return 0;
}
Python实现:
# encoding: utf-8
#导入texts.txt中的数据,利用K均值算法对其进行分类
from numpy import *
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Read whitespace-separated numeric rows from *fileName* into a matrix.

    Fixes vs. the original: split('\\t') could not parse the bundled
    texts.txt (space-separated, so float("1 1") raised ValueError); split()
    accepts tabs and spaces alike. The file is now closed deterministically
    and blank lines are skipped.
    """
    dataSet = []
    with open(fileName) as f:
        for line in f:
            fields = line.split()      # any whitespace: tabs or spaces
            if fields:                 # ignore blank lines
                dataSet.append([float(v) for v in fields])
    return mat(dataSet)                # return as a numpy matrix
#求两个向量的欧式距离的平方(未开根号)
def distEclud(vecA, vecB):
    """Squared Euclidean distance between two row vectors (no sqrt taken)."""
    gap = vecA - vecB
    return sum(power(gap, 2))
def randCent(dataSet, k):
    """Draw k random centroids uniformly inside dataSet's bounding box."""
    numCols = shape(dataSet)[1]               # dimensionality of the points
    center = mat(zeros((k, numCols)))
    for col in range(numCols):
        lo = min(dataSet[:, col])             # column minimum
        span = float(max(dataSet[:, col]) - lo)  # column range
        # one vectorized draw of k values in [lo, lo + span)
        center[:, col] = lo + span * random.rand(k, 1)
    return center
def KMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Classic Lloyd iteration.

    Returns (centroids, assignment) where assignment[:, 0] is each row's
    cluster index and assignment[:, 1] its (squared) distance to that
    cluster's centroid.
    """
    numPoints = shape(dataSet)[0]
    assignment = mat(zeros((numPoints, 2)))   # col 0: cluster id, col 1: distance
    center = createCent(dataSet, k)
    changed = True
    while changed:
        changed = False
        # Assignment step: attach every point to its closest centroid.
        for row in range(numPoints):
            best = inf
            bestIdx = -1
            for cl in range(k):
                d = distMeas(center[cl, :], dataSet[row, :])
                if d < best:
                    best = d
                    bestIdx = cl
            if assignment[row, 0] != bestIdx:
                changed = True                # a point moved -> keep iterating
            assignment[row, :] = bestIdx, best
        # Update step: move each centroid to the mean of its members.
        for cl in range(k):
            members = dataSet[nonzero(assignment[:, 0].A == cl)[0]]
            center[cl, :] = mean(members, axis=0)   # column-wise mean
    return center, assignment
def showCluster(dataSet, k, initM, center):
    """Scatter-plot each cluster in its own colour/marker plus the centroids.

    dataSet: (m, 2) matrix of points; initM[:, 0] holds each point's cluster
    index; center: (k, 2) matrix of centroids.

    Fix vs. the original: zip() against 4 hard-coded colours silently
    dropped every cluster beyond the fourth when k > 4; the palette is now
    cycled by modulo (identical output for k <= 4).
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.title("K-means")
    colors = ['r', 'g', 'b', 'y']
    markers = ['^', 'o', '*', 's']
    for cent in range(k):
        # points assigned to cluster `cent`
        ptsInClust = dataSet[nonzero(initM[:, 0].A == cent)[0]]
        ax.scatter(array(ptsInClust[:, 0]), array(ptsInClust[:, 1]), s=80,
                   c=colors[cent % len(colors)],
                   marker=markers[cent % len(markers)])
    # centroids drawn last, on top of the data points
    ax.scatter(array(center[:, 0]), array(center[:, 1]), s=1000, c='black',
               marker='+', alpha=1)
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    plt.show()
if __name__ == "__main__":
    # Load the sample points, ask the user for k, run k-means, plot the result.
    dataSet = loadDataSet('texts.txt')
    print("请输入k:")                    # prompt for the number of clusters
    k=int(input())
    center, initM = KMeans(dataSet, k)
    showCluster(dataSet, k, initM,center)  # visualise clusters (adapted from a web example)
数据:texts.txt
1 1
2 1
1 2
2 2
4 3
5 3
4 4
5 4
优化biKmeans
# encoding: utf-8
#导入texts.txt中的数据,利用K均值算法对其进行分类
from numpy import *
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Read whitespace-separated numeric rows from *fileName* into a matrix.

    Fixes vs. the original: split('\\t') could not parse the bundled
    texts.txt (space-separated, so float("1 1") raised ValueError); split()
    accepts tabs and spaces alike. The file is now closed deterministically
    and blank lines are skipped.
    """
    dataSet = []
    with open(fileName) as f:
        for line in f:
            fields = line.split()      # any whitespace: tabs or spaces
            if fields:                 # ignore blank lines
                dataSet.append([float(v) for v in fields])
    return mat(dataSet)                # return as a numpy matrix
#求两个向量的欧式距离的平方(未开根号)
def distEclud(vecA, vecB):
    """Squared Euclidean distance between two row vectors (no sqrt taken)."""
    gap = vecA - vecB
    return sum(power(gap, 2))
def randCent(dataSet, k):
    """Draw k random centroids uniformly inside dataSet's bounding box."""
    numCols = shape(dataSet)[1]               # dimensionality of the points
    center = mat(zeros((k, numCols)))
    for col in range(numCols):
        lo = min(dataSet[:, col])             # column minimum
        span = float(max(dataSet[:, col]) - lo)  # column range
        # one vectorized draw of k values in [lo, lo + span)
        center[:, col] = lo + span * random.rand(k, 1)
    return center
def KMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Classic Lloyd iteration.

    Returns (centroids, assignment) where assignment[:, 0] is each row's
    cluster index and assignment[:, 1] its (squared) distance to that
    cluster's centroid.
    """
    numPoints = shape(dataSet)[0]
    assignment = mat(zeros((numPoints, 2)))   # col 0: cluster id, col 1: distance
    center = createCent(dataSet, k)
    changed = True
    while changed:
        changed = False
        # Assignment step: attach every point to its closest centroid.
        for row in range(numPoints):
            best = inf
            bestIdx = -1
            for cl in range(k):
                d = distMeas(center[cl, :], dataSet[row, :])
                if d < best:
                    best = d
                    bestIdx = cl
            if assignment[row, 0] != bestIdx:
                changed = True                # a point moved -> keep iterating
            assignment[row, :] = bestIdx, best
        # Update step: move each centroid to the mean of its members.
        for cl in range(k):
            members = dataSet[nonzero(assignment[:, 0].A == cl)[0]]
            center[cl, :] = mean(members, axis=0)   # column-wise mean
    return center, assignment
def biKmeans(dataSet, k, distMeas=distEclud):
    """Bisecting k-means: start from one cluster and repeatedly 2-means-split
    the cluster whose split lowers total SSE the most, until k clusters exist.

    Returns (centroid matrix, m x 2 assignment matrix: cluster id, squared
    distance to that cluster's centroid).
    """
    m = shape(dataSet)[0]                      # number of data rows
    initM = mat(zeros((m,2)))                  # per-point (cluster id, SSE contribution)
    centroid0 = mean(dataSet, axis=0)          # single initial centroid: the global mean
    centList =[centroid0]
    for j in range(m):                         # initial SSE of every point vs. centroid0
        initM[j,1] = distMeas(mat(centroid0), dataSet[j,:])
    while (len(centList) < k):                 # grow until k clusters exist
        lowestSSE = inf
        for i in range(len(centList)):
            # candidate: 2-means split of cluster i
            ptsInCurrCluster = dataSet[nonzero(initM[:,0].A==i)[0],:]
            centroidMat, splitClustAss = KMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = sum(splitClustAss[:,1])                       # SSE of the split cluster
            sseNotSplit = sum(initM[nonzero(initM[:,0].A!=i)[0],1])  # SSE of everything else
            print ("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:   # keep the best split so far
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # NOTE(review): if no candidate beat lowestSSE (e.g. KMeans produced a
        # nan SSE from an empty sub-cluster), bestCentToSplit is unbound here
        # and this raises NameError — confirm inputs always allow a split.
        # Relabel: sub-cluster 1 becomes a brand-new cluster id; sub-cluster 0
        # keeps the id of the cluster that was split. Order matters.
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
        print ('the bestCentToSplit is: ',bestCentToSplit)
        print ('the len of bestClustAss is: ', len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]   # replace the split centroid
        centList.append(bestNewCents[1,:].tolist()[0])              # append the new centroid
        initM[nonzero(initM[:,0].A == bestCentToSplit)[0],:]= bestClustAss  # write back ids + SSE
    return mat(centList), initM
def showCluster(dataSet, k, initM, center):
    """Scatter-plot each cluster in its own colour/marker plus the centroids.

    dataSet: (m, 2) matrix of points; initM[:, 0] holds each point's cluster
    index; center: (k, 2) matrix of centroids.

    Fix vs. the original: zip() against 4 hard-coded colours silently
    dropped every cluster beyond the fourth when k > 4; the palette is now
    cycled by modulo (identical output for k <= 4).
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.title("K-means")
    colors = ['r', 'g', 'b', 'y']
    markers = ['^', 'o', '*', 's']
    for cent in range(k):
        # points assigned to cluster `cent`
        ptsInClust = dataSet[nonzero(initM[:, 0].A == cent)[0]]
        ax.scatter(array(ptsInClust[:, 0]), array(ptsInClust[:, 1]), s=80,
                   c=colors[cent % len(colors)],
                   marker=markers[cent % len(markers)])
    # centroids drawn last, on top of the data points
    ax.scatter(array(center[:, 0]), array(center[:, 1]), s=1000, c='black',
               marker='+', alpha=1)
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    plt.show()
if __name__ == "__main__":
    # Load the sample points, ask the user for k, run bisecting k-means, plot.
    dataSet = loadDataSet('texts.txt')
    print("请输入k:")                        # prompt for the number of clusters
    k=int(input())
    #center, initM = KMeans(dataSet, k)
    center, initM = biKmeans(dataSet, k)     # bisecting k-means (adapted from a web example)
    print(center)                            # final centroids
    print(initM)                             # per point: nearest-centroid index, squared distance to it
    showCluster(dataSet, k, initM,center)    # visualise the result
数据:texts.txt
1 1
2 1
1 2
2 2
4 3
5 3
4 4
5 4
22万+

被折叠的 条评论
为什么被折叠?



