二分k均值原理及Python实现请见:
(1)http://blog.youkuaiyun.com/u013593585/article/details/51280052
(2)http://blog.youkuaiyun.com/u013593585/article/details/51263980 Python实现
C++代码:基于python的思路写的(求指点)
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
// A sample is stored as a vector of floats. main() writes a sample index into
// element 0 and the coordinates into elements 1..dims.
typedef vector<float> Tuple;
// Number of coordinates read per sample (elements 1..dims of a Tuple).
const int dims = 2;
// Number of sub-clusters produced by one doKmeans() call.
const int k = 2;
// main() keeps bisecting until it holds this many centroids.
const int bik = 4;
// Clusters `tuples` into two groups; writes the two centroids, the per-sample
// sub-cluster label and the per-sample squared distance to its centroid.
void doKmeans(vector<Tuple>& tuples,Tuple subcentroids[],vector<int>& SubclusterAssment1,Tuple& SubclusterAssment2);
// Appends every sample to the cluster whose mean is nearest.
void assignTuples(vector<Tuple> clusters[],vector<Tuple> tuples,Tuple means[]);
// Sum of squared coordinate differences between two samples (no square root).
float getDist(const Tuple& t1, const Tuple& t2);
// Total of getDist(sample, mean) over all k clusters.
float getVal(vector<Tuple> clusters[], Tuple means[]);
// Coordinate-wise mean of the samples in one cluster.
Tuple updateMeans(const vector<Tuple>& cluster_i);
// NOTE(review): declared here, but no definition or call is visible in this file.
void print(vector<Tuple> clustes[]);
int main()
{
//open the file
char filename[] = "testSet.txt";
fstream file(filename);
if(!file)
{
cout << "can not opnen the file!" << endl;
return 0;
}
//read the data读取数据
vector<Tuple> tuples;
int pos = 0;
while(!file.eof())
{
string str;
getline(file,str);
stringstream ss(str);
Tuple tuple(dims+1,0); //the first element in tuple indicates the index
tuple[0] = pos + 1;
for(int i = 1; i <= dims; i++)
ss >> tuple[i];
tuples.push_back(tuple);
}
vector<Tuple> centroids; // generated centroids 生成的聚类中心
vector<int> clusterAssment1; // samples index in different centroids 样本对应的簇索引
Tuple clusterAssment2; // samples se 每个样本到其聚类中心的平方误差
//initialize the first centroid初始化第一个聚类中心(取样本均值)
Tuple centroid(dims+1,0);
for(int i = 0; i < tuples.size();i++)
{
for(int j = 1; j <= dims; j++)
centroid[j] = centroid[j] + tuples[i][j];
}
for(int j = 1; j <= dims; j++)
centroid[j] /= tuples.size();
centroids.push_back(centroid);
//initialize the clusterAssment1and clusterAssmen2 初始化样本簇索引为0及计算样本到聚类中心的距离</span>
for(int i = 0; i < tuples.size(); i++)
{
clusterAssment1.push_back(0);
clusterAssment2.push_back(getDist(tuples[i],centroid));
}
//inviding二分
int test = 0;
while(centroids.size() < bik) //是否已经得到bik个聚类中心
{
cout << "centroids.size: " << centroids.size() << endl;
//compute the SSE of each centroid in centroids 对于每个簇计算其SSE平方和误差 及 对该簇二聚类下的 样本总聚类代价很熟,</span><span style="font-family:Arial, Helvetica, sans-serif;">进而选择对哪个簇进行二聚类?
Tuple minSubcentroid[2]; //最终选择的簇的两个子簇的聚类中心
vector<int> minSubclusterAssment1; //最终选择的簇的两个子簇的聚类中心的簇索引
Tuple minSubclusterAssment2; //最终选择的簇的两个子簇到各自聚类中心的距离
int minIndex; //最终选择的簇索引
int minSSE = 10000;
//for each cluster i 对于每个聚类中心进行操作
for(int i = 0; i < centroids.size(); i++)
{
//find the subdataset belonging to cluster i //记录属于该聚类中心的样本
vector<Tuple> SubdataSet;
for(int j = 0; j < tuples.size();j++)
{
if (clusterAssment1[j] == i)
SubdataSet.push_back(tuples[j]);
}
//2means clustering //对该簇进行二聚类
Tuple Subcentroids[2];
vector<int> SubclusterAssment1;
Tuple SubclusterAssment2;
doKmeans(SubdataSet,Subcentroids,SubclusterAssment1,SubclusterAssment2);
//computer the sum SSE of cluster i //二聚类后,计算对应SSE 及总的聚类代价
float SSEi = 0;
float SSEother = 0;
float SSEsum = 0
//该簇聚类后的SSE
for (int x = 0; x < SubclusterAssment2.size(); x++)
SSEi += SubclusterAssment2[x];
//其他簇的SSE
for (int y = 0; y < tuples.size(); y++)
{
if (clusterAssment1[y] != i)
SSEother += clusterAssment2[y];
}
SSEsum = SSEi + SSEother;
//find the cluster who has the minimal SSE
if( SSEsum < minSSE)
{
minSubcentroid[0] = Subcentroids[0];
minSubcentroid[1] = Subcentroids[1];
for (int ii = 0; ii < SubclusterAssment1.size(); ii++)
{
minSubclusterAssment1.push_back(SubclusterAssment1[ii]);
minSubclusterAssment2.push_back(SubclusterAssment2[ii]);
}
minIndex = i;
minSSE = SSEsum;
}
}
//update the centroids <span style="font-family: Arial, Helvetica, sans-serif;">//更新聚类中心 </span>
int yuan_Size = centroids.size();
centroids[minIndex].swap(minSubcentroid[0]); //minIndex簇二聚类后索引为0的子簇的聚类中心取代该簇原来的聚类中心,
centroids.push_back(minSubcentroid[1]); //minIndex簇二聚类后索引为1的子簇的聚类中心添加到后面
cout << "yuan minSubclusterAssment1: " << endl;
for(int jj = 0; jj < minSubclusterAssment1.size(); jj++)
cout << minSubclusterAssment1[jj] << " ";
cout << endl;
//update the clusterAssment //更新样本的簇索引
//更新两个新簇的索引
for(int s = 0; s < minSubclusterAssment1.size(); s++)
{
int temp = minSubclusterAssment1[s];
if (temp == 1)
minSubclusterAssment1[s] = yuan_Size;
else
minSubclusterAssment1[s] = minIndex;
}
cout << "minIndex: " << minIndex << endl;
cout << "updated minSubclusterAssment1: " << endl;
for(int jj = 0; jj < minSubclusterAssment1.size(); jj++)
cout << minSubclusterAssment1[jj] << " ";
cout << endl;
//更新样本簇索引,将样本簇索引为minIndex的全部换成两个新子簇的索引
int count = 0;
for(int a = 0; a < clusterAssment1.size(); a++)
{
if(clusterAssment1[a] == minIndex)
{
clusterAssment1[a] = minSubclusterAssment1[count];
count++;
}
}
cout << "clusterAssment1: " << endl;
for(int jj = 0; jj < clusterAssment1.size(); jj++)
cout << clusterAssment1[jj] << " ";
cout << endl;
}
cout << "biKmenas is done!" << endl;
system("pause");
return 0;
}
/**
 * Splits `tuples` into two sub-clusters with k-means.
 *
 * @param tuples              samples to cluster (coordinates in elements 1..dims)
 * @param subcentroids        out: array of at least 2 Tuples receiving the two centroids
 * @param SubclusterAssment1  out: for each sample, 0 or 1 = index of its nearest centroid
 * @param SubclusterAssment2  out: for each sample, squared distance to that centroid
 *
 * Both output vectors are cleared first and end up with tuples.size() entries.
 * For an empty input the centroids are set to all-zero tuples and the output
 * vectors stay empty.
 */
void doKmeans(vector<Tuple>& tuples,Tuple subcentroids[],vector<int>& SubclusterAssment1,Tuple& SubclusterAssment2)
{
    SubclusterAssment1.clear();
    SubclusterAssment2.clear();

    const size_t n = tuples.size();
    if (n == 0)
    {
        // Nothing to cluster; also avoids rand() % 0 below.
        subcentroids[0] = Tuple(dims + 1, 0);
        subcentroids[1] = Tuple(dims + 1, 0);
        return;
    }

    // Seed the generator only once: reseeding with time() on every call hands
    // out the same "random" picks to all calls made within the same second.
    static bool seeded = false;
    if (!seeded)
    {
        srand(static_cast<unsigned>(time(NULL)));
        seeded = true;
    }

    vector<Tuple> clusters[k];
    Tuple means[k];

    // Pick the initial means from distinct samples where possible (partial
    // Fisher-Yates shuffle), so that two means do not start on the same sample.
    vector<size_t> order(n);
    for (size_t i = 0; i < n; i++)
        order[i] = i;
    for (int i = 0; i < k; i++)
    {
        const size_t slot = static_cast<size_t>(i);
        if (slot < n)
        {
            const size_t pick = slot + rand() % (n - slot);
            const size_t tmp = order[slot];
            order[slot] = order[pick];
            order[pick] = tmp;
            means[i] = tuples[order[slot]];
        }
        else
        {
            // Fewer samples than means: reuse a random sample.
            means[i] = tuples[rand() % n];
        }
    }

    assignTuples(clusters, tuples, means);
    double newVal = getVal(clusters, means);
    double oldVal = -1;

    // Lloyd iterations: recompute the means, reassign the samples, and stop
    // once the total squared error changes by at most 1. The iteration cap is
    // only a safety net.
    const int maxIterations = 100;
    for (int iter = 0; iter < maxIterations && std::fabs(newVal - oldVal) > 1; iter++)
    {
        for (int i = 0; i < k; i++)
        {
            // An empty cluster keeps its previous mean instead of becoming 0/0.
            if (!clusters[i].empty())
                means[i] = updateMeans(clusters[i]);
        }
        // assignTuples() appends, so the old membership must be dropped first.
        for (int i = 0; i < k; i++)
            clusters[i].clear();
        assignTuples(clusters, tuples, means);
        oldVal = newVal;
        newVal = getVal(clusters, means);
    }

    subcentroids[0] = means[0];
    subcentroids[1] = means[1];

    // Final label and squared error per sample, in input order.
    SubclusterAssment1.reserve(n);
    SubclusterAssment2.reserve(n);
    for (size_t i = 0; i < n; i++)
    {
        const float dist0 = getDist(tuples[i], means[0]);
        const float dist1 = getDist(tuples[i], means[1]);
        int label = 0;
        float dist = dist0; // kept as float: an int would truncate the error
        if (dist1 < dist0)
        {
            dist = dist1;
            label = 1;
        }
        SubclusterAssment1.push_back(label);
        SubclusterAssment2.push_back(dist);
    }
}
/**
 * Appends each sample to the cluster whose mean is closest.
 *
 * @param clusters  array of k sample lists; samples are appended, existing
 *                  content is left in place
 * @param tuples    samples to distribute
 * @param means     array of k cluster means
 *
 * Ties go to the lower cluster index because only a strictly smaller
 * distance replaces the current best.
 */
void assignTuples(vector<Tuple> clusters[], vector<Tuple> tuples,Tuple means[])
{
    for (vector<Tuple>::const_iterator it = tuples.begin(); it != tuples.end(); ++it)
    {
        const Tuple& sample = *it;
        int nearest = 0;
        double best = getDist(sample, means[0]);
        int candidate = 1;
        while (candidate < k)
        {
            const double d = getDist(sample, means[candidate]);
            if (d < best)
            {
                best = d;
                nearest = candidate;
            }
            ++candidate;
        }
        clusters[nearest].push_back(sample);
    }
}
/**
 * Squared Euclidean distance between two samples.
 *
 * Only elements 1..dims are compared; element 0 is ignored. No square root
 * is taken.
 *
 * @return sum over the coordinates of (t1[d] - t2[d])^2
 */
float getDist(const Tuple& t1, const Tuple& t2)
{
    float acc = 0;
    int d = 1;
    while (d <= dims)
    {
        const float diff = t1[d] - t2[d];
        acc += diff * diff;
        ++d;
    }
    return acc;
}
/**
 * Total squared error of a clustering.
 *
 * @param clusters  array of k sample lists
 * @param means     array of k cluster means
 * @return sum of getDist(sample, means[i]) over every sample of every cluster i
 */
float getVal(vector<Tuple> clusters[], Tuple means[])
{
    float val = 0;
    for (int i = 0; i < k; i++)
    {
        // Bind by reference: copying the whole cluster just to read it is wasted work.
        const vector<Tuple>& members = clusters[i];
        for (size_t j = 0; j < members.size(); j++)
            val += getDist(members[j], means[i]);
    }
    return val;
}
/**
 * Coordinate-wise mean of the samples in one cluster.
 *
 * @param cluster_i  samples of the cluster (coordinates in elements 1..dims)
 * @return a Tuple of dims+1 elements; element 0 is 0, elements 1..dims hold
 *         the mean of each coordinate. For an empty cluster an all-zero Tuple
 *         is returned instead of dividing 0 by 0.
 */
Tuple updateMeans(const vector<Tuple>& cluster_i)
{
    Tuple t(dims + 1, 0);
    if (cluster_i.empty())
        return t; // no samples: avoid producing NaN coordinates

    for (size_t i = 0; i < cluster_i.size(); i++)
        for (int j = 1; j <= dims; j++)
            t[j] += cluster_i[i][j];
    for (int j = 1; j <= dims; j++)
        t[j] /= cluster_i.size();
    return t;
}
实现结果: