聚类算法之K均值算法C++实现

本文介绍了一种经典的聚类算法——K均值算法,并提供了详细的C++实现代码。通过实例展示了如何使用该算法对数据进行聚类,以及如何评估聚类效果。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

本算法为聚类算法之K均值算法,基本原理如下
K-均值聚类算法 
1.初始化:选择K个代表点
2.按欧氏距离,把剩余点归入离自己最近的代表点的集合 
3.计算每个集合的平均值点,作为新的K个代表点
4.若本次代表点和上次代表点没有变化,则输出聚类结果,否则回到2. 


下面是C++实现源码。程序中约定:若各个点到所属集合代表点的距离平方和,与上一次循环得到的距离平方和相差小于1,即认为代表点没有变化。

#include <math.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
// Number of clusters K used by the whole program.
// A typed constant is used instead of a lowercase macro so that the
// preprocessor cannot silently rewrite unrelated identifiers named `k`.
const int k = 3;
using namespace std;
// Attribute record of one tuple (sample): four flower measurements plus a
// record number.
// Every member has a default initializer so that an Iris that is filled in
// field by field (for example a centroid) never carries indeterminate values
// when it is copied.
struct Iris {
	float sepal_length = 0.0f;
	float sepal_width = 0.0f;
	float petal_length = 0.0f;
	float petal_width = 0.0f;
	int line = 0;  // record number of the tuple in the input; 0 when not set
};
// Returns the Euclidean distance between two tuples, computed over the four
// measurements (sepal length/width, petal length/width). The `line` field does
// not take part in the distance.
float getDistXY(Iris t1, Iris t2)
{
	const float dSepalLength = t1.sepal_length - t2.sepal_length;
	const float dSepalWidth = t1.sepal_width - t2.sepal_width;
	const float dPetalLength = t1.petal_length - t2.petal_length;
	const float dPetalWidth = t1.petal_width - t2.petal_width;
	// Every difference is multiplied by itself. The petal-width term used to be
	// (sepal-width difference) * (petal-width difference), which is not a square
	// and could make the sum negative, giving NaN after sqrt.
	return sqrt(dSepalLength * dSepalLength + dSepalWidth * dSepalWidth
		+ dPetalLength * dPetalLength + dPetalWidth * dPetalWidth);
}

// Decides which cluster a tuple belongs to: returns the index of the centroid
// in `means` (k entries) with the smallest getDistXY distance to `tuple`.
// When several centroids are equally near, the lowest index is returned.
int clusterOfTuple(Iris means[], Iris tuple) {
	int nearest = 0;  // index of the best centroid found so far
	float best = getDistXY(means[0], tuple);
	int idx = 1;
	while (idx < k) {
		const float candidate = getDistXY(means[idx], tuple);
		if (candidate < best) {
			best = candidate;
			nearest = idx;
		}
		++idx;
	}
	return nearest;
}
// Returns the squared error of the given set of clusters: the sum, over all k
// clusters, of the squared Euclidean distance between every member and the
// centroid of its cluster.
//   clusters - array of k clusters
//   means    - array of k centroids; means[i] is the centroid of clusters[i]
float getVar(vector<Iris> clusters[], Iris means[]) {
	float var = 0;
	for (int i = 0; i < k; i++)
	{
		// Bind by reference; copying the whole cluster here is wasted work.
		const vector<Iris>& members = clusters[i];
		for (vector<Iris>::size_type j = 0; j < members.size(); j++)
		{
			// getDistXY yields the plain distance, so square it: the criterion is
			// the sum of squared distances, not the sum of distances.
			const float dist = getDistXY(members[j], means[i]);
			var += dist * dist;
		}
	}
	return var;
}
// Returns the mean (centroid) of `cluster`: the per-field arithmetic mean of
// the four measurements of its members. The returned `line` is always 0.
// For an empty cluster an all-zero Iris is returned instead of dividing by
// zero, which would produce NaN coordinates.
Iris getMeans(const vector<Iris>& cluster) {
	Iris t;
	t.sepal_length = 0;
	t.sepal_width = 0;
	t.petal_length = 0;
	t.petal_width = 0;
	t.line = 0;
	if (cluster.empty())
		return t;

	// Accumulate in double to limit rounding error while summing floats.
	double sumSepalLength = 0, sumSepalWidth = 0, sumPetalLength = 0, sumPetalWidth = 0;
	for (vector<Iris>::size_type i = 0; i < cluster.size(); i++)
	{
		sumSepalLength += cluster[i].sepal_length;
		sumSepalWidth += cluster[i].sepal_width;
		sumPetalLength += cluster[i].petal_length;
		sumPetalWidth += cluster[i].petal_width;
	}
	const double num = static_cast<double>(cluster.size());
	t.sepal_length = static_cast<float>(sumSepalLength / num);
	t.sepal_width = static_cast<float>(sumSepalWidth / num);
	t.petal_length = static_cast<float>(sumPetalLength / num);
	t.petal_width = static_cast<float>(sumPetalWidth / num);
	return t;
}
// Writes all k clusters to the file "dataout.txt".
// Each cluster begins with a separator line of '+' characters, followed by one
// line per member holding the four measurements and the member's `line`
// value, separated by commas.
// If the file cannot be opened, a message is printed to cerr and nothing is
// written.
void output(vector<Iris> clusters[k])
{
	ofstream fout("dataout.txt", ios_base::out);
	if (!fout.is_open())
	{
		// Report the failure instead of silently producing no result file.
		cerr << "不能打开输出的文件dataout.txt" << endl;
		return;
	}
	// Iterate over all k clusters rather than a hard-coded count of 3.
	for (int j = 0; j < k; j++)
	{
		fout << "++++++++++++++++++++" << '\n';
		for (vector<Iris>::size_type i = 0; i != clusters[j].size(); i++)
		{
			const Iris& item = clusters[j][i];
			fout << item.sepal_length << ","
				<< item.sepal_width << ","
				<< item.petal_length << ","
				<< item.petal_width << "," << item.line << '\n';
		}
	}
	// fout is flushed and closed by its destructor.
}

void KMeans(vector<Iris> tuples) {
	vector<Iris> clusters[k];
	Iris means[k];
	int i = 0;
	//默认一开始将前K个元组的值作为k个簇的质心(均值)  
	for (i = 0; i<k; i++) {
		means[i].sepal_length = tuples[i].sepal_length;
		means[i].sepal_width = tuples[i].sepal_width;
		means[i].petal_length = tuples[i].petal_length;
		means[i].petal_width = tuples[i].petal_width;
	}
	int lable = 0;
	//根据默认的质心给簇赋值  
	for (i = 0; i != tuples.size(); ++i) {
		lable = clusterOfTuple(means, tuples[i]);
		clusters[lable].push_back(tuples[i]);
	}
/*	//输出刚开始的簇  
	for (lable = 0; lable<3; lable++) {
		cout << "第" << lable + 1 << "个簇:" << endl;
		vector<Iris> t = clusters[lable];
		for (i = 0; i< t.size(); i++)
		{
			cout << "(" << t[i].attr1 << "," << t[i].attr2 << ")" << "   ";
		}
		cout << endl;
	}*/
	float oldVar = -1;
	float newVar = getVar(clusters, means);
	while (abs(newVar - oldVar) >= 1) //当新旧函数值相差不到1即准则函数值不发生明显变化时,算法终止  
	{

		for (i = 0; i < k; i++) //更新每个簇的中心点  
		{
			means[i] = getMeans(clusters[i]);
			//cout<<"means["<<i<<"]:"<<means[i].attr1<<"  "<<means[i].attr2<<endl;  
		}
		oldVar = newVar;
		newVar = getVar(clusters, means); //计算新的准则函数值  
		for (i = 0; i < k; i++) //清空每个簇  
		{
			clusters[i].clear();
		}
		//根据新的质心获得新的簇  
		for (i = 0; i != tuples.size(); ++i) {
			lable = clusterOfTuple(means, tuples[i]);
			clusters[lable].push_back(tuples[i]);
		}
	}
/*	//输出当前的簇  
	int count = 0;
	for (lable = 0; lable<3; lable++) {
		cout << "第" << lable + 1 << "个簇:" << endl;
		vector<Iris> t = clusters[lable];
		for (i = 0; i< t.size(); i++)
		{
			count++;
			cout << "(" << t[i].sepal_length << "," << t[i].sepal_width << "," << t[i].petal_length << "," << t[i].petal_width << ")" << "   " << count << endl;
		}
		cout << endl;
	}
	*/
	output(clusters);
}

int main() {

	char fname[256]="data.txt";
	int line = 1;
	ifstream infile;
	infile.open(fname, ios::in);
	if (!infile) {
		cout << "不能打开输入的文件" << fname << endl;
		return 0;
	}
	int count = 0;
	vector<Iris> tuples;
	Iris tuple;
	//从文件流中读入数据  
	while (!infile.eof()) {
		count++;
		switch (count % 4)
		{
		case 1:infile >> tuple.sepal_length; break;
		case 2:infile >> tuple.sepal_width; break;
		case 3:infile >> tuple.petal_length; break;
		default:
			infile >> tuple.petal_width;
			tuple.line = line;
			line++;
			tuples.push_back(tuple);
			break;
		}
	}
	//int k;  
	//cout<<"请输入期望的簇的个数:"  
	//cin>>k;  
	//cout<<endl;  

	//输出文件中的元组信息  
//	for (vector<Iris>::size_type ix = 0; ix != tuples.size(); ++ix)
	//	cout << "(" << tuples[ix].attr1 << "," << tuples[ix].attr2 << ")" << "    ";
//	cout << endl;
	KMeans(tuples);
	system("pause");
	return 0;
}
测试数据,存入 data.txt。本测试数据中第1-50行是一类,第51-100行是一类,第101-150行是一类。
5.1 3.5 1.4 0.2 
4.9 3.0 1.4 0.2 
4.7 3.2 1.3 0.2 
4.6 3.1 1.5 0.2 
5.0 3.6 1.4 0.2 
5.4 3.9 1.7 0.4 
4.6 3.4 1.4 0.3 
5.0 3.4 1.5 0.2 
4.4 2.9 1.4 0.2 
4.9 3.1 1.5 0.1 
5.4 3.7 1.5 0.2 
4.8 3.4 1.6 0.2 
4.8 3.0 1.4 0.1 
4.3 3.0 1.1 0.1 
5.8 4.0 1.2 0.2 
5.7 4.4 1.5 0.4 
5.4 3.9 1.3 0.4 
5.1 3.5 1.4 0.3 
5.7 3.8 1.7 0.3 
5.1 3.8 1.5 0.3 
5.4 3.4 1.7 0.2 
5.1 3.7 1.5 0.4 
4.6 3.6 1.0 0.2 
5.1 3.3 1.7 0.5 
4.8 3.4 1.9 0.2 
5.0 3.0 1.6 0.2 
5.0 3.4 1.6 0.4 
5.2 3.5 1.5 0.2 
5.2 3.4 1.4 0.2 
4.7 3.2 1.6 0.2 
4.8 3.1 1.6 0.2 
5.4 3.4 1.5 0.4 
5.2 4.1 1.5 0.1 
5.5 4.2 1.4 0.2 
4.9 3.1 1.5 0.2 
5.0 3.2 1.2 0.2 
5.5 3.5 1.3 0.2 
4.9 3.6 1.4 0.1 
4.4 3.0 1.3 0.2 
5.1 3.4 1.5 0.2 
5.0 3.5 1.3 0.3 
4.5 2.3 1.3 0.3 
4.4 3.2 1.3 0.2 
5.0 3.5 1.6 0.6 
5.1 3.8 1.9 0.4 
4.8 3.0 1.4 0.3 
5.1 3.8 1.6 0.2 
4.6 3.2 1.4 0.2 
5.3 3.7 1.5 0.2 
5.0 3.3 1.4 0.2 
7.0 3.2 4.7 1.4 
6.4 3.2 4.5 1.5 
6.9 3.1 4.9 1.5 
5.5 2.3 4.0 1.3 
6.5 2.8 4.6 1.5 
5.7 2.8 4.5 1.3 
6.3 3.3 4.7 1.6 
4.9 2.4 3.3 1.0 
6.6 2.9 4.6 1.3 
5.2 2.7 3.9 1.4 
5.0 2.0 3.5 1.0 
5.9 3.0 4.2 1.5 
6.0 2.2 4.0 1.0 
6.1 2.9 4.7 1.4 
5.6 2.9 3.6 1.3 
6.7 3.1 4.4 1.4 
5.6 3.0 4.5 1.5 
5.8 2.7 4.1 1.0 
6.2 2.2 4.5 1.5 
5.6 2.5 3.9 1.1 
5.9 3.2 4.8 1.8 
6.1 2.8 4.0 1.3 
6.3 2.5 4.9 1.5 
6.1 2.8 4.7 1.2 
6.4 2.9 4.3 1.3 
6.6 3.0 4.4 1.4 
6.8 2.8 4.8 1.4 
6.7 3.0 5.0 1.7 
6.0 2.9 4.5 1.5 
5.7 2.6 3.5 1.0 
5.5 2.4 3.8 1.1 
5.5 2.4 3.7 1.0 
5.8 2.7 3.9 1.2 
6.0 2.7 5.1 1.6 
5.4 3.0 4.5 1.5 
6.0 3.4 4.5 1.6 
6.7 3.1 4.7 1.5 
6.3 2.3 4.4 1.3 
5.6 3.0 4.1 1.3 
5.5 2.5 4.0 1.3 
5.5 2.6 4.4 1.2 
6.1 3.0 4.6 1.4 
5.8 2.6 4.0 1.2 
5.0 2.3 3.3 1.0 
5.6 2.7 4.2 1.3 
5.7 3.0 4.2 1.2 
5.7 2.9 4.2 1.3 
6.2 2.9 4.3 1.3 
5.1 2.5 3.0 1.1 
5.7 2.8 4.1 1.3 
6.3 3.3 6.0 2.5 
5.8 2.7 5.1 1.9 
7.1 3.0 5.9 2.1 
6.3 2.9 5.6 1.8 
6.5 3.0 5.8 2.2 
7.6 3.0 6.6 2.1 
4.9 2.5 4.5 1.7 
7.3 2.9 6.3 1.8 
6.7 2.5 5.8 1.8 
7.2 3.6 6.1 2.5 
6.5 3.2 5.1 2.0 
6.4 2.7 5.3 1.9 
6.8 3.0 5.5 2.1 
5.7 2.5 5.0 2.0 
5.8 2.8 5.1 2.4 
6.4 3.2 5.3 2.3 
6.5 3.0 5.5 1.8 
7.7 3.8 6.7 2.2 
7.7 2.6 6.9 2.3 
6.0 2.2 5.0 1.5 
6.9 3.2 5.7 2.3 
5.6 2.8 4.9 2.0 
7.7 2.8 6.7 2.0 
6.3 2.7 4.9 1.8 
6.7 3.3 5.7 2.1 
7.2 3.2 6.0 1.8 
6.2 2.8 4.8 1.8 
6.1 3.0 4.9 1.8 
6.4 2.8 5.6 2.1 
7.2 3.0 5.8 1.6 
7.4 2.8 6.1 1.9 
7.9 3.8 6.4 2.0 
6.4 2.8 5.6 2.2 
6.3 2.8 5.1 1.5 
6.1 2.6 5.6 1.4 
7.7 3.0 6.1 2.3 
6.3 3.4 5.6 2.4 
6.4 3.1 5.5 1.8 
6.0 3.0 4.8 1.8 
6.9 3.1 5.4 2.1 
6.7 3.1 5.6 2.4 
6.9 3.1 5.1 2.3 
5.8 2.7 5.1 1.9 
6.8 3.2 5.9 2.3 
6.7 3.3 5.7 2.5 
6.7 3.0 5.2 2.3 
6.3 2.5 5.0 1.9 
6.5 3.0 5.2 2.0 
6.2 3.4 5.4 2.3 
5.9 3.0 5.1 1.8 

结果会写入 dataout.txt。同时,每一行数据后面都会写上该数据在原始数据中的行号,方便大家观察。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值