聚类算法-DBSCAN-C++优化算法
在聚类算法-DBSCAN-C++实现的基础上做了优化,优化前本人测试对900左右个点进行聚类,用时2000ms左右,经过优化后的算法用时在100ms左右,提升了近20倍。
优化前主要耗时的地方:
// Distance between a and b: the square root of the summed squared coordinate
// differences (despite the name, the result is not squared).
float squareDistance(point a,point b){
    const float dx = a.x-b.x;
    const float dy = a.y-b.y;
    return sqrt(dx*dx+dy*dy);
}
这个函数涉及到平方和再开方,在做聚类时,每一次循环都在调用它,并且聚类算法中含有很多次循环,还有两层for循环嵌套的地方也在调用这个函数,所以计算起来很耗时。
优化思路:
- 减少 squareDistance(point a,point b) 的计算量:
  假设有900个点,在优化前的算法中对这900个点中每两个点之间都会计算距离,因为有很多个for循环,所以做了很多次900个点的距离计算。优化的思路就是减少 squareDistance(point a,point b) 的运算量,将每个点的x的平方和y的平方提前算出来,保存起来,用的时候直接取出来使用。
- 将for循环的局部变量变为全局变量,用空间换时间(对空间要求有限制的这里可以不改)
- 将i++换为++i
对于迭代器或自定义类型,i++需要先保存一份旧值的副本再自增,循环次数比较大的时候++i会比i++省时间;对于int这类内置类型,编译器优化后两者通常没有差别(具体原理可以参考++i和i++的具体实现)
代码实现:
#include <iostream>
#include <sstream>
#include <fstream>
#include <vector>
#include <ctime>
#include <cstdlib>
#include <limits>
#include <cmath>
#include <stack>
using namespace std;
// File-scope loop counters, declared once here instead of inside each loop.
int i;
int j;
// File-scope scratch buffers meant to cache squared coordinates (x*x, y*y):
// datax/datay per input point, cdatax/cdatay per core point.
// Each buffer has a fixed capacity of 1000 entries, so any code that indexes
// them by point number must not be given more than 1000 points.
float datax[1000];
float datay[1000];
float cdatax[1000];
float cdatay[1000];
/**
 * A 2-D sample plus the per-point bookkeeping used while clustering.
 *
 * All members are public and carry in-class defaults, so a default-constructed
 * point is a fully initialised noise point at the origin.
 */
class point{
public:
    float x = 0;               // x coordinate
    float y = 0;               // y coordinate
    int cluster = 0;           // cluster id assigned to this point
    int pointType = 1;         // 1 noise 2 border 3 core
    int pts = 0;               // number of neighbouring points counted so far
    std::vector<int> corepts;  // indices of neighbouring core points
    int visited = 0;           // 1 once the point has been reached during cluster joining

    point() = default;

    /**
     * @param a x coordinate
     * @param b y coordinate
     * @param c initial cluster id
     *
     * The remaining members keep their in-class defaults (noise, no
     * neighbours, not visited).
     */
    point(float a, float b, int c) : x(a), y(b), cluster(c) {}
};
// Reads one float from the start of the text via stream extraction.
// The result starts at 0, so text that does not begin with a number gives 0.
float stringToFloat(string i){
    float parsed = 0;
    istringstream reader(i);
    reader >> parsed;
    return parsed;
}
vector<point> openFile(const char* dataset){
fstream file;
file.open(dataset,ios::in);
if(!file)
{
cout <<"Open File Failed!" <<endl;
vector<point> a;
return a;
}
vector<point> data;
int i=1;
while(!file.eof()){
string temp;
file>>temp;
int split = temp.find(',',0);
point p(stringToFloat(temp.substr(0,split)),stringToFloat(temp.substr(split+1,temp.length()-1)),i++);
data.push_back(p);
}
file.close();
cout<<"successful!"<<endl;
return data;
}
/*
float squareDistance(point a,point b){
return sqrt((a.x-b.x)*(a.x-b.x)+(a.y-b.y)*(a.y-b.y));
}
*/
void DBSCAN(vector<point> dataset,float Epss,int MinPts){
float Eps = Epss*Epss;
int len = dataset.size();
vector<point> corePoint;
//calculate pts
cout<<"calculate pts"<<endl;
for(i=0;i<len;++i)
{
datax[i] = dataset[i].x * dataset[i].x;
datay[i] = dataset[i].y * dataset[i].y;
}
for(i=0;i<len;i++){
for(j=i+1;j<len;j++){
if((datax[i]+datay[i]+datax[j]+datay[j]-2 * dataset[i].x * dataset[j].x-2 * dataset[i].y * dataset[j].y)<Eps){
dataset[i].pts++;
dataset[j].pts++;
}
}
if(dataset[i].pts>=MinPts) {
dataset[i].pointType = 3;
corePoint.push_back(dataset[i]);
}
}
cout<<"joint core point"<<endl;
//joint core point
for(i=0;i<corePoint.size();++i){
cdatax[i] = corePoint[i].x * corePoint[i].x;
cdatay[i] = corePoint[i].y * corePoint[i].y;
}
for(i=0;i<corePoint.size();i++){
for(j=i+1;j<corePoint.size();j++){
if((cdatax[i]+cdatay[i]+cdatax[j]+cdatay[j]-2 * corePoint[i].x * corePoint[j].x-2 * corePoint[i].y * corePoint[j].y)<Eps){
corePoint[i].corepts.push_back(j);
corePoint[j].corepts.push_back(i);
}
}
}
for(i=0;i<corePoint.size();i++){
stack<point*> ps;
if(corePoint[i].visited == 1) continue;
ps.push(&corePoint[i]);
point *v;
while(!ps.empty()){
v = ps.top();
v->visited = 1;
ps.pop();
for(j=0;j<v->corepts.size();j++){
if(corePoint[v->corepts[j]].visited==1) continue;
corePoint[v->corepts[j]].cluster = corePoint[i].cluster;
corePoint[v->corepts[j]].visited = 1;
ps.push(&corePoint[v->corepts[j]]);
}
}
}
cout<<"border point,joint border point to core point"<<endl;
//border point,joint border point to core point
for(i=0;i<len;i++){
if(dataset[i].pointType==3) continue;
for(j=0;j<corePoint.size();j++){
if((datax[i]+datay[i]+cdatax[j]+cdatay[j]-2 * dataset[i].x * corePoint[j].x - 2 * dataset[i].y * corePoint[j].y) < Eps{
dataset[i].pointType = 2;
dataset[i].cluster = corePoint[j].cluster;
break;
}
}
}
cout<<"output"<<endl;
//output
fstream clustering;
clustering.open("clustering.txt",ios::out);
for(i=0;i<len;i++){
if(dataset[i].pointType == 2)
clustering<<dataset[i].x<<","<<dataset[i].y<<","<<dataset[i].cluster<<"\n";
}
for(i=0;i<corePoint.size();i++){
clustering<<corePoint[i].x<<","<<corePoint[i].y<<","<<corePoint[i].cluster<<"\n";
}
clustering.close();
}
// Entry point: loads the points from "dataset3.txt" and clusters them with
// radius 1.5 and a minimum of 2 neighbours. Command-line arguments are unused.
int main(int argc, char** argv) {
    const char* inputPath = "dataset3.txt";
    const float radius = 1.5;
    const int minNeighbours = 2;

    vector<point> dataset = openFile(inputPath);
    DBSCAN(dataset, radius, minNeighbours);
    return 0;
}
经过测试,优化后速度约为原来的20倍。
思路二(具体实现没有写,只是一个思路,也没有经过测试):
该聚类算法是根据每两个点之间的距离来做聚类的,也就是以某个点为中心点,Eps为半径画圆,在圆内并满足其他条件则归为一类。而耗时就是在计算两点的距离上,所以可以将邻域设置为以该点为中心、边长为2*Eps的正方形,这样就不用计算两点间的距离了,只需判断两个点的X值之差和Y值之差的绝对值是否都小于Eps(即正方形边长的一半),满足则将它们归为一类。
但是这个方法可能会使聚类的准确度下降,具体的我也没测试过。
总之根据自己的要求选择合适的。