ISODATA源代码
作者:liangdas
出处:简单点儿,通俗点儿,机器学习 http://blog.youkuaiyun.com/liangdas/article/details/39809845
下面是我写的ISODATA的源代码,分别有三个文件,一个是ISODATA.h头文件,一个是ISODATA.c文件,另外一个是Main.cpp文件。ISODATA.h和ISODATA.c文件中引用了系统的stdio.h,stdlib.h, math.h文件。
这个书写风格和前面的K-Means类似。
后面的main.cpp是介绍怎么使用的,输入是按txt格式存贮的,测试数据的存贮格式和前面的K-Means也类似:
sample number(样本总数)
feature number(特征维数)
intend class number(待分类的类别)
initial class center index(初始类别中心)
feature list as(特征列表):
feature1 feature2 ...
feature1 feature2 ...
......
当然也可以自己定义数据的格式,并重新编写LoadPatterns()函数。
ISODATA.h
/***********************************
* Author: liangdas
* Time: 20140924
* Version: 0_20140924
* Contact:
* QQ: 358536026 Email: liangdas1986@163.com
* Working place: Beijing Samsuang Telecom R&D Center
************************************/
#ifndef __ISODATA_H__
#define __ISODATA_H__
#ifdef __cplusplus
extern "C"{
#endif
// Status codes returned by LoadPatterns().
#define SUCCESS 1
#define FAILURE 0
// Compile-time capacities used to size the sample and cluster buffers.
#define MAX_SAMPLES 1000 // maximum number of samples
#define MAX_CLUSTER_NUM 40 // maximum number of clusters (set it to more than twice the expected cluster count)
#define MAX_DIM 10 // maximum number of feature dimensions per sample
// NOTE(review): a single-letter macro; every identifier spelled `h` in code
// that includes this header is replaced by 0.5.
#define h 0.5 // ratio applied to the standard deviation when a cluster is split
#define MAXDOUBLE 1.0e20 // very large double value
#define DIM 2 // actual sample dimension
// A candidate pair of clusters together with the distance between their centers.
typedef struct stTwoClusterDist
{
double dist; // distance between the two cluster centers
int nIndexI; // index of the first cluster of the pair
int nIndexJ; // index of the second cluster of the pair
}TWONEARCLUSTER, *PTWONEARCLUSTER;
// One cluster: its center, the samples assigned to it and per-cluster statistics.
typedef struct stCluster
{
double Center[MAX_DIM]; // coordinates of the cluster center
int pMemberIndex[MAX_SAMPLES]; // indices (into the whole sample set) of the member samples
int nSampleNum; // number of valid entries in pMemberIndex
double nAveDistToCenter; // average distance from the members to the center
double fDeltaOfFeature[MAX_DIM];// standard deviation of each feature component
int nMaxDeltaOfFeatureIndex; // index of the component with the largest standard deviation
}CLASSCLUSTER, *PCLASSCLUSTER;
/********************************************************
* Function: LoadPatterns()
* Descrption: Loads the sample list from the text file fname.
* Input&OutPut: pSamples, pNumSamples, pClusterNum, pNumDim and
*               pOrgCenterIndex are filled from the file.
* Return: SUCCESS or FAILURE
* File format: sample count, feature dimension, cluster count,
*              initial center indices, then the samples
*********************************************************/
int LoadPatterns(char *fname, double** pSamples, int* pNumSamples, int* pClusterNum, int* pNumDim, int* pOrgCenterIndex);
/***************************************************************
* Function: InitClusters()
* Description: Sets the initial cluster centers: the center of cluster i
*              is copied from sample pCenterIndex[i].
* Input&Output: pCluster is initialised.
* Returns: nothing
****************************************************************/
void InitClusters(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim, int* pCenterIndex);
/***************************************************************
* Function: CalcuDistance()
* Description: Distance from sample sampleID to the center of cluster clusterID.
* Input&Output:
* Returns: the distance
****************************************************************/
double CalcuDistance(int sampleID, int clusterID, double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);
/***************************************************************
* Function: FindClosestCluster()
* Description: Finds the cluster whose center is closest to sample sampleID.
* Input&Output:
* Returns: index of that cluster
****************************************************************/
int FindClosestCluster(int sampleID, double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);
/***************************************************************
* Function: ReClassify()
* Description: Reassigns every sample to its closest cluster.
* Input&Output: the membership lists in pCluster are rebuilt.
* Returns: nothing
****************************************************************/
void ReClassify(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);
/***************************************************************
* Function: RemoveCenterWithLessNum()
* Description: Drops clusters whose nSampleNum is below the threshold
*              nThrelNum. Only the cluster centers are updated here,
*              because the caller re-classifies the samples right after.
* Input&Output: *pClusterNum is reduced by the number of removed clusters.
* Returns: 1 if at least one cluster was removed, otherwise 0
****************************************************************/
short RemoveCenterWithLessNum(PCLASSCLUSTER pCluster, int* pClusterNum, int nThrelNum, int nNumDim);
/***************************************************************
* Function: CalcNewClustCenters()
* Description: Recomputes every cluster center as the mean of its members.
* Input&Output: pCluster[i].Center is overwritten.
* Returns: 0
****************************************************************/
int CalcNewClustCenters(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim);
/***************************************************************
* Function: CalAveDistInCluster()
* Description: Computes, for each cluster, the average distance of its
*              members to the center (stored in nAveDistToCenter), and
*              the mean of those per-cluster averages.
* Input&Output:
* Returns: the mean of the per-cluster averages
****************************************************************/
double CalAveDistInCluster(PCLASSCLUSTER pCluster, int nClusterNum, double** pSamples, int nNumSamples, int nNumDim);
/***************************************************************
* Function: CalDimSigmaInCluster()
* Description: Computes, for each cluster, the standard deviation of every
*              feature component and records the largest component.
* Input&Output: fDeltaOfFeature and nMaxDeltaOfFeatureIndex are written.
* Returns: nothing
****************************************************************/
void CalDimSigmaInCluster(PCLASSCLUSTER pCluster, int nClusterNum, double** pSamples, int nNumSamples, int nNumDim);
/***************************************************************
* Function: CalAveDistBetween2Centers()
* Description: Computes the pairwise distances between all cluster centers.
* Input&Output: pDistBetTwoClusters[i][j] is written for i < j.
* Returns: nothing
****************************************************************/
void CalAveDistBetween2Centers(PCLASSCLUSTER pCluster, int nClusterNum, int nNumDim, double** pDistBetTwoClusters);
/***************************************************************
* Function: DivideClusters()
* Description: Split step. At most one cluster is split per call: the
*              first cluster that satisfies the split conditions.
* Input&Output: *pClusterNum is incremented when a split happens.
* Returns: 1 if a cluster was split, otherwise 0
****************************************************************/
int DivideClusters(PCLASSCLUSTER pCluster, int* pClusterNum, int nNumDim, double dAveTotalCluster, double SIGMA_THRELD, int MAX_SAMPLES_ONE_CLUSTER, int EXEPECT_CLUSTER_NUM, int MIN_SAMPLES_ONE_CLUSTER);
/***************************************************************
* Function: UnionByLessDistBetwCenter()
* Description: Merge step. Clusters whose centers are closer than
*              MIN_CLUSTER_DIST are merged, at most MERGE_CLUSTER_NUM
*              pairs per call.
* Input&Output: pCluster centers and *pClusterNum are updated;
*               pDistBetTwoClusters holds the pairwise center distances.
* Returns: 0
* Note: MIN_CLUSTER_DIST is a double, matching the definition in ISODATA.c
*       (the declaration used to say int, which is a conflicting type).
****************************************************************/
int UnionByLessDistBetwCenter(PCLASSCLUSTER pCluster, int* pClusterNum, int nNumDim, double MIN_CLUSTER_DIST, int MERGE_CLUSTER_NUM, double** pDistBetTwoClusters);
/***************************************************************
* Function: SaveCenters()
* Description: Writes the cluster centers to the text file pFilePath.
* Input&Output:
* Returns: nothing
****************************************************************/
void SaveCenters(char* pFilePath, PCLASSCLUSTER Cluster, int nCurClusterNum, int NumDim);
/***************************************************************
* Function: SaveClusters()
* Description: Writes every clustered sample, followed by the index of its
*              cluster, to the text file pFilePath.
* Input&Output:
* Returns: nothing
****************************************************************/
void SaveClusters(char* pFilePath, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int nCurClusterNum, int NumDim);
/***************************************************************
* Function: RunISODATA()
* Description: Runs the interactive ISODATA loop: reads the control
*              parameters from stdin, then classifies, splits and merges
*              until the iteration limit is reached.
* Input&Output: pCluster and *nCurClusterNum hold the final clustering.
* Returns: nothing
****************************************************************/
void RunISODATA(double** pSamples, int nNumSamples, PCLASSCLUSTER pCluster, int nNumDim, int* pCenterIndex, int* nCurClusterNum);
#ifdef __cplusplus
}
#endif
#endif
ISODATA.c
/***********************************
* Author: liangdas
* Time: 20140924
* Version: 0_20140924
* Contact:
* QQ: 358536026 Email: liangdas1986@163.com
* Working place:Beijing Samsuang Telecom R&D Center
************************************/
#include "ISODATA.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __cplusplus
extern "C"{
#endif
/********************************************************
* Function: LoadPatterns()
* Descrption: Loads the sample list from the text file fname.
* Input&OutPut:
*   fname           - path of the sample file
*   pSamples        - [out] pSamples[i][j] receives feature j of sample i
*   pNumSamples     - [out] number of samples
*   pClusterNum     - [out] number of initial clusters
*   pNumDim         - [out] number of features per sample
*   pOrgCenterIndex - [out] sample index of each initial cluster center
* Return: SUCCESS when the whole file was parsed, FAILURE when the file
*         cannot be opened, is truncated/malformed, holds a non-positive
*         count, or names a center index outside [0, sample count).
* File format: sample count, feature dimension, cluster count,
*              the initial center indices, then the samples.
* Note: the counts are NOT checked against the capacity of pSamples and
*       pOrgCenterIndex; the caller must size them for the file.
*********************************************************/
int LoadPatterns(char* fname, double** pSamples, int* pNumSamples, int* pClusterNum, int* pNumDim, int* pOrgCenterIndex)
{
    FILE* InFilePtr;
    int i, j;
    int nRet = FAILURE;
    double x;

    if ((InFilePtr = fopen(fname, "rt")) == NULL)
    {
        return FAILURE;
    }

    /* header: three counts, each must be present and positive */
    if (fscanf(InFilePtr, "%d", pNumSamples) != 1
        || fscanf(InFilePtr, "%d", pNumDim) != 1
        || fscanf(InFilePtr, "%d", pClusterNum) != 1)
    {
        goto done;
    }
    if (*pNumSamples <= 0 || *pNumDim <= 0 || *pClusterNum <= 0)
    {
        goto done;
    }

    /* initial centers are given as indices into the sample list */
    for (i = 0; i < *pClusterNum; i++)
    {
        if (fscanf(InFilePtr, "%d", &pOrgCenterIndex[i]) != 1)
        {
            goto done;
        }
        if (pOrgCenterIndex[i] < 0 || pOrgCenterIndex[i] >= *pNumSamples)
        {
            goto done;
        }
    }

    for (i = 0; i < *pNumSamples; i++)
    {
        for (j = 0; j < *pNumDim; j++)
        {
            if (fscanf(InFilePtr, "%lg", &x) != 1)
            {
                goto done;
            }
            pSamples[i][j] = x;
        }
    }
    nRet = SUCCESS;

done:
    /* the stream is closed on every path once it has been opened */
    fclose(InFilePtr);
    return nRet;
}
/***************************************************************
* Function: InitClusters()
* Description: Initialises the first nCurClusterNum clusters. The center of
*              cluster i is copied from sample pCenterIndex[i]; membership
*              lists and statistics are cleared. The centers are printed
*              to stdout.
* Input&Output:
*   pSamples       - sample matrix, pSamples[sample][feature]
*   NumSamples     - number of samples (only used for the sanity warning)
*   pCluster       - [out] clusters to initialise
*   nCurClusterNum - number of clusters to initialise
*   NumDim         - number of features per sample
*   pCenterIndex   - sample index of every initial center
* Returns: nothing
****************************************************************/
void InitClusters(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim, int* pCenterIndex)
{
    int i, j;

    printf("Initial cluster centers:\n");
    if (nCurClusterNum > NumSamples)
    {
        printf("class number exceed to sample number\n");
    }

    for (i = 0; i < nCurClusterNum; i++)
    {
        for (j = 0; j < NumDim; j++)
        {
            pCluster[i].Center[j] = pSamples[pCenterIndex[i]][j];
            pCluster[i].fDeltaOfFeature[j] = 0;
        }
        /* start with an empty membership list */
        for (j = 0; j < MAX_SAMPLES; j++)
        {
            pCluster[i].pMemberIndex[j] = 0;
        }
        pCluster[i].nSampleNum = 0;
        pCluster[i].nAveDistToCenter = 0.0; /* was left uninitialised */
        pCluster[i].nMaxDeltaOfFeatureIndex = 0;
    }

    /* Print every component of each center. For two dimensions the output
       is the same as the former hard-coded "(%f,%f)" form. */
    for (i = 0; i < nCurClusterNum; i++)
    {
        printf("ClusterCenter[%d]=(", i);
        for (j = 0; j < NumDim; j++)
        {
            printf(j == 0 ? "%f" : ",%f", pCluster[i].Center[j]);
        }
        printf(")\n");
    }
    printf("\n");
}
/***************************************************************
* Function: CalcuDistance()
* Description: Euclidean distance between sample sampleID and the center
*              of cluster clusterID. The squared component differences
*              are summed and the square root is taken once at the end,
*              the same metric CalAveDistBetween2Centers() uses for
*              center-to-center distances. (Taking the square root per
*              component, as before, gives the Manhattan distance.)
* Input&Output:
*   sampleID, clusterID - indices into pSamples and pCluster
*   NumDim              - number of features compared
*   NumSamples, nCurClusterNum - unused, kept for interface compatibility
* Returns: the distance (>= 0)
****************************************************************/
double CalcuDistance(int sampleID, int clusterID, double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim)
{
    double dSumSq = 0.0;
    double dDiff;
    int i;

    (void)NumSamples;
    (void)nCurClusterNum;

    for (i = 0; i < NumDim; i++)
    {
        dDiff = pCluster[clusterID].Center[i] - pSamples[sampleID][i];
        dSumSq += dDiff * dDiff;
    }
    return sqrt(dSumSq);
}
/***************************************************************
* Function: FindClosestCluster()
* Description: Finds the cluster whose center is closest to sample
*              sampleID. On equal distances the lowest cluster index wins.
* Input&Output:
*   sampleID       - index of the sample in pSamples
*   pCluster       - clusters to search
*   nCurClusterNum - number of clusters to search
* Returns: index of the closest cluster.
*          If no cluster qualifies (no clusters, or every distance is not
*          a number) the error is reported on stderr and the process
*          exits with EXIT_FAILURE; it used to exit silently with
*          status 0, which looks like success.
****************************************************************/
int FindClosestCluster(int sampleID, double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim)
{
    int i;
    int ClustID = -1;
    double MinDist = 9.9e+99;
    double d;

    for (i = 0; i < nCurClusterNum; i++)
    {
        d = CalcuDistance(sampleID, i, pSamples, NumSamples, pCluster, nCurClusterNum, NumDim);
        if (d < MinDist)
        {
            MinDist = d;
            ClustID = i;
        }
    }

    if (ClustID < 0)
    {
        fprintf(stderr, "FindClosestCluster: no cluster found for sample %d (cluster count %d)\n",
                sampleID, nCurClusterNum);
        exit(EXIT_FAILURE);
    }
    return ClustID;
}
/***************************************************************
* Function: ReClassify()
* Description: Rebuilds the membership list of every cluster: all lists
*              are emptied, then each sample is appended to the cluster
*              returned by FindClosestCluster().
* Input&Output:
*   pSamples, NumSamples - samples to assign
*   pCluster             - [in/out] nSampleNum and pMemberIndex are rewritten
*   nCurClusterNum       - number of clusters
*   NumDim               - number of features per sample
* Returns: nothing
****************************************************************/
void ReClassify(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim)
{
    int nCluster = 0;
    int nSample;

    /* forget the previous assignment */
    while (nCluster < nCurClusterNum)
    {
        pCluster[nCluster].nSampleNum = 0;
        nCluster++;
    }

    /* append each sample to the list of its nearest cluster */
    for (nSample = 0; nSample < NumSamples; nSample++)
    {
        PCLASSCLUSTER pOwner = &pCluster[FindClosestCluster(nSample, pSamples, NumSamples, pCluster, nCurClusterNum, NumDim)];
        pOwner->pMemberIndex[pOwner->nSampleNum] = nSample;
        pOwner->nSampleNum += 1;
    }
}
/***************************************************************
* Function: RemoveCenterWithLessNum()
* Description: Drops every cluster whose nSampleNum is below nThrelNum and
*              packs the centers of the remaining clusters to the front of
*              pCluster, keeping their order.
*              Only the centers are moved: sample counts and membership
*              lists are left as they were, so the caller must call
*              ReClassify() when 1 is returned.
*              The survivors are selected with each cluster's own count.
*              (Previously the centers were shifted while the counts stayed
*              in place, so later tests looked at the wrong cluster, and
*              the shift read one element past the last cluster.)
* Input&Output:
*   pCluster    - [in/out] clusters; centers are compacted in place
*   pClusterNum - [in/out] number of clusters
*   nThrelNum   - minimum number of samples a cluster must have to be kept
*   nNumDim     - number of center components to copy
* Returns: 1 if at least one cluster was removed, otherwise 0
****************************************************************/
short RemoveCenterWithLessNum(PCLASSCLUSTER pCluster, int* pClusterNum, int nThrelNum, int nNumDim)
{
    int nOldClusterNum = *pClusterNum;
    int nRead;
    int nWrite = 0;
    int k;

    for (nRead = 0; nRead < nOldClusterNum; nRead++)
    {
        /* counts are never moved, so slot nRead still holds its own count */
        if (pCluster[nRead].nSampleNum < nThrelNum)
        {
            continue;
        }
        if (nWrite != nRead)
        {
            for (k = 0; k < nNumDim; k++)
            {
                pCluster[nWrite].Center[k] = pCluster[nRead].Center[k];
            }
        }
        nWrite++;
    }

    *pClusterNum = nWrite;
    return (short)(nWrite != nOldClusterNum);
}
/***************************************************************
* Function: CalcNewClustCenters()
* Description: Moves every cluster center to the mean of its member
*              samples. A cluster without members keeps its current
*              center; dividing by its zero count would turn the center
*              into NaN.
* Input&Output:
*   pSamples       - sample matrix, pSamples[sample][feature]
*   pCluster       - [in/out] clusters; Center is overwritten
*   nCurClusterNum - number of clusters
*   NumDim         - number of features per sample
*   NumSamples     - unused, kept for interface compatibility
* Returns: 0
****************************************************************/
int CalcNewClustCenters(double** pSamples, int NumSamples, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim)
{
    int nSampleID, i, j, k;
    double tmp[MAX_DIM];

    (void)NumSamples;

    for (i = 0; i < nCurClusterNum; i++)
    {
        if (pCluster[i].nSampleNum <= 0)
        {
            continue; /* nothing to average */
        }

        for (k = 0; k < NumDim; k++)
        {
            tmp[k] = 0.0;
        }
        for (j = 0; j < pCluster[i].nSampleNum; j++)
        {
            nSampleID = pCluster[i].pMemberIndex[j];
            for (k = 0; k < NumDim; k++)
            {
                tmp[k] += pSamples[nSampleID][k];
            }
        }
        for (k = 0; k < NumDim; k++)
        {
            pCluster[i].Center[k] = tmp[k] / pCluster[i].nSampleNum;
        }
    }
    return 0;
}
/***************************************************************
* Function: CalAveDistInCluster()
* Description: For every cluster, computes the average distance of its
*              members to its center and stores it in nAveDistToCenter.
*              Returns the mean of those per-cluster averages.
*              A cluster without members gets an average of 0, and with
*              no clusters 0 is returned, instead of dividing by zero.
* Input&Output:
*   pCluster       - [in/out] clusters; nAveDistToCenter is written
*   nCurClusterNum - number of clusters
*   pSamples, nNumSamples, nNumDim - passed on to CalcuDistance()
* Returns: mean over the clusters of the per-cluster average distance
****************************************************************/
double CalAveDistInCluster(PCLASSCLUSTER pCluster, int nCurClusterNum, double** pSamples, int nNumSamples, int nNumDim)
{
    double dSumDist;
    double dClusterAve;
    double dAveTotalCluster = 0.0;
    int i, j;

    for (i = 0; i < nCurClusterNum; i++)
    {
        dClusterAve = 0.0;
        if (pCluster[i].nSampleNum > 0)
        {
            dSumDist = 0.0;
            for (j = 0; j < pCluster[i].nSampleNum; j++)
            {
                dSumDist += CalcuDistance(pCluster[i].pMemberIndex[j], i, pSamples, nNumSamples, pCluster, nCurClusterNum, nNumDim);
            }
            dClusterAve = dSumDist / pCluster[i].nSampleNum;
        }
        pCluster[i].nAveDistToCenter = dClusterAve;
        dAveTotalCluster += dClusterAve;
    }

    if (nCurClusterNum > 0)
    {
        dAveTotalCluster /= nCurClusterNum;
    }
    return dAveTotalCluster;
}
/***************************************************************
* Function: CalDimSigmaInCluster()
* Description: For every cluster, computes the standard deviation of each
*              feature component around the cluster center and records
*              the index of the component with the largest deviation.
*              A cluster without members gets a deviation of 0 on every
*              component. nMaxDeltaOfFeatureIndex is reset to 0 before
*              the search, so it is always a valid index even when every
*              deviation is 0 (it could otherwise keep an uninitialised or
*              stale value that DivideClusters() uses as an array index).
* Input&Output:
*   pCluster       - [in/out] fDeltaOfFeature and nMaxDeltaOfFeatureIndex are written
*   nCurClusterNum - number of clusters
*   pSamples       - sample matrix, pSamples[sample][feature]
*   nNumDim        - number of features per sample
*   nNumSamples    - unused, kept for interface compatibility
* Returns: nothing
****************************************************************/
void CalDimSigmaInCluster(PCLASSCLUSTER pCluster, int nCurClusterNum, double** pSamples, int nNumSamples, int nNumDim)
{
    double dSumSq;
    double dDiff;
    double sigma_max;
    int i, j, k;

    (void)nNumSamples;

    for (i = 0; i < nCurClusterNum; i++)
    {
        sigma_max = 0.0;
        pCluster[i].nMaxDeltaOfFeatureIndex = 0;

        for (j = 0; j < nNumDim; j++)
        {
            dSumSq = 0.0;
            for (k = 0; k < pCluster[i].nSampleNum; k++)
            {
                dDiff = pSamples[pCluster[i].pMemberIndex[k]][j] - pCluster[i].Center[j];
                dSumSq += dDiff * dDiff;
            }

            if (pCluster[i].nSampleNum > 0)
            {
                pCluster[i].fDeltaOfFeature[j] = sqrt(dSumSq / pCluster[i].nSampleNum);
            }
            else
            {
                pCluster[i].fDeltaOfFeature[j] = 0.0;
            }

            if (sigma_max < pCluster[i].fDeltaOfFeature[j])
            {
                sigma_max = pCluster[i].fDeltaOfFeature[j];
                pCluster[i].nMaxDeltaOfFeatureIndex = j;
            }
        }
    }
}
/***************************************************************
* Function: CalAveDistBetween2Centers()
* Description: Fills the upper triangle of pDistBetTwoClusters with the
*              Euclidean distance between every pair of cluster centers:
*              entry [a][b] is written for a < b only.
* Input&Output:
*   pCluster            - clusters whose centers are compared
*   nCurClusterNum      - number of clusters
*   nNumDim             - number of center components
*   pDistBetTwoClusters - [out] distance matrix
* Returns: nothing
****************************************************************/
void CalAveDistBetween2Centers(PCLASSCLUSTER pCluster, int nCurClusterNum, int nNumDim, double** pDistBetTwoClusters)
{
    int nFirst, nSecond, nDim;

    for (nFirst = 0; nFirst < nCurClusterNum - 1; nFirst++)
    {
        const double* pCenterA = pCluster[nFirst].Center;

        for (nSecond = nFirst + 1; nSecond < nCurClusterNum; nSecond++)
        {
            const double* pCenterB = pCluster[nSecond].Center;
            double dSumSq = 0.0;

            for (nDim = 0; nDim < nNumDim; nDim++)
            {
                double dDiff = pCenterA[nDim] - pCenterB[nDim];
                dSumSq += dDiff * dDiff;
            }
            pDistBetTwoClusters[nFirst][nSecond] = sqrt(dSumSq);
        }
    }
}
/***************************************************************
* Function: DivideClusters()
* Description: Split step. Splits at most one cluster per call: the first
*              cluster whose largest component deviation is at least
*              SIGMA_THRELD and for which either
*                - its average member distance exceeds dAveTotalCluster and
*                  it has more than 2*(MIN_SAMPLES_ONE_CLUSTER+1) samples, or
*                - the cluster count is at most EXEPECT_CLUSTER_NUM/2.
*              The centers after it are shifted up by one slot, and the two
*              resulting centers are moved apart by h*sigma along the
*              component with the largest deviation. Only centers are
*              changed.
* Input&Output:
*   pCluster    - [in/out] clusters; slot *pClusterNum is written when a
*                 split happens, so it must exist in the caller's array
*   pClusterNum - [in/out] number of clusters, incremented on a split
*   MAX_SAMPLES_ONE_CLUSTER - not used
* Returns: 1 if a cluster was split, otherwise 0
****************************************************************/
int DivideClusters(PCLASSCLUSTER pCluster, int* pClusterNum, int nNumDim, double dAveTotalCluster, double SIGMA_THRELD, int MAX_SAMPLES_ONE_CLUSTER, int EXEPECT_CLUSTER_NUM, int MIN_SAMPLES_ONE_CLUSTER)
{
    int nIdx;

    for (nIdx = 0; nIdx < *pClusterNum; nIdx++)
    {
        int nSplitDim = pCluster[nIdx].nMaxDeltaOfFeatureIndex;
        double dSigma = pCluster[nIdx].fDeltaOfFeature[nSplitDim];
        int bSpreadOut;
        int bTooFewClusters;
        int nPos, nDim;
        double dShift;

        /* the largest deviation must reach the threshold */
        if (!(dSigma >= SIGMA_THRELD))
        {
            continue;
        }

        bSpreadOut = (pCluster[nIdx].nAveDistToCenter > dAveTotalCluster)
                     && (pCluster[nIdx].nSampleNum > 2 * (MIN_SAMPLES_ONE_CLUSTER + 1));
        bTooFewClusters = (*pClusterNum <= EXEPECT_CLUSTER_NUM / 2);
        if (!bSpreadOut && !bTooFewClusters)
        {
            continue;
        }

        /* open a slot right after nIdx; it starts as a copy of center nIdx */
        for (nPos = *pClusterNum; nPos > nIdx; nPos--)
        {
            for (nDim = 0; nDim < nNumDim; nDim++)
            {
                pCluster[nPos].Center[nDim] = pCluster[nPos - 1].Center[nDim];
            }
        }

        /* push the two copies apart along the widest component */
        dShift = h * dSigma;
        pCluster[nIdx + 1].Center[nSplitDim] -= dShift;
        pCluster[nIdx].Center[nSplitDim] += dShift;

        *pClusterNum += 1;
        return 1;
    }
    return 0;
}
/***************************************************************
* Function: UnionByLessDistBetwCenter()
* Description: 合并操作,合并的条件是两个类的类别中心距离很近,
* Input&Output:
* Returns:
****************************************************************/
int UnionByLessDistBetwCenter(PCLASSCLUSTER pCluster, int* pClusterNum, int nNumDim, double MIN_CLUSTER_DIST,
int MERGE_CLUSTER_NUM, double** pDistBetTwoClusters)
{
int i = 0, j = 0, k = 0, l = 0;
int num=0;
int flag=0;
int nTmpClusterNum = *pClusterNum;
PTWONEARCLUSTER pTwoNearCluster = (PTWONEARCLUSTER)malloc(sizeof(TWONEARCLUSTER)*MERGE_CLUSTER_NUM);
for(i=0; i<MERGE_CLUSTER_NUM; i++)
{
pTwoNearCluster[i].dist = MIN_CLUSTER_DIST;
pTwoNearCluster[i].nIndexI = -1;
pTwoNearCluster[i].nIndexJ = -1;
}
//找到MERGE_CLUSTER_NUM个类别之间的距离小于MIN_CLUSTER_DIST的类别组
for(i=0; i<nTmpClusterNum-1; i++)
{
for(j=i+1;j<nTmpClusterNum;j++)
{
if(pDistBetTwoClusters[i][j] < MIN_CLUSTER_DIST)
{
//for(k=0; k<MERGE_CLUSTER_NUM; k++)
//{
if(pDistBetTwoClusters[i][j]<pTwoNearCluster[k].dist && k<MERGE_CLUSTER_NUM)
{
#if 0
for(l=MERGE_CLUSTER_NUM-1; l>k; l--)
{
pTwoNearCluster[l] = pTwoNearCluster[l-1];
}
#endif
pTwoNearCluster[k].dist = pDistBetTwoClusters[i][j];
pTwoNearCluster[k].nIndexI = i;
pTwoNearCluster[k].nIndexJ = j;
k++;
break;
}
//}
}
}
}
k = (k<MERGE_CLUSTER_NUM)?k:MERGE_CLUSTER_NUM;
//从小到大排列pTwoNearCluster中的值
for(i=0; i<k-1; i++)
{
for(j=0; j<k; j++)
{
if(pTwoNearCluster[i].dist>pTwoNearCluster[j].dist)
{
double tmpval = pTwoNearCluster[i].dist;
int tmpIndexI = pTwoNearCluster[i].nIndexI;
int tmpIndexJ = pTwoNearCluster[i].nIndexJ;
pTwoNearCluster[i].dist = pTwoNearCluster[j].dist;
pTwoNearCluster[i].nIndexI = pTwoNearCluster[j].nIndexI;
pTwoNearCluster[i].nIndexJ = pTwoNearCluster[j].nIndexJ;
pTwoNearCluster[j].dist = tmpval;
pTwoNearCluster[j].nIndexI = tmpIndexI;
pTwoNearCluster[j].nIndexJ = tmpIndexJ;
}
}
}
//合并类别
for(i=0; i<k; i++)
{
int nIndexI = pTwoNearCluster[i].nIndexI;
int nIndexJ = pTwoNearCluster[i].nIndexJ;
if(nIndexI>-1 && nIndexJ>-1)
{
if((pCluster[nIndexI].nSampleNum<0)||(pCluster[nIndexJ].nSampleNum<0))
{
continue;
}
for(j=0; j<nNumDim;j++)
{
pCluster[nIndexI].Center[j] = (pCluster[nIndexI].Center[j]*pCluster[nIndexI].nSampleNum +pCluster[nIndexJ].Center[j]*pCluster[nIndexJ].nSampleNum)
/(pCluster[nIndexI].nSampleNum+pCluster[nIndexJ].nSampleNum);
}
pCluster[nIndexI].nSampleNum = -1;
pCluster[nIndexJ].nSampleNum = -2;
}
}
//去掉合并之后,那些不要的类别中心
for(i=0; i<nTmpClusterNum; i++)
{
if(pCluster[i].nSampleNum == -2)
{
for(j=i; j<nTmpClusterNum; j++)
{
for(k=0; k<nNumDim; k++)
{
pCluster[j].Center[k] = pCluster[j+1].Center[k];
}
}
nTmpClusterNum--;
}
}
*pClusterNum = nTmpClusterNum;
return 0;
}
/***************************************************************
* Function: SaveCenters()
* Description: Writes the cluster centers to the text file pFilePath: a
*              header line, then one line per cluster with the center
*              components followed by the cluster index, tab separated.
*              The clusters themselves are not modified. (The function
*              used to overwrite pMemberIndex[0] of every cluster, which
*              corrupted the membership lists for the steps that run
*              after it.)
* Input&Output:
*   pFilePath      - output file, overwritten if it exists
*   pCluster       - clusters to save (read only)
*   nCurClusterNum - number of clusters
*   NumDim         - number of center components written per cluster
* Returns: nothing; a message is printed when the file cannot be opened
****************************************************************/
void SaveCenters(char* pFilePath, PCLASSCLUSTER pCluster, int nCurClusterNum, int NumDim)
{
    int i, j;
    FILE* fpResultFile;

    fpResultFile = fopen(pFilePath, "wt");
    if (fpResultFile == 0)
    {
        printf("open file %s error\n", pFilePath);
        return;
    }

    fprintf(fpResultFile, "x\ty\tlabel\t\n");
    for (i = 0; i < nCurClusterNum; i++)
    {
        for (j = 0; j < NumDim; j++)
        {
            fprintf(fpResultFile, "%f\t", pCluster[i].Center[j]);
        }
        fprintf(fpResultFile, "%d\n", i); /* cluster label */
    }
    fclose(fpResultFile);
}
/***************************************************************
* Function: SaveClusters()
* Description: Writes the clustering result to the text file pFilePath: a
*              header line, then one line per clustered sample with its
*              features followed by the index of its cluster, tab
*              separated. Samples are grouped by cluster.
*              Features are written with "%g". The former "%hg" combined
*              the h length modifier with a floating-point conversion,
*              which is undefined behaviour.
* Input&Output:
*   pFilePath      - output file, overwritten if it exists
*   Pattern        - sample matrix, Pattern[sample][feature]
*   Cluster        - clusters with their membership lists
*   nCurClusterNum - number of clusters
*   NumDim         - number of features written per sample
*   NumSamples     - unused, kept for interface compatibility
* Returns: nothing; a message is printed when the file cannot be opened
****************************************************************/
void SaveClusters(char* pFilePath, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int nCurClusterNum, int NumDim)
{
    FILE* fpResultFile = 0;
    int i, j, k;
    int nSampleID;

    (void)NumSamples;

    fpResultFile = fopen(pFilePath, "wt");
    if (fpResultFile == 0)
    {
        printf("open file %s error\n", pFilePath);
        return;
    }

    fprintf(fpResultFile, "x\ty\tlabel\t\n");
    for (i = 0; i < nCurClusterNum; i++)
    {
        for (j = 0; j < Cluster[i].nSampleNum; j++)
        {
            nSampleID = Cluster[i].pMemberIndex[j];
            for (k = 0; k < NumDim; k++)
            {
                fprintf(fpResultFile, "%g\t", Pattern[nSampleID][k]);
            }
            fprintf(fpResultFile, "%d\n", i); /* cluster the sample belongs to */
        }
    }
    fclose(fpResultFile);
}
/***************************************************************
* Function: RunISODATA()
* Description: Interactive ISODATA driver. Reads the control parameters
*              from stdin, initialises the clusters from pCenterIndex and
*              then repeats: classify, drop small clusters, update the
*              centers, and either split or merge, until the iteration
*              limit is reached. After each iteration from the first one on
*              the clustering is written to cluster.txt and center.txt, and
*              the user is asked whether to enter new parameters.
*
*              Fixes compared with the previous version:
*              - the distance matrix is released on every exit path (the
*                free code was unreachable after the return);
*              - the Y/N prompt reads the first non-blank character; it
*                used to stop on the first blank, so 'y' was never seen,
*                and it looped forever on end of input;
*              - a restart restores the cluster count the function was
*                entered with, the only count for which pCenterIndex
*                holds indices;
*              - no split is attempted when the cluster array is full;
*              - allocation failures are reported instead of dereferenced.
* Input&Output:
*   pSamples, nNumSamples, nNumDim - the samples
*   pCluster       - [in/out] cluster array with room for MAX_CLUSTER_NUM clusters
*   pCenterIndex   - sample index of each initial center
*   pCurClusterNum - [in/out] number of clusters
* Returns: nothing
****************************************************************/
void RunISODATA(double** pSamples, int nNumSamples, PCLASSCLUSTER pCluster, int nNumDim, int* pCenterIndex, int* pCurClusterNum)
{
    double dAveTotalCluster;           /* mean over the clusters of the average member-to-center distance */
    double** pDistBetTwoClusters = 0;  /* pairwise center distances */
    int EXPECT_CLUSTER_NUM = 0;        /* expected number of clusters */
    int MIN_SAMPLES_ONE_CLUSTER = 0;   /* clusters with fewer samples are dropped */
    double MIN_CLUSTER_DIST = 0;       /* centers closer than this are merged */
    int ALLOW_MERGE_CLUSTER_NUM = 1;   /* maximum number of pairs merged per iteration */
    double SIGMA_THRELD = 0.0;         /* a larger component deviation triggers a split */
    int I = 0;                         /* maximum number of iterations */
    int iter = 0;                      /* iterations done so far */
    int nInitClusterNum = *pCurClusterNum; /* pCenterIndex holds this many indices */
    int i;
    int nAnswer;

    /* calloc: rows that were never allocated stay NULL and are safe to free */
    pDistBetTwoClusters = (double**)calloc(MAX_CLUSTER_NUM, sizeof(double*));
    if (pDistBetTwoClusters == 0)
    {
        fprintf(stderr, "RunISODATA: out of memory\n");
        return;
    }
    for (i = 0; i < MAX_CLUSTER_NUM; i++)
    {
        pDistBetTwoClusters[i] = (double*)malloc(sizeof(double) * MAX_CLUSTER_NUM);
        if (pDistBetTwoClusters[i] == 0)
        {
            fprintf(stderr, "RunISODATA: out of memory\n");
            goto cleanup;
        }
    }

start:
    /* read the control parameters */
    printf("\n设定聚类分析控制参数:\n");
    printf("预期的类数 c:");
    scanf("%d", &EXPECT_CLUSTER_NUM);
    printf("每一类中允许的最少样本数目θN(小于此数不可单独成类):");
    scanf("%d", &MIN_SAMPLES_ONE_CLUSTER);
    printf("类内各分量分布的标准差上限OS(大于此数就分裂):");
    scanf("%lf",&SIGMA_THRELD);
    printf("两类中心间的最小距离下限OC(小于此数两类合并):");
    scanf("%lf", &MIN_CLUSTER_DIST);
    printf("在每次迭代中可以合并的类的最多对数L: ");
    scanf("%d", &ALLOW_MERGE_CLUSTER_NUM);
    printf("最多迭代次数I: ");
    scanf("%d", &I);
    printf("\n");

    /* step 1: (re)initialise from the initial centers */
    *pCurClusterNum = nInitClusterNum;
    InitClusters(pSamples, nNumSamples, pCluster, *pCurClusterNum, nNumDim, pCenterIndex);

step2: /* assign every sample to its nearest center */
    ReClassify(pSamples, nNumSamples, pCluster, *pCurClusterNum, nNumDim);
    if(iter==0)
    {
        printf("\n---------------选取初始聚类中心---------------\n");
    }
    else
    {
        printf("-----------------第 %d 次迭代-----------------\n ",iter);
        /* save the samples with their cluster labels */
        SaveClusters("cluster.txt", (double**)pSamples, nNumSamples, pCluster, *pCurClusterNum, nNumDim);
        /* save the cluster centers */
        SaveCenters("center.txt", pCluster, *pCurClusterNum, nNumDim);
    }

    /* step 3: drop clusters with too few samples, then classify again */
    if(1 == RemoveCenterWithLessNum(pCluster, pCurClusterNum, MIN_SAMPLES_ONE_CLUSTER, nNumDim))
    {
        goto step2;
    }

    /* step 4: update the centers */
    CalcNewClustCenters(pSamples, nNumSamples, pCluster, *pCurClusterNum, nNumDim);

    /* step 5: per-cluster and overall average distance to the centers */
    dAveTotalCluster = CalAveDistInCluster(pCluster, *pCurClusterNum, pSamples, nNumSamples, nNumDim);

    /* step 6: stop, split or merge, depending on iter and the cluster count */
    if(iter == I)
    {
        goto step9;
    }
    if (*pCurClusterNum <= EXPECT_CLUSTER_NUM/2)
    {
        goto step7;
    }
    else if(*pCurClusterNum >= 2*EXPECT_CLUSTER_NUM)
    {
        goto step8;
    }
    else
    {
        if(iter%2 == 1)
        {
            goto step7; /* split */
        }
        else
        {
            goto step8; /* merge */
        }
    }

step7: /* split; falls through to the merge step when nothing was split */
    CalDimSigmaInCluster(pCluster, *pCurClusterNum, pSamples, nNumSamples, nNumDim);
    /* a split writes slot *pCurClusterNum, so one free slot is required */
    if(*pCurClusterNum < MAX_CLUSTER_NUM
       && 1 == DivideClusters(pCluster, pCurClusterNum, nNumDim, dAveTotalCluster, SIGMA_THRELD, MIN_SAMPLES_ONE_CLUSTER, EXPECT_CLUSTER_NUM, MIN_SAMPLES_ONE_CLUSTER))
    {
        iter++;
        goto step2;
    }

step8: /* merge */
    /* pairwise distances between the centers */
    CalAveDistBetween2Centers(pCluster, *pCurClusterNum, nNumDim, pDistBetTwoClusters);
    /* merge the clusters whose centers are close to each other */
    UnionByLessDistBetwCenter(pCluster, pCurClusterNum, nNumDim, MIN_CLUSTER_DIST,
                              ALLOW_MERGE_CLUSTER_NUM, pDistBetTwoClusters);

step9: /* loop again or finish */
    if(iter >= I)
    {
        printf("---------------经过 %d 次迭代,达到迭代次数--------------\n",iter);
    }
    else
    {
        iter++;
        printf("本次迭代完成,是否需要改变参数(Y/N)??:\n");
        /* skip blanks (e.g. the newline left by scanf) and take the first
           real character; end of input counts as "no" */
        do
        {
            nAnswer = getchar();
        } while(nAnswer == ' ' || nAnswer == '\t' || nAnswer == '\n'
                || nAnswer == '\r' || nAnswer == '\v' || nAnswer == '\f');
        if(nAnswer == 'y' || nAnswer == 'Y')
        {
            goto start;
        }
        goto step2;
    }

cleanup:
    /* release the distance matrix */
    for(i=0; i<MAX_CLUSTER_NUM; i++)
    {
        free(pDistBetTwoClusters[i]);
    }
    free(pDistBetTwoClusters);
}
#ifdef __cplusplus
}
#endif
main.cpp
#include <stdlib.h>
#include <stdio.h>
#include <io.h>
#include <string.h>
#include "ISODATA.h"
/* Entry point: loads the sample file named by argv[1], runs the interactive
   ISODATA clustering and writes the result to cluster.txt and center.txt.
   Returns EXIT_SUCCESS on success and EXIT_FAILURE when the argument is
   missing, memory cannot be allocated or the file cannot be loaded (these
   paths used to exit with status 0). The sample buffers are released on
   every path. */
int main(int argc, char *argv[])
{
    double** pSamples;
    int nNumSamples; // Number of samples
    CLASSCLUSTER pCluster[MAX_CLUSTER_NUM];
    int nCurClusterNum; // Number of clusters
    int pOrgCenterIndex[MAX_CLUSTER_NUM];
    int nNumDim; // Number of dimensions in vector
    int i = 0;
    int nExitCode = EXIT_FAILURE;
    char* pFilePath;

    if (argc<2)
    {
        printf("usage: intput sample file\n");
        return EXIT_FAILURE;
    }
    pFilePath = argv[1];

    // calloc: rows that were never allocated stay NULL and are safe to free
    pSamples = (double**)calloc(MAX_SAMPLES, sizeof(double*));
    if (pSamples == NULL)
    {
        printf("out of memory\n");
        return EXIT_FAILURE;
    }
    for(i=0; i<MAX_SAMPLES; i++)
    {
        pSamples[i] = (double*)malloc(sizeof(double)*MAX_DIM);
        if (pSamples[i] == NULL)
        {
            printf("out of memory\n");
            goto cleanup;
        }
    }

    if (LoadPatterns(pFilePath, (double**)pSamples, &nNumSamples, &nCurClusterNum, &nNumDim, pOrgCenterIndex) == FAILURE)
    {
        printf("read file %s error\n", pFilePath);
        goto cleanup;
    }

    // initialise the cluster centers from the indices stored in the file
    InitClusters(pSamples, nNumSamples, pCluster, nCurClusterNum, nNumDim, pOrgCenterIndex);
    // run the dynamic clustering
    RunISODATA(pSamples, nNumSamples, pCluster, nNumDim, pOrgCenterIndex, &nCurClusterNum);
    // save the samples with their cluster labels
    SaveClusters("cluster.txt", (double**)pSamples, nNumSamples, pCluster, nCurClusterNum, nNumDim);
    // save the cluster centers
    SaveCenters("center.txt", pCluster, nCurClusterNum, nNumDim);
    nExitCode = EXIT_SUCCESS;

cleanup:
    // release the sample buffers
    for(i=0; i<MAX_SAMPLES; i++)
    {
        free(pSamples[i]);
    }
    free(pSamples);
    return nExitCode;
}
ps:使用或者转载请标明出处,禁止以商业为目的的使用。
如果有需要word版,或者是pdf版的,请与我联系,QQ:358536026