2010年的一道题,做了2天。。。对文件操作的处理,还是不够熟练。还得多熟悉才行,不然心里没底。
从文本文件里读取训练样本,我始终没有想出好的办法。目前的做法比较笨:先扫描一遍文件,数出有多少行(也就是有多少个训练样本),再从头一行一行地读取到数组里。
KNN的描述大致为:
在一个文本文件里,每一行保存一个训练样本,有N_DIM个属性值,用空格分开,最后是一个分类,用“+”或“-”来表示。例如:
1.2 3.2 6.6 +
3.3 5.6 1.9 -
2.7 3.9 5.2 +
。。。
待分类实例也有N_DIM个属性,分别与训练样本计算出距离值SIM,然后选出SIM值最小的前K个训练样本来,以这K个训练样本分类数多的作为待分类实例的分类值。
SIM值按两个向量夹角的余弦值来计算,公式为 $\mathrm{sim}(V_1,V_2)=\dfrac{\sum_{i} V_{1,i}\,V_{2,i}}{\sqrt{\sum_{i} V_{1,i}^{2}}\;\sqrt{\sum_{i} V_{2,i}^{2}}}$。
在 KNN.h 头文件里(注意:下文 knn_app.c 中是以 "hjdsample.h" 这个名字包含它的,两处文件名需要保持一致)定义常量,以及读取数据、计算相关的函数:
#ifndef __HJDSAMPLE_INCLUDE__
#define __HJDSAMPLE_INCLUDE__
#define N_DIM 3 // number of attribute values stored on each sample line
#define N_KNN 3 // number of nearest training samples that take part in the vote
// One training sample; the same layout is also used for the instance being classified.
typedef struct SampleStruct
{
double rProperties[N_DIM]; // attribute values, N_DIM of them per sample
char cType; // class label character read from the file; knn_sim tests it against '+'
double rSim; // value knn_sim computes for this sample against the instance (originally commented as "distance")
} SampleStructType;
/*
 * Sort ar_samples[nS..nE] (both bounds inclusive) in ascending order of rSim,
 * so that the samples with the smallest rSim end up at the front.
 *
 * ar_samples: array of samples to reorder in place.
 * nS, nE:     first and last index of the range; nothing happens if nS >= nE.
 *
 * The previous partition step could stop on an element that had never been
 * compared with the key and then move it to the front, leaving the range
 * unsorted (e.g. rSim values 2,3,5,4,1 came out as 1,2,5,3,4). This version
 * uses a Hoare partition around the middle element and swaps whole structs by
 * assignment, so it does not depend on any helper declared later in the file.
 */
void SampleStructType_qsort(SampleStructType ar_samples[], int nS, int nE)
{
    if (ar_samples == NULL)
    {
        return;
    }
    while (nS < nE)
    {
        /* Middle pivot avoids the quadratic case on already sorted input. */
        double rKey = ar_samples[nS + (nE - nS) / 2].rSim;
        int i = nS;
        int j = nE;
        while (i <= j)
        {
            while (ar_samples[i].rSim < rKey)
            {
                i++;
            }
            while (ar_samples[j].rSim > rKey)
            {
                j--;
            }
            if (i <= j)
            {
                SampleStructType tmp = ar_samples[i];
                ar_samples[i] = ar_samples[j];
                ar_samples[j] = tmp;
                i++;
                j--;
            }
        }
        /* Here everything in [nS..j] is <= key and everything in [i..nE] is >= key.
           Recurse into the smaller side and loop on the larger one to bound the
           recursion depth. */
        if ((j - nS) < (nE - i))
        {
            SampleStructType_qsort(ar_samples, nS, j);
            nS = i;
        }
        else
        {
            SampleStructType_qsort(ar_samples, i, nE);
            nE = j;
        }
    }
}
/*
 * Count the lines of a text file, i.e. the number of training samples when
 * the file holds one sample per line.
 *
 * filename: path of the file to scan.
 * returns:  the number of lines; 0 if filename is NULL or the file cannot be
 *           opened.
 *
 * A final line that is not terminated by '\n' is counted as well, provided it
 * contains something other than blanks, so the last sample is not lost when
 * the file lacks a trailing newline.
 */
int getLineCount(const char * filename)
{
    if (filename == NULL)
    {
        return 0;
    }
    FILE *fp = fopen(filename, "r");
    if (fp == NULL)
    {
        /* Nothing was opened, so there is nothing to close. */
        return 0;
    }
    int nLineCount = 0;
    int bPendingText = 0; /* non-blank characters seen since the last '\n' */
    int ch;               /* int, not char: fgetc returns EOF out of band */
    while ((ch = fgetc(fp)) != EOF)
    {
        if (ch == '\n')
        {
            nLineCount++;
            bPendingText = 0;
        }
        else if (ch != ' ' && ch != '\t' && ch != '\r')
        {
            bPendingText = 1;
        }
    }
    if (bPendingText)
    {
        nLineCount++;
    }
    fclose(fp);
    return(nLineCount);
}
/*
 * Read up to nSamplesCount training samples from a text file into ar_samples.
 * Each sample consists of N_DIM numbers followed by one label character,
 * all separated by whitespace.
 *
 * filename:      path of the sample file.
 * ar_samples:    destination array with room for nSamplesCount entries.
 * nSamplesCount: maximum number of samples to read.
 * returns:       -1 if an argument is NULL or the file cannot be opened;
 *                otherwise the number of samples that were read completely.
 *                For a well-formed file this equals nSamplesCount; reading
 *                stops at the first incomplete or malformed sample.
 */
int hjd_readSamples(const char * filename, SampleStructType *ar_samples, int nSamplesCount)
{
    if (filename == NULL || ar_samples == NULL)
    {
        return -1;
    }
    FILE *fp = fopen(filename, "r");
    if (fp == NULL)
    {
        return -1; /* cannot open file; nothing to close */
    }
    int nRead = 0;
    while (nRead < nSamplesCount)
    {
        SampleStructType *pSample = &ar_samples[nRead];
        int bComplete = 1;
        for (int ndim = 0; ndim < N_DIM; ndim++)
        {
            if (fscanf(fp, "%lf", &pSample->rProperties[ndim]) != 1)
            {
                bComplete = 0;
                break;
            }
        }
        char cLabel = '\0';
        /* The leading blank in " %c" skips the separator before the label. */
        if (!bComplete || fscanf(fp, " %c", &cLabel) != 1)
        {
            break;
        }
        pSample->cType = cLabel;
        pSample->rSim = 0.0; /* defined value until knn_sim fills it in */
        nRead++;
    }
    fclose(fp);
    return(nRead);
}
/*
 * Read the instance to classify: N_DIM whitespace-separated numbers from the
 * start of the given file, stored into objInstance->rProperties.
 *
 * filename:    path of the instance file.
 * objInstance: destination; only rProperties is written.
 * returns:     1 when all N_DIM values were read, 0 when the file holds fewer
 *              valid numbers, -1 if an argument is NULL or the file cannot be
 *              opened.
 */
int hjd_readInstance(const char * filename, SampleStructType * objInstance)
{
    if (filename == NULL || objInstance == NULL)
    {
        return -1;
    }
    FILE *fp = fopen(filename, "r");
    if (fp == NULL)
    {
        return -1; /* cannot open file; nothing to close */
    }
    int nScanRes = 1;
    for (int ndim = 0; ndim < N_DIM; ndim++)
    {
        /* Every value must parse; a failure on any of them fails the read. */
        if (fscanf(fp, "%lf", &objInstance->rProperties[ndim]) != 1)
        {
            nScanRes = 0;
            break;
        }
    }
    fclose(fp);
    return(nScanRes);
}
/*
 * Dot product of the first nDIM attribute values of v1 and v2.
 * Returns the sum of the pairwise products (0.0 when nDIM <= 0).
 */
double vector_multiple(SampleStructType v1, SampleStructType v2, int nDIM)
{
    double dot = 0.0;
    int k = 0;
    while (k < nDIM)
    {
        dot += v1.rProperties[k] * v2.rProperties[k];
        k++;
    }
    return dot;
}
/*
 * Euclidean length of the first nDIM attribute values of v1:
 * the square root of the sum of their squares.
 */
double vector_abs(SampleStructType v1, int nDIM)
{
    double sumSquares = 0.0;
    int k = 0;
    while (k < nDIM)
    {
        sumSquares += v1.rProperties[k] * v1.rProperties[k];
        k++;
    }
    return sqrt(sumSquares);
}
/*
 * Exchange the contents of two samples (used by the sort).
 * The first nDIM attribute values, the label and the rSim value are swapped;
 * attribute values at index nDIM and above are left where they are.
 */
void SampleStructType_swap(SampleStructType *s1, SampleStructType *s2, int nDIM)
{
    int k;
    for (k = 0; k < nDIM; k++)
    {
        double heldValue = s1->rProperties[k];
        s1->rProperties[k] = s2->rProperties[k];
        s2->rProperties[k] = heldValue;
    }

    char heldType = s1->cType;
    s1->cType = s2->cType;
    s2->cType = heldType;

    double heldSim = s1->rSim;
    s1->rSim = s2->rSim;
    s2->rSim = heldSim;
}
/*
 * Classify objInstance from the training samples.
 *
 * For every sample the cosine of the angle between its attribute vector and
 * the instance's is stored in rSim. The samples are then sorted by ascending
 * rSim and the first N_KNN of them vote: the instance gets '+' when '+'
 * labels are in the strict majority, otherwise '-'.
 *
 * ar_samples:   training samples; reordered in place and rSim overwritten.
 * nSampleCount: number of entries in ar_samples.
 * objInstance:  instance to classify; only cType is written.
 *
 * If a pointer is NULL or nSampleCount <= 0 the function returns without
 * touching objInstance, since there is nothing to vote with.
 *
 * NOTE(review): the vote uses the samples with the SMALLEST cosine value,
 * which is what the accompanying problem description asks for. For a cosine
 * measure, larger usually means more similar; confirm this is intended.
 */
void knn_sim(SampleStructType ar_samples[], int nSampleCount, SampleStructType * objInstance)
{
    if (ar_samples == NULL || objInstance == NULL || nSampleCount <= 0)
    {
        return;
    }

    double rAbs2 = vector_abs(*objInstance, N_DIM);
    for (int ncount = 0; ncount < nSampleCount; ncount++)
    {
        double rDot = vector_multiple(ar_samples[ncount], (*objInstance), N_DIM);
        double rAbs1 = vector_abs(ar_samples[ncount], N_DIM);
        double rDenom = rAbs1 * rAbs2;
        /* A zero-length vector would give 0/0 = NaN, which breaks the ordering
           used by the sort; treat that case as a cosine of 0. */
        ar_samples[ncount].rSim = (rDenom > 0.0) ? (rDot / rDenom) : 0.0;
    }

    /* after calculating every SIM value, sort the samples */
    printf("\nSuccess calculate each rSim.\n");
    SampleStructType_qsort(ar_samples, 0, (nSampleCount - 1));
    printf("\nSuccess sort the samples.\n");

    /* Never look past the end of the array when there are fewer than N_KNN samples. */
    int nVoters = (nSampleCount < N_KNN) ? nSampleCount : N_KNN;
    int nP = 0;
    int nM = 0;
    for (int npos = 0; npos < nVoters; npos++)
    {
        if (ar_samples[npos].cType == '+')
        {
            nP++;
        }
        else
        {
            nM++;
        }
    }
    objInstance->cType = (nP > nM) ? '+' : '-';
}
#endif
在主程序里调用方法就可以了,knn_app.c的内容为:
/*--------------------------------------------------------*/
/* Implement a K-Nearest-Neighbor (KNN) learning algorithm */
/*--------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <math.h>
#include <sys/time.h>
#include "hjdsample.h"
/*
 * Entry point: load the training samples from ./samples.txt and the instance
 * from ./instances.txt, print them, classify the instance with knn_sim and
 * print the resulting label.
 *
 * Returns 0 on success and EXIT_FAILURE when the sample file is missing or
 * empty, memory cannot be allocated, or either file cannot be read. The
 * classification is no longer attempted on data that failed to load.
 */
int main(void)
{
    const char *szSamplesPath = "./samples.txt";
    const char *szInstancePath = "./instances.txt";

    int nSamplesCount = getLineCount(szSamplesPath); /* number of training samples */
    if (nSamplesCount <= 0)
    {
        fprintf(stderr, "No training samples found in %s\n", szSamplesPath);
        return EXIT_FAILURE;
    }

    /* Heap allocation: a variable-length array must not have size 0 and could
       overflow the stack for a large sample file. */
    SampleStructType *ar_samples = malloc((size_t)nSamplesCount * sizeof *ar_samples);
    if (ar_samples == NULL)
    {
        fprintf(stderr, "Out of memory for %d samples\n", nSamplesCount);
        return EXIT_FAILURE;
    }

    SampleStructType objInstance;
    int nRead = hjd_readSamples(szSamplesPath, ar_samples, nSamplesCount);
    if (nRead <= 0 || hjd_readInstance(szInstancePath, &objInstance) <= 0)
    {
        fprintf(stderr, "Cannot read %s or %s\n", szSamplesPath, szInstancePath);
        free(ar_samples);
        return EXIT_FAILURE;
    }

    for (int nPos = 0; nPos < nRead; nPos++)
    {
        for (int ndim = 0; ndim < N_DIM; ndim++)
        {
            printf("%lf ", ar_samples[nPos].rProperties[ndim]);
        }
        printf("%c\n", ar_samples[nPos].cType);
    }
    printf("\nthe instance is:\n");
    for (int ndim = 0; ndim < N_DIM; ndim++)
    {
        printf("%lf ", objInstance.rProperties[ndim]);
    }
    objInstance.cType = '\0';

    printf("\nStart to calculate.\n");
    knn_sim(ar_samples, nRead, &objInstance);
    printf("objInstance.cType=%c\n", objInstance.cType);

    free(ar_samples);
    return(0);
}
感觉真难呀,要在40分钟内写完这么多代码,我几乎做不到啊。。。