Bayes Classifier 分类

最新推荐文章于 2024-10-21 03:32:30 发布

最新推荐文章于 2024-10-21 03:32:30 发布 · 258 阅读

文章标签：

#c/c++

本文介绍如何使用C++实现简单的贝叶斯分类算法，并通过一个具体的天气数据集进行演示。文章首先解释了贝叶斯分类的基本原理，包括如何使用正态分布来拟合似然函数。随后给出了完整的C++代码实现细节，包括数据读取、概率计算及类别预测。

Bayes Classifier 分类

在模式识别的实际应用中，贝叶斯方法绝非只是“后验概率(posterior)正比于先验概率(prior)×似然(likelihood)”，即 $P(c\mid x)\propto P(c)\,p(x\mid c)$ 这个公式这么简单；一般而言，我们会用正态分布来拟合似然(likelihood)。

用正态分布拟合是什么意思呢？贝叶斯公式的右边有两个量。第一个是先验概率prior，求起来很简单：只需统计某一类数据在全部数据中所占的比例即可，比如在300个数据中A类数据占100个，那么A的先验概率就是1/3。第二个是似然likelihood，可以这么理解：对于每一类的训练数据，我们都用一个多元(multivariate)正态分布来拟合它们（即通过求得该类训练数据的均值和协方差矩阵来确定一个正态分布）；当进入一个新的测试数据之后，分别求取这个数据点在每个类别的正态分布下的概率密度值，再用这个值乘以该类别的先验概率prior，便得到（未归一化的）后验概率posterior，其中后验概率最大的类别即为分类结果。需要注意的是，下文的天气数据集各属性均为离散取值，因此代码中并未拟合正态分布，而是直接用频数来估计条件概率 P(属性=值|类别)。

C++实现简单贝叶斯分类

所用的数据文件为：weather.csv（下面以制表符对齐的形式展示；程序按逗号分割每一行，因此实际文件中各字段应以逗号分隔。第一行为属性名称，最后一列为类别）

outlook	temperature	humidity	windy	play
sunny	hot	high	FALSE	no
sunny	hot	high	TRUE	no
overcast	hot	high	FALSE	yes
rainy	mild	high	FALSE	yes
rainy	cool	normal	FALSE	yes
rainy	cool	normal	TRUE	no
overcast	cool	normal	TRUE	yes
sunny	mild	high	FALSE	no
sunny	cool	normal	FALSE	yes
rainy	mild	normal	FALSE	yes
sunny	mild	normal	TRUE	yes
overcast	mild	high	TRUE	yes
overcast	hot	normal	FALSE	yes
rainy	mild	high	TRUE	no

源代码：

/*
 * Simple naive Bayes classifier for categorical data.
 *
 * Reads a comma-separated table from "weather.csv" in the current working
 * directory. The first row holds the column names and the last column holds
 * the class label. The program estimates P(class) and P(attribute=value|class)
 * from frequency counts, then re-classifies every training row and prints the
 * unnormalized posterior of each class together with the predicted label.
 */
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
using namespace std;

vector<string> split(const string& src,const string& delimiter); // split a string on delimiter characters
void rejudge();                                                   // re-classify the rows that were read

vector<vector<string> > vect;   // parsed table: vect[0] is the header row, vect[1..] are the samples
map<string,int> category;       // set of class labels seen in the data (the mapped value is unused)
map<string,double> pro_map;     // "name=value|class=label" -> P(value|label), "class=label" -> P(label)

/*
 * Read-only lookup into pro_map.
 * Returns 0.0 for a key that was never counted, which is the value
 * operator[] would have produced, but without inserting a new entry
 * into the global table.
 */
static double lookup_probability(const string& key)
{
    const map<string,double>::const_iterator found=pro_map.find(key);
    return found==pro_map.end() ? 0.0 : found->second;
}

/*
 * Program entry point: load the table, turn counts into probabilities,
 * print every intermediate step and finally call rejudge().
 * Returns 0 in all cases (the original exit status is preserved).
 */
int main()
{
    // A plain relative path resolves against the working directory on every
    // platform; the former ".\\weather.csv" spelling only worked on Windows.
    ifstream readfile("weather.csv");
    if(!readfile)
    {
        cout<<"Fail to open file weather!"<<endl;
        return 0;
    }

    cout<<"读取原始数据如下:"<<endl;
    string strLine;
    while(getline(readfile,strLine))
    {
        // Drop the '\r' left behind by CRLF line endings so that it does not
        // become part of the class label in the last column.
        if(!strLine.empty()&&strLine[strLine.size()-1]=='\r')
        {
            strLine.erase(strLine.size()-1);
        }
        if(strLine.empty())
        {
            continue; // split() throws on an empty string; blank lines carry no data
        }
        cout<<strLine<<endl;

        const vector<string> fields=split(strLine,",");
        if(fields.empty())
        {
            continue; // the line contained delimiters only
        }
        if(!vect.empty()&&fields.size()!=vect[0].size())
        {
            // A row with a different column count would be indexed out of
            // bounds by the counting loop below, so it is reported and skipped.
            cerr<<"Skip malformed line: "<<strLine<<endl;
            continue;
        }
        vect.push_back(fields);
    }

    if(vect.empty())
    {
        cout<<"No data in file weather!"<<endl;
        return 0;
    }

    const vector<string>& header=vect[0];
    const vector<string>::size_type class_col=header.size()-1;           // index of the class column
    const string& class_name=header[class_col];
    const vector<vector<string> >::size_type sample_count=vect.size()-1; // the header row is not a sample

    // Count occurrences of every "attribute=value|class=label" pair and of
    // every "class=label" (the header row is skipped).
    for(vector<vector<string> >::size_type row=1;row<vect.size();++row)
    {
        const vector<string>& record=vect[row];
        const string class_key=class_name+"="+record[class_col];
        for(vector<string>::size_type col=0;col<class_col;++col)
        {
            pro_map[header[col]+"="+record[col]+"|"+class_key]++;
        }
        pro_map[class_key]++;
        category[record[class_col]]=1; // remember the label
    }

    // Conditional probabilities: count(value,label)/count(label).
    // The class counts are still raw counts at this point; they are only
    // normalized in the next loop.
    cout<<"统计过程如下:"<<endl;
    for(map<string,double>::iterator it=pro_map.begin();it!=pro_map.end();++it)
    {
        cout<<it->first<<":"<<it->second<<endl;
        const string::size_type bar=it->first.find('|');
        if(bar!=string::npos)
        {
            it->second/=pro_map[it->first.substr(bar+1)];
        }
    }

    // Class priors: count(label)/number of samples. Dividing by vect.size()
    // would also count the header row and underestimate every prior.
    cout<<"计算概率过程如下:"<<endl;
    for(map<string,double>::iterator it2=pro_map.begin();it2!=pro_map.end();++it2)
    {
        if(it2->first.find('|')==string::npos)
        {
            it2->second/=static_cast<double>(sample_count);
        }
        cout<<it2->first<<":"<<it2->second<<endl;
    }

    rejudge();
    return 0;
}

/*
 * Split src at every character contained in delimiter.
 * Empty fields (two adjacent delimiters, or a delimiter at either end of
 * the string) are dropped rather than returned as empty strings.
 * Throws a const char* message when src or delimiter is empty.
 */
vector<string> split(const string& src,const string& delimiter)
{
    if(src.empty())
    {
        throw "Empty string!";
    }
    if(delimiter.empty())
    {
        throw "Empty delimiter!";
    }

    vector<string> tokens;
    string::size_type begin=0; // start of the field currently being scanned
    string::size_type pos;
    while((pos=src.find_first_of(delimiter,begin))!=string::npos)
    {
        if(pos!=begin) // non-empty text between two delimiters
        {
            tokens.push_back(src.substr(begin,pos-begin));
        }
        begin=pos+1;
    }
    if(begin!=src.size()) // text after the last delimiter
    {
        tokens.push_back(src.substr(begin));
    }
    return tokens;
}

/*
 * Re-classify every sample stored in vect with the probabilities in pro_map.
 * For each row it prints the attribute values, the unnormalized posterior
 * P(label) * prod P(value|label) of every label in category, and the label
 * with the largest posterior. When no posterior is greater than 0 the
 * printed label is empty.
 */
void rejudge()
{
    cout<<"经过简单贝叶斯算法重新分类的结果如下:"<<endl;
    if(vect.size()<2)
    {
        return; // no header or no samples: nothing to classify
    }

    const vector<string>& header=vect[0];
    const vector<string>::size_type class_col=header.size()-1;
    const string& class_name=header[class_col];

    for(vector<vector<string> >::size_type row=1;row<vect.size();++row)
    {
        const vector<string>& record=vect[row];
        if(record.size()!=header.size())
        {
            continue; // cannot be matched against the header columns
        }

        // Echo the attributes only; the stored label is replaced by the prediction.
        for(vector<string>::size_type col=0;col<class_col;++col)
        {
            cout<<record[col]<<" ";
        }

        // Unnormalized posterior of every known label.
        map<string,double> posterior;
        for(map<string,int>::const_iterator label=category.begin();label!=category.end();++label)
        {
            const string class_key=class_name+"="+label->first;
            double probability=lookup_probability(class_key); // P(label)
            for(vector<string>::size_type col=0;col<class_col;++col)
            {
                // times P(value|label)
                probability*=lookup_probability(header[col]+"="+record[col]+"|"+class_key);
            }
            posterior[label->first]=probability;
        }

        // Pick the label with the largest posterior.
        string best_label;
        double best_probability=0;
        cout<<"后验概率:";
        for(map<string,double>::const_iterator cand=posterior.begin();cand!=posterior.end();++cand)
        {
            cout<<cand->first<<":"<<cand->second<<" ";
            if(cand->second>best_probability)
            {
                best_label=cand->first;
                best_probability=cand->second;
            }
        }
        cout<<"归类:"<<class_name<<"="<<best_label<<endl;
    }
}