朴素贝叶斯法是基于贝叶斯定理与特征条件独立假设的分类方法。
学习的目标就是根据给定样本x,计算类别y;
1.学习目标为:f(x) = argmax_y p(y|x),即最大化后验概率
2.再根据贝叶斯定理,
p(y|x) = p(x|y)p(y)/p(x)
3.p(x)对于特定样本是不变的,所以
p(y|x) ∝ p(x|y)p(y),即只需比较分子部分的大小。
4.所以学习的过程就是计算
先验概率:p(y)
条件概率:p(x|y)
5.分类的时候取后验概率最大,相当于0-1损失下的期望风险最小化(证明过程可参见文末参考资料)。
优点:对于小样本数据表现很好,适合增量式训练,适合多分类
缺点:对数据的输入形式很敏感
c++源代码
#include <iostream>
#include <set>
#include <vector>
using namespace std;
//Training data: each column is one sample; row 0 holds feature x1,
//row 1 holds feature x2, row 2 holds the class label y.
#define M 3
#define N 15
//To keep the arithmetic simple, the values A2 = {S, M, L} are encoded as
//S=1, M=2, L=3; the labels Y = {1, -1} are encoded as Y = {1, 2}.
int A[M][N]= {
{1 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3},
{1 , 2 , 2 , 1 , 1 , 1 , 2 , 2 , 3 , 3 , 3 , 2 , 2 , 3 , 3},
{2 , 2 , 1 , 1 , 2 , 2 , 2 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2}
};
//One entry of the conditional-probability table for the first feature.
struct Px1
{
int x1;        // value of feature x1
int y;         // class label
double p_x1y;  // estimated p(x1 | y)
};
//One entry of the conditional-probability table for the second feature.
struct Px2
{
int x2;        // value of feature x2
int y;         // class label
double p_x2y;  // estimated p(x2 | y)
};
double p[2];   // priors: p[0] = p(Y = 1), p[1] = p(Y = 2)
Px1 px1[6];    // 3 feature values x 2 classes = 6 entries
Px2 px2[6];    // likewise for feature x2
//Estimate the prior p(y) and the conditional probabilities p(x1|y), p(x2|y)
//from the training table A by maximum-likelihood (frequency) counting,
//filling the globals p, px1 and px2 and printing each table.
//NOTE: a count of zero yields probability 0; Laplace smoothing (lambda = 1)
//would avoid that, as the closing remark of this article points out.
void calP()
{
    multiset<int> m_y;      // multiset so count() gives the frequency of each label
    set<int> x1, x2, y;     // distinct values of feature 1, feature 2, and the label
    // Single pass over the table: gather label frequencies and value sets.
    for(int i = 0; i < N; i++)
    {
        m_y.insert(A[2][i]);
        x1.insert(A[0][i]);
        x2.insert(A[1][i]);
        y.insert(A[2][i]);
    }
    // Priors: relative frequency of each class among the N samples.
    p[0] = m_y.count(1) / (double)N; // p(Y = 1)
    p[1] = m_y.count(2) / (double)N; // p(Y = 2)
    cout << endl << "************先验***********" << endl;
    cout << "p(Y = 1) = " << p[0] << endl;
    cout << "p(Y = 2) = " << p[1] << endl;
    cout << endl;
    cout << "*********条件概率********" << endl;
    // p(x1 | y): for every (label, value) pair, count co-occurrences in the
    // training table and divide by the frequency of that label.
    int j = 0;
    for(set<int>::iterator pos2 = y.begin(); pos2 != y.end(); pos2++)
    {
        for(set<int>::iterator pos3 = x1.begin(); pos3 != x1.end(); pos3++)
        {
            px1[j].y = *pos2;
            px1[j].x1 = *pos3;
            int count_x1y = 0;
            for(int k = 0; k < N; k++)
            {
                if(A[0][k] == px1[j].x1 && A[2][k] == px1[j].y)
                    count_x1y++;
            }
            px1[j].p_x1y = count_x1y / (double)m_y.count(px1[j].y); // p(x1 | y)
            j++;
        }
    }
    cout << "p(x1 | y):" << endl;
    for(j = 0; j < 6; j++)
    {
        cout << px1[j].x1 << " " << px1[j].y << " " << px1[j].p_x1y << endl;
    }
    // p(x2 | y): identical procedure for the second feature.
    j = 0;
    for(set<int>::iterator pos2 = y.begin(); pos2 != y.end(); pos2++)
    {
        for(set<int>::iterator pos3 = x2.begin(); pos3 != x2.end(); pos3++)
        {
            px2[j].y = *pos2;
            px2[j].x2 = *pos3;
            int count_x2y = 0;
            for(int k = 0; k < N; k++)
            {
                if(A[1][k] == px2[j].x2 && A[2][k] == px2[j].y)
                    count_x2y++;
            }
            px2[j].p_x2y = count_x2y / (double)m_y.count(px2[j].y); // p(x2 | y)
            j++;
        }
    }
    cout << "p(x2 | y):" << endl;
    for(j = 0; j < 6; j++)
    {
        cout << px2[j].x2 << " " << px2[j].y << " " << px2[j].p_x2y << endl;
    }
}
int main()
{
    // Echo the training data so the user can check the table.
    cout << "***********训练数据************" << endl;
    for(int i = 0; i < M; i++)
    {
        if(i == 0) cout << "X1: ";
        else if(i == 1) cout << "X2: ";
        else if(i == 2) cout << " Y: ";
        for(int j = 0; j < N; j++)
        {
            cout << " "<< A[i][j];
        }
        cout << endl;
    }
    calP(); // estimate prior and conditional probabilities into the globals
    // Read one sample (x1, x2) and score it under each class:
    // result[i] = p(y) * p(x1|y) * p(x2|y) with y = i + 1.
    int s_x1, s_x2;
    double result[2];
    cout << "*************预测***************" << endl;
    cout << endl << endl << "Input:";
    cin >> s_x1 >> s_x2;
    for(int i = 0; i < 2; i++)
    {
        int class_y = i + 1; // labels are encoded as 1 and 2
        // BUG FIX: initialize the factors to 0 so a feature value that never
        // appears in the table yields score 0 instead of reading garbage
        // from uninitialized locals.
        double s_px_1 = 0.0, s_px_2 = 0.0;
        for(int j = 0; j < 6; j++)
        {
            if(s_x1 == px1[j].x1 && px1[j].y == class_y)
                s_px_1 = px1[j].p_x1y;
            if(s_x2 == px2[j].x2 && px2[j].y == class_y)
                s_px_2 = px2[j].p_x2y;
        }
        result[i] = p[i] * s_px_1 * s_px_2;
    }
    cout << endl << "all results:";
    cout << result[0] << " " << result[1] << endl;
    // BUG FIX: the original selection loop compared result[i] < result[i+1]
    // for i up to 1, reading result[2] out of bounds (undefined behavior).
    // With only two classes, pick the index of the larger score directly;
    // ties go to class 1, matching the original initialization.
    int class_y = (result[1] > result[0]) ? 1 : 0;
    cout << "("<< s_x1 << "," << s_x2 << ")所属的类是:" << class_y + 1 << endl;
    return 0;
}
代码里用的是极大似然估计,但是可能存在概率为0的时候,这样可以使用贝叶斯估计。贝叶斯估计的时候lambda = 1,就是拉普拉斯平滑。
参考
http://blog.youkuaiyun.com/idmer/article/details/48809677