// Two-class ("yes"/"no") Bayes classifier over tabular string data.
// Typical use: init_data() to load a file, statistics() to derive the
// per-attribute figures, then calculate() once per tuple to classify.
class Bayes{
private:
    int CiD_num = 0; // number of tuples belonging to the "yes" class
    int D_num = 0;   // total number of tuples in the data set
    vector<double> P_Ci_vector;    // filled by statistics(): [0] = CiD_num / D_num, [1] = 1 - [0]
    vector<vector<string>> datas;  // the data tuples, one vector of fields per row

    // One value ai that a discrete attribute A can take.
    struct attribute_item
    {
        string ai_name;      // name of the value ai
        int Dai_num = 0;     // number of tuples whose attribute A equals ai
        int aici_num = 0;    // number of those tuples that are in the "yes" class
        double P_xk_Ci = 0;  // P(xk|Ci)
    };

    // One attribute (column) A of the data set.
    struct attribute
    {
        string Ai_name;            // name of attribute A
        bool isStraggling = true;  // true when A is a discrete attribute
        double P_xk_Ci = 0;        // P(xk|Ci) when A is a continuous attribute
        //vector<string> Ai_value;

        // value name -> bookkeeping record for that value
        map<string, attribute_item> attributes_info;

        // Per-class mean of a continuous attribute.
        struct average
        {
            double yes_average = 0; // mean over the "yes" class
            double no_average = 0;  // mean over the "no" class
        } average;

        // Per-class spread of a continuous attribute.
        struct variance
        {
            double yes_variance = 0;
            double no_variance = 0;
        } variance;
    };

    vector<attribute> attr_vector; // all attributes, in column order

public:
    void init_data( string filename );
    void statistics( );
    string calculate( vector<string> tuple );
    friend void print( vector<vector<string >> data, int n, int m );

    // Returns a copy of the loaded data tuples.
    vector<vector<string>> get_datas( ) const
    {
        return datas;
    }
};
/*
*function calculate; 预测测试数据分类结果
parameter tuple 测试数据元组
*/
string Bayes::calculate( vector<string> tuple )
{
double P_yes = 1, P_no = 1;
for (int i = 0; i < attr_vector.size()-1; i++)
{
if (attr_vector[i].isStraggling)
{
P_yes *= attr_vector[i].attributes_info[tuple[i]].P_xk_Ci;
P_no *= 1 - attr_vector[i].attributes_info[tuple[i]].P_xk_Ci;
}
else
{
P_yes *= calculate_P( string2double( tuple[i] ), attr_vector[i].average.yes_average, attr_vector[i].variance.yes_variance );
P_no *= calculate_P( string2double( tuple[i] ), attr_vector[i].average.no_average, attr_vector[i].variance.no_variance );
}
}
return P_yes > P_no ? "Yes" : "No";
}
/*
*function statistics; 统计并计算Bayes算法所需要的数据
如CiD_num、D_num 、D中离散型属性Ak的值为xj的Ci类元组数、Ci类训练元组中连续型属性Ak的均值,标准差
*/
void Bayes::statistics( )
{
D_num = datas.size( );
CiD_num = 0;
int attr_num = attr_vector.size( ) - 1;
//离散型,统计数据
for (int i = 0; i < D_num; ++i)
{
if (datas[i][attr_num] == "yes")
CiD_num++;
}
for (int i = 0; i < D_num; ++i)
{
for (int j = 0; j < attr_num; ++j)
{
if (attr_vector[j].isStraggling)
{
attr_vector[j].attributes_info[datas[i][j]].Dai_num++;
datas[i][attr_num] == "yes" ? attr_vector[j].attributes_info[datas[i][j]].aici_num++ : 1;
}
}
}
/*离散型,拉普拉斯校准*/
bool flag1 = false;
for (int j = 0; j < attr_num; ++j)
{
if (attr_vector[j].isStraggling)
{
for (auto iterator = attr_vector[j].attributes_info.begin( );
iterator != attr_vector[j].attributes_info.end( );
iterator++)
{
if (iterator->second.aici_num == 0 || iterator->second.aici_num == iterator->second.Dai_num)
{
flag1 = true;
break;
}
}
}
if (flag1)
break;
}
if (flag1)
{
for (int j = 0; j < attr_num; ++j){
for (auto iterator = attr_vector[j].attributes_info.begin( );
iterator != attr_vector[j].attributes_info.end( );
iterator++)
{
iterator->second.aici_num++;
iterator->second.Dai_num+=2;
CiD_num++;
D_num+=2;
}
}
}
P_Ci_vector.push_back( CiD_num*1.0 / (1.0*D_num) );
P_Ci_vector.push_back( 1 - P_Ci_vector[0] );
for (int i = 0; i < attr_num; ++i)
{
if (attr_vector[i].isStraggling){
for (auto iter = attr_vector[i].attributes_info.begin( );
iter != attr_vector[i].attributes_info.end( );
iter++)
{
iter->second.P_xk_Ci = iter->second.aici_num * 1.0 / (iter->second.Dai_num * 1.0);
}
}
}
//连续型变量,假设服从正态分布
for (int j = 0; j < attr_num; j++)
{
if (!attr_vector[j].isStraggling)
{
vector<double> yes_value;
vector<double> no_value;
for (int i = 0; i < datas.size(); i++)
{
if (datas[i][attr_num] == "yes")
yes_value.push_back( string2double( datas[i][j] ) );
else
no_value.push_back( string2double( datas[i][j] ) );
}
attr_vector[j].average.yes_average = calculate_average( yes_value );
attr_vector[j].average.no_average = calculate_average( no_value );
attr_vector[j].variance.no_variance = calculate_variance( no_value );
attr_vector[j].variance.yes_variance = calculate_variance( yes_value );
}
}
}
/*
*function init_data;从文件读取数据
包括属性信息、所有数据元组
*parameter string filename ;文件名
*/
void Bayes::init_data( string filename )
{
int attribute_num = 0;
string data_line;
vector<string>works;
ifstream read( filename );
getline( read, data_line );
getline( read, data_line );
while (true)
{
getline( read, data_line );
works = get_word( data_line );
if (works.empty( ))
{
break;
}
if (works[0] == "attribute")
{
attribute attr_temp;
attribute_item attr_item_temp;
attr_temp.Ai_name = works[1];
for (int i = 2; i < works.size( ); ++i)
{
if (works[i] == "real")
{
attr_temp.isStraggling = false;
break;
}
attr_temp.isStraggling = true;
attr_item_temp.ai_name = works[i];
attr_temp.attributes_info[works[i]] = attr_item_temp;
}
attr_vector.push_back( attr_temp );
}
}
works.clear( );
while (getline( read, data_line ) && !data_line.empty( ))
{
works = get_word( data_line );
if (works[0] == "data")
{
continue;
}
datas.push_back( works );
}
}
/*
 * function string2double: convert a string to a double.
 * parameter str: the text to convert; leading whitespace is skipped and
 *                parsing stops at the first character that cannot be part
 *                of the number
 * return: the parsed value, or 0 when str holds no number at all
 */
double string2double( const string str )
{
    istringstream iss( str );
    // Initialised because extraction leaves the variable untouched when the
    // input is empty or whitespace only.
    double num = 0.0;
    iss >> num;
    return num;
}
// Arithmetic mean of the given values.
// An empty vector produces 0.0 / 0.0, i.e. NaN.
double calculate_average( vector<double> value )
{
    double total = 0.0;
    for (const double v : value)
        total += v;

    const int count = static_cast<int>( value.size( ) );
    return total / (count * 1.0);
}
// Spread of the given values around their mean: the square root of the mean
// squared deviation (dividing by the number of values), i.e. the value
// returned is a standard deviation rather than a variance.
double calculate_variance( vector<double> value )
{
    const double mean = calculate_average( value );

    double squared_sum = 0;
    for (const double v : value)
        squared_sum += pow( mean - v, 2.0 );

    return sqrt( squared_sum / (value.size( ) * 1.0) );
}
// Normal-distribution density at x:
//   1 / (sqrt(2*pi) * s) * e^( -(x - average)^2 / (2 * s^2) )
// where s is the `variance` argument, which the formula uses as the
// standard deviation.
double calculate_P( double x, double average, double variance )
{
    const double scale = 1.0 / sqrt( 2 * M_PI ) / variance;
    const double exponent = -(pow( x - average, 2 ) / 2.0 / pow( variance, 2 ));
    return scale * pow( M_E, exponent );
}
/*
 * function get_word: split a string into words.
 * Any run of the separator characters ' ', ',', '@', '{' and '}' ends a word;
 * empty words are never produced.
 * parameter str: the string to split
 * return: the words in order of appearance (empty when str holds only
 *         separators or nothing at all)
 */
vector<string> get_word( string str )
{
    const char separators[] = " ,@{}";
    vector<string> words;

    // Walk from one word start to the next using std::string searches, so the
    // input buffer is never written to and no platform-specific tokenizer is
    // needed.
    string::size_type begin = str.find_first_not_of( separators );
    while (begin != string::npos)
    {
        const string::size_type end = str.find_first_of( separators, begin );
        const string::size_type length = (end == string::npos) ? string::npos : end - begin;
        words.push_back( str.substr( begin, length ) );
        begin = str.find_first_not_of( separators, end );
    }
    return words;
}
#include "Bayes.h"
// Trains the classifier on "weather.txt", prints the training tuples, then
// classifies every non-blank line of "test.txt" and prints the answer.
int main( )
{
    const string filename1 = "weather.txt";
    Bayes bayes;
    bayes.init_data( filename1 );
    bayes.statistics( );

    cout << "trainning data :\n";
    // Size the table from what was actually loaded instead of assuming a
    // fixed 14 x 5 data set; the column count is that of the shortest row so
    // that print() never indexes past the end of a row.
    const vector<vector<string>> training = bayes.get_datas( );
    size_t columns = training.empty( ) ? 0 : training[0].size( );
    for (const auto &row : training)
    {
        if (row.size( ) < columns)
            columns = row.size( );
    }
    print( training, static_cast<int>( training.size( ) ), static_cast<int>( columns ) );

    const string filename2 = "test.txt";
    ifstream read( filename2 );
    cout << "test data :\n";
    if (!read)
        cerr << "cannot open " << filename2 << endl;

    string data_line;
    while (getline( read, data_line ))
    {
        const vector<string> tuple = get_word( data_line );
        // A line without any field cannot be classified.
        if (tuple.empty( ))
            continue;
        cout << " " << data_line << " answer: ";
        cout << bayes.calculate( tuple ) << endl;
    }

    // Keep the console window open until the user enters something.
    string str;
    cout << "press any key to exit!" << endl;
    cin >> str;
    return 0;
}
// Prints up to n rows and up to m fields per row of data to standard output.
// Each row starts with a space and every field is followed by " , ".
// Rows or fields that do not exist are skipped instead of being read out of
// range, so n and m may exceed the real dimensions of data.
void print( vector<vector<string >> data, int n, int m ){
    for (int i = 0; i < n && static_cast<size_t>( i ) < data.size( ); i++)
    {
        cout << " ";
        for (int j = 0; j < m && static_cast<size_t>( j ) < data[i].size( ); j++)
            cout << data[i][j] << " , ";
        cout << '\n';
    }
}
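// Bayes classification algorithm
// (Web-page footer captured together with the code: "latest recommended article, published 2024-10-13 16:02:34")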