Bayes Classifier by George

本文介绍了一个基于C++实现的朴素贝叶斯分类器,详细展示了从加载训练数据到预测过程的具体步骤。该分类器能处理文本分类任务,通过使用boost库解析训练文件,并实现了向量化表示和概率计算等功能。

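Input file format (the program reads the training file `train1` and the validation file `val1`), one sample per line: label\tword word …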

/*************************************************************************
 > File Name: bayesNB.cpp
 > Author: test
 ************************************************************************/
#include <assert.h>
#include<math.h>
#include<string.h>
#include<time.h>

#include<algorithm>
#include <cmath>
#include <cstddef>
#include<fstream>
#include<iostream>
#include <string>
#include<unordered_map>
#include<vector>

#include <boost/algorithm/string.hpp>
using namespace std;

//note: asure that each line of train file more than two words, (class and one word).
void loadData(const string src_file, vector<vector<string> > &data, vector<string> &classVec){
    fstream fin;
    fin.open(src_file.c_str());
    string line;
    vector<string> words, lineVec;
    int count=0;
    while(!fin.eof()) {
        if(getline(fin, line)) {
            //the format of train file, each line as: class w1 w2 w3...
            words = boost::split(words, line, boost::is_any_of(" \t"));
            //make sure each line has more than class one word
            assert (words.size()>=2);
            for (auto iter=words.begin()+1; iter != words.end(); ++iter) {
                lineVec.push_back(*iter);
            }
            data.push_back(lineVec);
            classVec.push_back(words[0]);
            count++;
            words.clear();
            lineVec.clear();
        }
    }
    fin.close();
}

void getDictFromData(vector<vector<string> > data, unordered_map<string, float> &vcab, vector<string> &ordered_vcab){
    for (auto vec_iter=data.begin(); vec_iter != data.end(); ++vec_iter) {
        for (auto iter=(*vec_iter).begin(); iter != (*vec_iter).end(); ++iter) {
            if (*iter != "")    vcab.insert({*iter, 0});
        }
    }
    for (auto map_iter=vcab.begin(); map_iter != vcab.end(); ++map_iter) ordered_vcab.push_back(map_iter->first);
}

void getMat(vector<vector<string> > data, unordered_map<string, float> vcab, vector<vector<float> > &trainMat, vector<string> ordered_vcab, bool isTest) {
    unordered_map<string, float> tmp_map;
    for (auto map_iter=vcab.begin(); map_iter != vcab.end(); ++map_iter) tmp_map.insert({map_iter->first, 0.0});
    const size_t vcab_size = ordered_vcab.size();
    //Vec represents a vec corresponds to the dictionary for each line, as: [0, 1, ....].
    float appear;
    vector<float> vecMat;   
    for (auto vec_iter=data.begin(); vec_iter != data.end(); ++vec_iter) {
        for (auto str_iter=(*vec_iter).begin(); str_iter != (*vec_iter).end(); ++str_iter) {
            if ((*str_iter) != "") {
            auto got = tmp_map.find(*str_iter);
            if (!isTest) assert (!(got == tmp_map.end()));
            if (got==tmp_map.end()) continue;
            tmp_map[*str_iter] = 1.0;
            }
        }
        for (auto vcab_iter=ordered_vcab.begin(); vcab_iter != ordered_vcab.end(); ++vcab_iter) {
            appear = tmp_map[*vcab_iter];
            vecMat.push_back(appear);
        }
        trainMat.push_back(vecMat);
        //erase vecMat and set tmp_map->second to zeros
        for (auto str_iter=(*vec_iter).begin(); str_iter != (*vec_iter).end(); ++str_iter) tmp_map[*str_iter] = 0.0;
        vecMat.clear();
    }
}

//probs mapping class to vec 
//prob mapping class to the probability
void getClassifyVec(vector<string> classVec, vector<vector<float> >trainMat, 
        unordered_map<string, vector<float> > &probs, unordered_map<string, float> &prob) {
    assert(classVec.size()==trainMat.size());
    assert(classVec.size()>0);
    const size_t nb_train = classVec.size();
    const size_t nb_dict = trainMat[0].size();
    vector<float> fvec;
    for (unsigned int i=0; i<nb_train; ++i) {
        if (probs.find(classVec[i]) == probs.end()) {
            prob.insert({classVec[i], 2});//the time start by 1, not 0, to avoid 0 times.   
            for (auto iter=trainMat[i].begin(); iter != trainMat[i].end(); ++iter) fvec.push_back(*iter+1.0);
            probs.insert({classVec[i], fvec});
            fvec.clear();
        }else{
            prob[classVec[i]] += 1;
            for (unsigned int j=0; j<nb_dict; ++j) probs[classVec[i]][j] += trainMat[i][j];
        }
    }
    //normalize the prob
    float total = 0.0;
    for (auto iter=prob.begin(); iter != prob.end(); ++iter) total += iter->second;
    for (auto iter=prob.begin(); iter != prob.end(); ++iter) iter->second /= total;
    total = 0.0;
    //normalize the probs
    for (auto vec_iter=probs.begin(); vec_iter != probs.end(); ++vec_iter) {
        for (auto iter=(vec_iter->second).begin(); iter != (vec_iter->second).end(); ++iter) total += *iter;
        for (auto iter=(vec_iter->second).begin(); iter != (vec_iter->second).end(); ++iter) *iter/=total;
        total = 0.0;
    }
}

//predictMat represents the probability matrix corresponding to the classes
//because the testMat is organised by the ordered_vcab
//classes introduced by classVec by clear the repeated elements
void predict(vector<vector<float> >testMat, vector<vector<float> > &predictMat, vector<string> &classes, vector<string> classVec,
        unordered_map<string, vector<float> > probs, unordered_map<string, float> prob) {
    //we introduce the log to avoid too small number 
    //also it gives a beautiful form
    //init classes by classVec
    for (auto iter=classVec.begin(); iter != classVec.end(); ++iter) classes.push_back(*iter);
    sort(classes.begin(), classes.end());
    auto iter=unique(classes.begin(), classes.end());
    classes.erase(iter, classes.end());
    vector<float> predict;
    float pclass;
    float p=0.0;
    for (unsigned int i=0; i<testMat.size(); ++i) {
        for (auto iter=classes.begin(); iter != classes.end(); ++iter) {
            pclass=prob[*iter];
            assert(probs[*iter].size()==testMat[i].size());
            for (int j=0; j<probs[*iter].size(); ++j) p+=log(probs[*iter][j])*testMat[i][j];
            p += log(pclass);
            predict.push_back(p);
            p=0.0;
        }
        predictMat.push_back(predict);
        predict.clear();
    }
}

void predictClass(vector<vector<float> > predictMat, vector<string> classes,
        vector<string> &predictClasses) {
    size_t index=0;
    float maxEle;
    for (auto iter=predictMat.begin(); iter != predictMat.end(); ++iter) {
        maxEle=(*iter)[0];
        for (auto iter1=(*iter).begin(); iter1 != (*iter).end(); ++iter1) 
            maxEle = maxEle<(*iter1)?(*iter1):maxEle;   
        for (auto iter1=(*iter).begin(); iter1 != (*iter).end(); ++iter1) {
            if (maxEle==(*iter1)) break;
            index += 1;
        }
        predictClasses.push_back(classes[index]);   
        index = 0;
    }
}

int main(){
    time_t start, end;
    time(&start);
    string src_file = "train1";
    string test_file = "val1";
    vector<vector<string> > data, testData;
    vector<vector<float> > trainMat, testMat, predictMat;
    vector<string> classVec, testClassVec, classes, predictClasses;
    vector<string> ordered_vcab;
    unordered_map<string, float> vcab;
    unordered_map<string, vector<float> > probs;
    unordered_map<string, float> prob;
    loadData(src_file, data, classVec);
    getDictFromData(data, vcab, ordered_vcab);
    /*
    for (auto iter=data.begin(); iter != data.end(); ++iter) {
        for (auto iter1=(*iter).begin(); iter1 != (*iter).end(); ++iter1) {
            cout<<*iter1<<" ";
        }
        cout<<"\n";
    }
    */
    //for (auto iter=ordered_vcab.begin(); iter != ordered_vcab.end(); ++iter) cout<<*iter<<" ";
    //cout<<ordered_vcab.size()<<"\n";
    getMat(data, vcab, trainMat, ordered_vcab, 0); 
    /*
    for (auto iter=trainMat.begin(); iter != trainMat.end(); ++iter) {
        for (auto iter1=(*iter).begin(); iter1 != (*iter).end(); ++iter1) {
            cout<<*iter1<<" ";
        }
        cout<<"\n";
    }
    */
    getClassifyVec(classVec, trainMat, probs, prob);
    /*
    for (auto iter=probs.begin(); iter != probs.end(); ++iter) {
        cout<<iter->first<<"\n";
        auto tmp = &(iter->second);
        for (auto iter1=(*tmp).begin(); iter1 != (*tmp).end(); ++iter1) {
            cout<<*iter1<<" ";
        }
        cout<<"\n";
    }
    */
    loadData(test_file, testData, testClassVec);
    getMat(testData, vcab, testMat, ordered_vcab, 1);
    predict(testMat, predictMat, classes, classVec, probs, prob); 
    predictClass(predictMat, classes, predictClasses);
    /*
    for (auto iter=predictMat.begin(); iter != predictMat.end(); ++iter) {
        for (auto iter1=(*iter).begin(); iter1 != (*iter).end(); ++iter1) {
            cout<<*iter1<<" ";
        }
        cout<<"\n";
    }
    */
    //for (auto iter=predictClasses.begin(); iter !=  predictClasses.end(); ++iter) cout<<*iter<<"\n";
    time(&end);
    cout<<"running time: "<<difftime(end, start)<<"\n";
    return 0;
}
乐播投屏是一款简单好用、功能强大的专业投屏软件,支持手机投屏电视、手机投电脑、电脑投电视等多种投屏方式。 多端兼容与跨网投屏:支持手机、平板、电脑等多种设备之间的自由组合投屏,且无需连接 WiFi,通过跨屏技术打破网络限制,扫一扫即可投屏。 广泛的应用支持:支持 10000+APP 投屏,包括综合视频、网盘与浏览器、美韩剧、斗鱼、虎牙等直播平台,还能将央视、湖南卫视等各大卫视的直播内容一键投屏。 高清流畅投屏体验:腾讯独家智能音画调校技术,支持 4K 高清画质、240Hz 超高帧率,低延迟不卡顿,能为用户提供更高清、流畅的视觉享受。 会议办公功能强大:拥有全球唯一的 “超级投屏空间”,扫码即投,无需安装。支持多人共享投屏、远程协作批注,PPT、Excel、视频等文件都能流畅展示,还具备企业级安全加密,保障会议资料不泄露。 多人互动功能:支持多人投屏,邀请好友加入投屏互动,远程也可加入。同时具备一屏多显、语音互动功能,支持多人连麦,实时语音交流。 文件支持全面:支持 PPT、PDF、Word、Excel 等办公文件,以及视频、图片等多种类型文件的投屏,还支持网盘直投,无需下载和转格式。 特色功能丰富:投屏时可同步录制投屏画面,部分版本还支持通过触控屏或电视端外接鼠标反控电脑,以及在投屏过程中用画笔实时标注等功能。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值