Source code project files (VS2005): http://d.download.youkuaiyun.com/down/1018461/cctt_1
A while back I found a piece of code online; it needed a few changes, and I also wanted to use it as C++ practice.
First I needed a real tree structure, so I reused my old GeneralTree.h (with a few changes: some functions were added, and some private functions were made public).
The training data is formatted as follows. Save it as decision2.txt in the project directory, or change the path in the source. The first column (概念) is the class; the remaining columns are the attributes 颜色 (color), 形状 (shape) and 轻重 (weight):
概念 颜色 形状 轻重
苹果 红 球 一般
苹果 绿 球 一般
香蕉 黄 弯月 一般
草莓 红 球 轻
草莓 绿 球 轻
西瓜 绿 椭球 重
西瓜 绿 球 重
桔子 桔黄 椭球 轻
The test file is formatted as follows. Save it as test.txt in the project directory (again, feel free to adjust the source):
颜色 形状 轻重
红 球 一般
绿 球 一般
黄 弯月 一般
The individual classes really ought to live in separate files; they are kept together here only to make the post easier to read.
Here is the code:
/*
 * created by chico chen
 * date 2009/02/02
 * Please credit the source if you repost this code.
 */
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <vector>
#include <map>
#include <cmath>
#include <cstring>
#include "D:\\Tools\\not Finished\\TreeTest\\TreeTest\\GeneralTree.h"
using namespace std;

// This class computes the G value (conditional entropy) of one attribute.
class AttribDiff
{
public:
    string attribName;                         // attribute name, e.g. 颜色
    map<string,int> attribNum;                 // attribute value -> number of (visible) rows with that value
    map<string, map<string,int> > typeNumber;  // attribute value -> (class -> count)
    // For example, with the columns  visible  class  shape
    //                                1        西瓜  圆
    //                                1        冬瓜  扁
    //                                0        橘子  圆
    // typeNumber["圆"]["西瓜"] is the number of visible rows whose shape is 圆
    // and whose class is 西瓜.

    AttribDiff(string& attribName)
    {
        this->attribName = attribName;
    }

    // Compute the G value of attribute column i_attrib, counting only the rows marked visible.
    double AttribDifferComputer(vector<vector<string> > infos, int i_attrib, int i_types,
                                vector<int>& visible)
    {
        double probability = 0;
        double entropy = 0;
        double attribG = 0;
        map<string,int> temp;
        int tempNum = 0;
        for(int i = 0; i < infos.size(); i++)
        {
            if(visible[i] != 0)
            {
                tempNum = attribNum[infos[i][i_attrib]];
                attribNum[infos[i][i_attrib]] = ++tempNum;
                temp = typeNumber[infos[i][i_attrib]];
                tempNum = temp[infos[i][i_types]];
                temp[infos[i][i_types]] = ++tempNum;
                typeNumber[infos[i][i_attrib]] = temp;
            }
        }
        map<string,int>::iterator i_number;
        map<string,int>::iterator i_type;
        for(i_number = attribNum.begin(); i_number != attribNum.end(); i_number++)
        {
            probability = (*i_number).second / (double)infos.size();
            cout << (*i_number).first << "概率为:" << probability << endl;
            entropy = 0;
            for(i_type = typeNumber[(*i_number).first].begin();
                i_type != typeNumber[(*i_number).first].end(); i_type++)
            {
                entropy += ComputerEntropyHelp((*i_type).second / (double)(*i_number).second);
            }
            attribG += (-1) * probability * entropy;
        }
        return attribG;
    }

    // one term of the entropy sum: pi * log2(pi)
    double ComputerEntropyHelp(double pi)
    {
        return pi * log(pi) / log((double)2);
    }
};

// Data stored in each node of the general tree.
class NodeInfo
{
public:
    // e.g. attribName = 颜色, detailAttrib = { 红, 蓝 }
    string attribName;            // the attribute name, such as 颜色
    vector<string> detailAttrib;  // all values seen for this attribute, e.g. 红

    NodeInfo()
    {
        attribName = "";
    }
    NodeInfo(string& attribName)
    {
        this->attribName = attribName;
    }
    NodeInfo(NodeInfo& ni)
    {
        this->attribName = ni.attribName;
        this->detailAttrib = ni.detailAttrib;
    }

    // Add an attribute value to this NodeInfo, ignoring duplicates.
    void NodeDetailInfoAdd(string& detailA)
    {
        if(!CheckIsHas(detailA))
        {
            this->detailAttrib.push_back(detailA);
        }
    }

    // Return true if the value is already in the detailAttrib list, false otherwise.
    bool CheckIsHas(string& name)
    {
        for(int i = 0; i < detailAttrib.size(); i++)
        {
            if(strcmp(name.c_str(), detailAttrib[i].c_str()) == 0)
            {
                // same value
                return true;
            }
        }
        return false;
    }

    // Print callback used when printing the tree.
    static void Print(NodeInfo& info)
    {
        cout << info.attribName << "(";
        for(int i = 0; i < info.detailAttrib.size(); i++)
        {
            cout << info.detailAttrib[i] << " ";
        }
        cout << ")\n";
    }
};

// The decision tree itself.
class DT
{
protected:
    const string filename;            // training data file name
    vector<vector<string> > infos;    // all training rows
    vector<string> attribs;           // the attribute names (first line of the file)
    GeneralTree<NodeInfo> gt;         // the general tree that stores the decision tree
    const int START;                  // index of the first attribute column (the type column is excluded)
    const int I_TYPE;                 // index of the type (class) column
    const int MAX_ENTROPY;            // an upper bound used when searching for the minimal G value

private:
    // Print ".." indentation for the trace output.
    void PrintHelp(int helpPrint)
    {
        for(int i = 0; i < helpPrint; i++)
        {
            cout << "..";
        }
    }

    // Find the index of attribName in the attribs array, or -1 if absent.
    int Find(string& attribName, vector<string>& attribs)
    {
        for(int i = 0; i < attribs.size(); i++)
        {
            if(strcmp(attribName.c_str(), attribs[i].c_str()) == 0)
            {
                // found it
                return i;
            }
        }
        return -1;
    }

    // Detect whether the algorithm is finished on this branch, i.e. whether all
    // visible rows belong to a single class. That class is returned through type.
    bool CheckOver(vector<int>& visible, string& type)
    {
        map<string,int> types;
        int temp = 0;
        for(int i = 0; i < infos.size(); i++)
        {
            if(visible[i] != 0)
            {
                type = infos[i][I_TYPE];
                temp = types[infos[i][I_TYPE]];
                if(temp == 0)
                {
                    types[infos[i][I_TYPE]] = 1;
                }
                if(types.size() > 1)
                {
                    return false; // more than one class remains
                }
            }
        }
        return true; // only one class remains (or no visible rows at all)
    }

    // Recursively build the decision tree.
    void DTCreate(GeneralTreeNode<NodeInfo>* parent, vector<int> visible, vector<int> visibleA,
                  int i_attrib, string& detailA, int helpPrint)
    {
        if(i_attrib >= START)
        {
            for(int i = 0; i < infos.size(); i++)
            {
                if(strcmp(infos[i][i_attrib].c_str(), detailA.c_str()) != 0)
                {
                    // hide the rows that do not match this branch's attribute value
                    visible[i] = 0;
                }
            }
        }
        string type = "";
        if(CheckOver(visible, type))
        {
            // this branch is done: add a leaf node carrying the class
            NodeInfo n(type);
            GeneralTreeNode<NodeInfo>* node = new GeneralTreeNode<NodeInfo>(n);
            gt.Insert(node, parent);
            PrintHelp(helpPrint);
            cout << "decision type:" << n.attribName << endl;
            return;
        }
        // compute the G value of every attribute that is still available,
        // to decide which attribute should be used at this node
        map<string,double> attribGs;
        for(int i = START; i < attribs.size(); i++)
        {
            if(visibleA[i] != 0)
            {
                AttribDiff ad(attribs[i]);
                attribGs[attribs[i]] = ad.AttribDifferComputer(infos, i, I_TYPE, visible);
                cout << attribs[i] << "的G为:" << attribGs[attribs[i]] << endl;
            }
        }
        // pick the attribute with the smallest G value
        double min = MAX_ENTROPY;
        string attributeName;
        for(map<string,double>::iterator i = attribGs.begin(); i != attribGs.end(); i++)
        {
            if(min >= (*i).second)
            {
                attributeName = (*i).first;
                min = (*i).second;
            }
        }
        NodeInfo n(attributeName);
        int i_max = Find(attributeName, attribs);
        for(int i = 0; i < infos.size(); i++)
        {
            n.NodeDetailInfoAdd(infos[i][i_max]);
        }
        GeneralTreeNode<NodeInfo>* node = new GeneralTreeNode<NodeInfo>(n);
        gt.Insert(node, parent);
        visibleA[i_max] = 0; // this attribute may not be used again below this node
        PrintHelp(helpPrint);
        cout << "choose attribute:" << attributeName << endl;
        for(int i = 0; i < node->data.detailAttrib.size(); i++)
        {
            PrintHelp(helpPrint);
            cout << "go into the branch:" << node->data.detailAttrib[i] << endl;
            // recurse into every branch
            DTCreate(node, visible, visibleA, i_max, node->data.detailAttrib[i], helpPrint + 1);
        }
    }

public:
    // Note: decision2.txt must be in the project directory (or use an absolute path).
    // File format: the first column is the class, followed by the attribute columns, e.g.
    //   类型 形状
    //   西瓜 圆
    //   冬瓜 扁
    //   橘子 圆
    DT() : filename("decision2.txt"), START(1), I_TYPE(0), MAX_ENTROPY(10000)
    {
        GetInfo(attribs, infos, filename);
        DTCreate();
    }

    // Read the data file and fill the attribute array and the information array.
    // pre:  filename is not empty and the file exists
    // post: attribs has at least one element, infos has at least one element
    void GetInfo(vector<string>& attribs, vector<vector<string> >& infos, const string& filename)
    {
        ifstream read(filename.c_str());
        string info = "";
        getline(read, info);
        istringstream iss(info);
        string attrib;
        while(iss >> attrib)
        {
            attribs.push_back(attrib);
        }
        while(true)
        {
            info = "";
            getline(read, info);
            if(info == "" || info.length() <= 1)
            {
                break;
            }
            vector<string> infoline;
            istringstream stream(info);
            while(stream >> attrib)
            {
                infoline.push_back(attrib);
            }
            infos.push_back(infoline);
        }
        read.close();
    }

    // Build the decision tree from the training data.
    void DTCreate()
    {
        vector<int> visible(infos.size(), 1);
        vector<int> visibleA(attribs.size(), 1); // marks which attributes are still usable
        string temp = "";
        DTCreate(NULL, visible, visibleA, START - 1, temp, 0);
    }

    // Print the decision tree level by level.
    void Print()
    {
        gt.LevelPrint(NodeInfo::Print);
    }

    // Classify every row of the test file and write the results to testResultFileName.
    void Judge(const string& testFilename, vector<string>& types, const string& testResultFileName)
    {
        vector<string> attribs_test;
        vector<vector<string> > infos_test;
        GetInfo(attribs_test, infos_test, testFilename);
        if(!CheckFileFormat(attribs_test))
        {
            throw "file format error";
        }
        GeneralTreeNode<NodeInfo>* root = gt.GetRoot();
        for(int i = 0; i < infos_test.size(); i++)
        {
            types.push_back(JudgeType(root, infos_test[i], attribs_test));
        }
        WriteTestTypesInfo(testResultFileName, types);
    }

    void WriteTestTypesInfo(const string& filename, vector<string>& types)
    {
        ofstream out(filename.c_str());
        out << "类别" << endl;
        for(int i = 0; i < types.size(); i++)
        {
            out << types[i] << endl;
        }
        out.close();
    }

    // Walk down the tree from node and return the class of one test row.
    string JudgeType(GeneralTreeNode<NodeInfo>* node, vector<string>& info, vector<string>& attribs_test)
    {
        if(gt.GetChildNodeNum(node) == 0)
        {
            return node->getData().attribName;
        }
        int index = Find(node->getData().attribName, attribs_test);
        int branch_index = Find(info[index], node->getData().detailAttrib);
        if(branch_index == -1)
        {
            // This value was never seen for this node's attribute.
            // There are two ways to handle the situation:
            //   1. give every branch a chance to be chosen
            //   2. refuse to judge ("no such type")
            // The first lowers the accuracy; the second has no fault tolerance.
            // I chose the second here; if I find more free time later I will write the first.
            throw "no such type";
        }
        GeneralTreeNode<NodeInfo>* childNode = gt.GetAChild(node, branch_index);
        return JudgeType(childNode, info, attribs_test);
    }

    // The test file must use exactly the training attributes, minus the class column.
    bool CheckFileFormat(vector<string>& attribs_test)
    {
        bool isCorrect = true;
        for(int j = 0; j < attribs_test.size(); j++)
        {
            if(Find(attribs_test[j], attribs) == -1)
            {
                isCorrect = false;
            }
        }
        if(attribs_test.size() != attribs.size() - 1)
        {
            isCorrect = false;
        }
        return isCorrect;
    }
};
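One thing to note: GeneralTree.h is my own header and is not reproduced in this post (the download link at the top has the full project). If you just want to compile the code above without it, the DT class only touches a small part of the interface. The sketch below is a minimal stand-in reconstructed from the calls made above; it is a guess at the shape of the interface, not my actual GeneralTree implementation:

#include <vector>
#include <queue>

template <typename T>
class GeneralTreeNode
{
public:
    T data;                                     // node payload (DT accesses this directly)
    std::vector<GeneralTreeNode<T>*> children;  // child pointers, in insertion order

    GeneralTreeNode(T& d) : data(d) {}
    T& getData() { return data; }
};

template <typename T>
class GeneralTree
{
public:
    GeneralTree() : root(NULL) {}

    // attach node under parent; a NULL parent makes node the root
    void Insert(GeneralTreeNode<T>* node, GeneralTreeNode<T>* parent)
    {
        if(parent == NULL) { root = node; }
        else               { parent->children.push_back(node); }
    }

    GeneralTreeNode<T>* GetRoot() { return root; }

    int GetChildNodeNum(GeneralTreeNode<T>* node) { return (int)node->children.size(); }

    GeneralTreeNode<T>* GetAChild(GeneralTreeNode<T>* node, int i) { return node->children[i]; }

    // level-order traversal, calling print on every node's data
    void LevelPrint(void (*print)(T&))
    {
        if(root == NULL) return;
        std::queue<GeneralTreeNode<T>*> q;
        q.push(root);
        while(!q.empty())
        {
            GeneralTreeNode<T>* cur = q.front(); q.pop();
            print(cur->data);
            for(int i = 0; i < (int)cur->children.size(); i++) q.push(cur->children[i]);
        }
    }

private:
    GeneralTreeNode<T>* root;  // node memory is never freed here, which is good enough for this demo
};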
The main function looks like this (I am using VS2005):
int _tmain(int argc, _TCHAR* argv[])
{
    DT dt;
    //dt.Print();
    string testFile = "test.txt";
    string testResult = "testResult.txt";
    vector<string> types;
    dt.Judge(testFile, types, testResult);
    return 0;
}
The comments in DT are fairly detailed, so I won't explain much more here on my blog. The program writes the test results to testResult.txt in the project directory.
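For the test.txt shown earlier, all three rows go down the 一般 branch and then through the 颜色 node, so testResult.txt should come out like this:

类别
苹果
苹果
香蕉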
While the ID3 decision tree is being built, the console also prints a trace, for example:
红概率为:0.25
黄概率为:0.125
桔黄概率为:0.125
绿概率为:0.5
颜色的G为:1
球概率为:0.625
椭球概率为:0.25
弯月概率为:0.125
形状的G为:1.20121
轻概率为:0.375
一般概率为:0.375
重概率为:0.25
轻重的G为:0.688722
choose attribute:轻重
go into the branch:一般
红概率为:0.125
黄概率为:0.125
绿概率为:0.125
颜色的G为:0
球概率为:0.25
弯月概率为:0.125
形状的G为:0
..choose attribute:颜色
..go into the branch:红
....decision type:苹果
..go into the branch:绿
....decision type:苹果
..go into the branch:黄
....decision type:香蕉
..go into the branch:桔黄
....decision type:
go into the branch:轻
红概率为:0.125
桔黄概率为:0.125
绿概率为:0.125
颜色的G为:0
球概率为:0.25
椭球概率为:0.125
形状的G为:0
..choose attribute:颜色
..go into the branch:红
....decision type:草莓
..go into the branch:绿
....decision type:草莓
..go into the branch:黄
....decision type:
..go into the branch:桔黄
....decision type:桔子
go into the branch:重
..decision type:西瓜
What does this part of the output mean?
红概率为:0.25
黄概率为:0.125
桔黄概率为:0.125
绿概率为:0.5
颜色的G为:1
红, 黄, 桔黄 and 绿 are the individual values of the attribute 颜色, and the numbers are their probabilities. The entropy itself is not printed. If any of my juniors at the Chinese Academy of Sciences happen to see this code, you can print each entropy by adding a few lines to AttribDifferComputer(); the teacher will make you read the code anyway, so treat it as an exercise. (Incidentally, the decision-tree example in chapter 10 of the teacher's machine-learning slides contains a calculation error, if you work it through carefully.) The G of 颜色 is the decision value of that attribute: the smaller the decision value, the more likely the attribute is to be chosen.
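To see where these numbers come from, here is the hand calculation of G for 轻重 over the eight training rows (base-2 entropy, probabilities taken over all eight rows, exactly as the code computes them); it reproduces the 0.688722 printed in the trace above:

G(轻重) = P(一般)·H(类别|一般) + P(轻)·H(类别|轻) + P(重)·H(类别|重)
        = 3/8 · [ -(2/3)·log2(2/3) - (1/3)·log2(1/3) ]    (一般: 苹果×2, 香蕉×1)
        + 3/8 · [ -(2/3)·log2(2/3) - (1/3)·log2(1/3) ]    (轻:   草莓×2, 桔子×1)
        + 2/8 · [ -(2/2)·log2(2/2) ]                      (重:   西瓜×2)
        = 0.375·0.9183 + 0.375·0.9183 + 0.25·0
        ≈ 0.688722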
So what does the decision tree itself look like?
choose attribute:轻重
go into the branch:一般
..choose attribute:颜色
..go into the branch:红
......................
Look at the lines above. The root node is 轻重; we go into its 一般 branch; the node inside the 一般 branch is 颜色; then we go into the 红 branch, and so on. Pay attention to the ".." prefixes: lines with the same number of ".." are at the same level of the tree.
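Reading the whole trace off, the finished tree looks like this (a branch whose value never occurs on this path in the training data ends in an empty decision, as in the 桔黄/一般 and 黄/轻 cases above):

轻重
├─ 一般 → 颜色
│        ├─ 红   → 苹果
│        ├─ 绿   → 苹果
│        ├─ 黄   → 香蕉
│        └─ 桔黄 → (no training example, empty decision)
├─ 轻   → 颜色
│        ├─ 红   → 草莓
│        ├─ 绿   → 草莓
│        ├─ 黄   → (no training example, empty decision)
│        └─ 桔黄 → 桔子
└─ 重   → 西瓜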
I wrote this ID3 decision-tree code mainly so that my juniors can check their work for the exam. I have only tested it on the example from the teacher's slides, so I cannot guarantee it is correct for every data set. And the teacher's exam questions are brutal (around ten attributes); working one out by hand takes about an hour.
I regret not writing a program like this back then. Good luck on the exam (I suspect this code will turn up in a search right before it).
One final reminder: ID3 is not a particularly good algorithm. When two attributes have the same G value it offers no further criterion for breaking the tie, and if you simply pick them in order you may well end up with a decision tree that is not minimal. This point is worth further study.