DM&ML_note.2.1-ID3决策树

最新推荐文章于 2024-10-11 19:47:17 发布

原创最新推荐文章于 2024-10-11 19:47:17 发布 · 993 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#数据挖掘 #ID3决策树

DM-ML 专栏收录该内容

11 篇文章

订阅专栏

本文详细介绍了一个决策树算法的实现过程，包括前置知识要求、具体实现代码及注释说明。涉及决策树构建、熵与信息增益计算等核心概念。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

这个学期要学DM&ML，用的教材是王振武的《数据挖掘算法原理与实现》。本着造福同学的思想，开一个DM&ML的笔记系列，打算给书上的源代码添加一点注释，方便阅读和理解。

前置知识要求
勘误
具体实现
感想

前置知识要求：

C++，STL，树，深度优先搜索（DFS）
一点点数学（换底公式）

勘误！！！：

书上P71的图输出的运行结果并非提供的样例数据的结果，实际结果请用数据跑来看看。
在void Input()函数（287行）里，源代码使用全局变量end来传递参数，但是在我的VS2013环境下会报错，说end指向不明确（应该是因为using namespace std;之后，全局变量end与标准库的std::end重名），遂改为直接传入"end"。

具体实现：

/*hiro：本程序保留了书中源代码和注释，自行添加的代码和注释都有hiro:字样进行标识
我添加的注释仅供参考。*/
#include <algorithm>  
#include <cmath>  
#include <cstdlib>
#include <fstream>/*hiro:添加了文件读入*/
#include <iostream>  
#include <map>  
#include <string>  
#include <vector>  
using namespace std;  
#define MAXLEN 6// number of data items on every input line

// Ways to implement a multi-way tree   /*hiro: these are the common representations; this program only uses option 5*/
// 1 generalized list
// 2 parent-pointer representation, suited to frequent parent lookups
// 3 child-list representation, suited to frequent child lookups
// 4 left-child / right-sibling representation, more awkward to implement
// 5 every node keeps all of its children in a vector
// Lesson: data-structure design matters; option 5 fits this algorithm well. Also
// keep the remaining examples and remaining attributes up to date: while building the tree,
// the horizontal traversal is a loop over the values of an attribute,
// the vertical traversal is the recursive call.

vector <vector <string> > state;// the example set; row 0 is the header row, later rows are the examples
vector <string> item(MAXLEN);// buffer for one row of the example set while Input reads it
vector <string> attribute_row;// copy of the first row, i.e. the attribute names
string end("end");// end-of-input marker. NOTE(review): not read by the visible code (Input compares against the literal "end"); the name can clash with std::end under `using namespace std;`
string yes("yes");// label text of a positive example
string no("no");// label text of a negative example
string blank("");// empty string used to initialise Node fields and as a "no value" argument
map<string,vector < string > > map_attribute_values;// every value seen for each attribute, keyed by attribute name
int tree_size = 0;// incremented by FreeTree once per deleted node
// One node of the decision tree.
struct Node{
    // Attribute this node splits on; for a leaf it holds the class label instead.
    string attribute;
    // Value of the parent's attribute that leads to this node (stays empty for the root).
    string arrived_value;
    // Child nodes, in the order they were appended while building the tree.
    vector<Node *> childs;

    // Both strings start out as the global empty string `blank`.
    Node() : attribute(blank), arrived_value(blank) {}
};
Node * root;// root of the decision tree; a global pointer, so it starts out null, and main assigns it the result of BulidDecisionTreeDFS

// Fills map_attribute_values from the example set `state`.
// For every attribute column (columns 1 .. MAXLEN-2) the distinct values found in the
// data rows (rows 1 and up) are collected in order of first appearance and stored under
// the attribute name taken from the header row state[0].
// Example: a column "outlook" holding sunny / overcast / rainy produces
// map_attribute_values["outlook"] == {sunny, overcast, rainy}.
void ComputeMapFrom2DVector(){
    for(unsigned int col = 1; col < MAXLEN-1; col++){
        vector<string> distinct;  // fresh list for each column
        for(unsigned int row = 1; row < state.size(); row++){
            const string & cell = state[row][col];
            // push_back appends at the end, so first-seen order is preserved
            if(find(distinct.begin(), distinct.end(), cell) == distinct.end()){
                distinct.push_back(cell);
            }
        }
        map_attribute_values[state[0][col]] = distinct;
    }
}
// Computes the base-2 entropy of the yes/no labels in remain_state.
//   remain_state  example table; row 0 is skipped, data rows start at index 1
//   attribute     attribute name; it must match a header in attribute_row (columns 1 .. MAXLEN-1),
//                 otherwise nothing is counted and 0 is returned
//   value         a concrete value of that attribute, e.g. "sunny" for outlook (ignored when ifparent is true)
//   ifparent      true: count every data row (entropy before splitting);
//                 false: count only rows whose attribute column equals `value`
// A row counts as positive when its last column equals the global `yes`, negative otherwise.
// Returns 0 when either count is zero, i.e. the selected rows are all positive or all negative.
double ComputeEntropy(vector <vector <string> > remain_state, string attribute, string value,bool ifparent){
    // Find the first column whose header matches the attribute.
    unsigned int col = 1;
    while(col < MAXLEN && attribute_row[col] != attribute) col++;

    int positives = 0;
    int negatives = 0;
    if(col < MAXLEN){
        for(unsigned int row = 1; row < remain_state.size(); row++){
            if(!ifparent && remain_state[row][col] != value) continue;  // row belongs to another branch
            if(remain_state[row][MAXLEN - 1] == yes) positives++;
            else negatives++;
        }
    }

    if(positives == 0 || negatives == 0) return 0;

    // log2(x) is obtained through the change-of-base formula log(x)/log(2).
    double total = positives + negatives;
    double entropy = -(positives/total)*(log(positives/total)/log(2.0)) - (negatives/total)*(log(negatives/total)/log(2.0));
    return entropy;
}

// Computes the information gain obtained by splitting remain_state on `attribute`:
// entropy of all data rows minus the weighted sum of the entropies of the rows
// belonging to each value of the attribute (values come from map_attribute_values).
// The weight of a value is (rows holding that value) / (remain_state.size() - 1).
// Side effect: writes the intermediate entropies to cout as debugging output.
double ComputeGain(vector <vector <string> > remain_state, string attribute){
    // Entropy before any split.
    double before_split = ComputeEntropy(remain_state, attribute, blank, true);
    cout << " entropy(" << attribute << ")=" << before_split << endl;

    // Header columns (label column excluded) that carry this attribute name.
    vector<unsigned int> columns;
    for(unsigned int c = 1; c < MAXLEN - 1; c++){
        if(attribute_row[c] == attribute) columns.push_back(c);
    }

    vector<string> values = map_attribute_values[attribute];
    double after_split = 0;
    for(vector<string>::iterator v = values.begin(); v != values.end(); ++v){
        // How many data rows hold this value, e.g. sunny = 5 of 14.
        int occurrences = 0;
        for(unsigned int c = 0; c < columns.size(); c++){
            for(unsigned int row = 1; row < remain_state.size(); row++){
                if(remain_state[row][columns[c]] == *v) occurrences++;
            }
        }
        double weight = (double)occurrences / (double)(remain_state.size()-1);

        double branch_entropy = ComputeEntropy(remain_state, attribute, *v, false);
        cout << " entropy(" << *v << ")=" << branch_entropy << endl;
        after_split += weight * branch_entropy;
    }
    return (before_split - after_split);
}

// Returns the column index (0 .. MAXLEN-1) of the header in state[0] equal to `attri`.
// When no header matches, a message is written to cerr and 0 is returned;
// note that 0 is also a valid column index, so callers cannot tell the two apart.
int FindAttriNumByName(string attri){
    int column = 0;
    while(column < MAXLEN){
        if(state[0][column] == attri) return column;
        ++column;
    }
    cerr<<"can't find the numth of attribute"<<endl;
    return 0;
}

// Returns the majority label ("yes" or "no") among the examples in remain_state.
//   remain_state  example table whose row 0 is the header row; data rows start at index 1
// A row is positive when its last column equals the global `yes`, negative otherwise.
// A tie (including a table without data rows) yields `yes`.
string MostCommonLabel(vector <vector <string> > remain_state){
    int p = 0, n = 0;
    // Start at row 1: row 0 is the header, and counting it would add a bogus
    // negative example and turn every exact tie into "no".
    for(unsigned i = 1; i < remain_state.size(); i++){
        if(!remain_state[i][MAXLEN-1].compare(yes)) p++;
        else n++;
    }
    if(p >= n) return yes;
    else return no;
}

// Tells whether the examples in remain_state all carry the given label.
// Every row (row 0 included) is scanned for a last column equal to `label`; the result
// is true when the number of matches equals remain_state.size() - 1, i.e. all rows but
// one. In the callers shown that one row is the header row at index 0.
bool AllTheSameLabel(vector <vector <string> > remain_state, string label){
    int matching = 0;
    vector <vector <string> >::iterator row;
    for(row = remain_state.begin(); row != remain_state.end(); ++row){
        if((*row)[MAXLEN-1] == label) matching++;
    }
    return matching == remain_state.size()-1;
}

/*hiro:本实例用的是dfs建树，忘记了dfs（深度优先搜索）和树的请出门复习一下再回来
这里提供个dfs伪码的框架：
void    dfs(node){

    递归出口：
    if（满足条件）
        return node->child=生成的叶子结点；

    递归建树：
    for each i as child in node{
        dfs(node->child[i]);
    }
    return node;
}

*/
// Builds the ID3 decision tree depth-first, choosing the split attribute by information gain.
//   p                 node to fill in; when NULL (the call made for the root) a new node is allocated
//   remain_state      examples still to be classified; row 0 is the header row, data rows start at 1
//   remain_attribute  attributes not yet used on the path from the root
// Returns p, or the newly allocated node when p was NULL.
// Side effect: writes the gain of every candidate attribute to cout as debugging output.
Node * BulidDecisionTreeDFS(Node * p, vector <vector <string> > remain_state, vector <string> remain_attribute){
    if (p == NULL)
        p = new Node();

    // Recursion exits: the node becomes a leaf.
    // 1) every remaining example has the same label
    if (AllTheSameLabel(remain_state, yes)){
        p->attribute = yes;
        return p;
    }
    if (AllTheSameLabel(remain_state, no)){
        p->attribute = no;
        return p;
    }
    // 2) all attributes are used up but the examples are still mixed: take the majority label
    if(remain_attribute.size() == 0){
        p->attribute = MostCommonLabel(remain_state);
        return p;
    }

    // Pick the attribute with the largest gain. The comparison is strict and starts from 0,
    // so the first attribute wins ties and is also used when no gain is positive.
    double max_gain = 0, temp_gain;
    vector <string>::iterator max_it = remain_attribute.begin();
    for(vector <string>::iterator it1 = remain_attribute.begin(); it1 != remain_attribute.end(); ++it1){
        temp_gain = ComputeGain(remain_state, (*it1));
        cout << endl;
        cout << "gain(" << (*it1) << ")=" << temp_gain << endl;
        cout << endl;
        if(temp_gain > max_gain) {
            max_gain = temp_gain;
            max_it = it1;
        }
    }

    // Attributes left for the subtrees: everything except the chosen one.
    vector <string> new_attribute;
    for(vector <string>::iterator it2 = remain_attribute.begin(); it2 != remain_attribute.end(); ++it2){
        if((*it2).compare(*max_it))
            new_attribute.push_back(*it2);
    }

    // Record the chosen split attribute on this node.
    p->attribute = *max_it;
    vector <string> values = map_attribute_values[*max_it];
    int attribue_num = FindAttriNumByName(*max_it);

    // new_state always keeps the header row at index 0; the rows of the current
    // value are appended behind it and removed again after the branch is built.
    vector <vector <string> > new_state;
    new_state.push_back(attribute_row);
    for(vector <string>::iterator it3 = values.begin(); it3 != values.end(); ++it3){
        // Collect the examples that take this value, e.g. all rows with outlook == sunny.
        for(unsigned int i = 1; i < remain_state.size(); i++){
            if(!remain_state[i][attribue_num].compare(*it3)){
                new_state.push_back(remain_state[i]);
            }
        }
        // One child per value; its own attribute is determined by the recursion.
        Node * new_node = new Node();
        new_node->arrived_value = *it3;
        if(new_state.size() == 1){
            // Only the header row is present: no example takes this value, so the child
            // is a leaf labelled with the majority label of the parent's examples.
            // (The header makes size() at least 1, so a test against 0 would never fire.)
            new_node->attribute = MostCommonLabel(remain_state);
        }
        else
            BulidDecisionTreeDFS(new_node, new_state, new_attribute);
        // On the way back: attach the child, then drop this value's rows but keep the header.
        p->childs.push_back(new_node);
        new_state.erase(new_state.begin()+1,new_state.end());
    }
    return p;
}

// Reads the training table from "input.txt" into the globals `state` and `attribute_row`.
// The file is a sequence of whitespace-separated tokens, MAXLEN per row; the first row is
// the header (attribute names) and is stored in `state` like any other row. Reading stops
// at the token "end" or at end of file.
// On failure (file cannot be opened, a row is cut short, or no row at all was read) a
// message is written to cerr and the program exits with EXIT_FAILURE.
void Input(){
    ifstream fin("input.txt");
    if(!fin){
        cerr<<"can't open input.txt"<<endl;
        exit(EXIT_FAILURE);
    }

    string s;
    // Test the stream state as well as the sentinel: with a comma expression a failed
    // read leaves s unchanged and the loop would append rows forever.
    while(fin>>s && s.compare("end") != 0){
        item[0] = s;
        for(int i = 1;i < MAXLEN; i++){
            if(!(fin>>item[i])){
                cerr<<"input.txt: incomplete row, expected "<<MAXLEN<<" fields"<<endl;
                exit(EXIT_FAILURE);
            }
        }
        state.push_back(item);// the header row is stored as well
    }

    if(state.empty()){
        cerr<<"input.txt: no header row found"<<endl;
        exit(EXIT_FAILURE);
    }
    for(int j = 0; j < MAXLEN; j++){
        attribute_row.push_back(state[0][j]);
    }
}

// Prints the subtree rooted at p to cout in pre-order, indented with tabs.
//   p      node to print (must not be NULL)
//   depth  indentation level of this node
// A node reached through a value prints that value on a line at `depth` tabs and its
// attribute/label on the next line at `depth + 1` tabs; the root (empty arrived_value)
// prints only its attribute at `depth` tabs.
void PrintTree(Node *p, int depth){
    int tabs = depth;
    while(tabs-- > 0) cout << '\t';

    if(p->arrived_value.size() != 0){
        cout<<p->arrived_value<<endl;
        int more_tabs = depth+1;
        while(more_tabs-- > 0) cout << '\t';
    }
    cout<<p->attribute<<endl;

    for(unsigned int i = 0; i < p->childs.size(); i++){
        PrintTree(p->childs[i], depth + 1);
    }
}
// Deletes the subtree rooted at p in post-order (children first, then the node itself)
// and adds one to the global tree_size for every node deleted.
// A NULL p is accepted and ignored. A leaf has an empty childs vector, so the loop
// body never runs for it and the recursion ends there.
void FreeTree(Node *p){
    if (p != NULL){
        for(unsigned int i = 0; i < p->childs.size(); i++){
            FreeTree(p->childs[i]);
        }
        delete p;
        tree_size++;
    }
}

int main(){  
    Input();  
    vector <string> remain_attribute;  

    /*hiro：应该是因为C++的内存机制问题，需要新建一个字符串对象，占有了内存空间（堆）以后
    才能push进去，不然直接push一个常量字符串"Outlook"的话会导致这项内容只读，毕竟大家存的地方
    不一样*/
    string outlook("Outlook");  
    string Temperature("Temperature");  
    string Humidity("Humidity");  
    string Wind("Wind");  
    remain_attribute.push_back(outlook);  
    remain_attribute.push_back(Temperature);  
    remain_attribute.push_back(Humidity);  
    remain_attribute.push_back(Wind);  
    vector <vector <string> > remain_state;  
    for(unsigned int i = 0; i < state.size(); i++){  
        remain_state.push_back(state[i]);   
    }  
    ComputeMapFrom2DVector();  
    root = BulidDecisionTreeDFS(root,remain_state,remain_attribute);  
    cout<<"the decision tree is :"<<endl;  
    PrintTree(root,0);  
    FreeTree(root);  
    cout<<endl;  
    cout<<"tree_size:"<<tree_size<<endl;  
    return 0;  
}

感想：

……………………………………………..
这次的代码跟上次的不是同一个人写的吧！！！！是作者带的研究生们分别写的吧！！！！【不负责任的推测】
好吧感觉这份代码比上一份的要好一点，不过也是有相对少的看起来奇奇怪怪的地方（已经在注释中提及）。
其实我觉得这个决策树算法应该挺好用的，用来分析数据间的关系，估计很多统计软件都有实现。
不过需要注意的是：相关性≠因果关系
再次希望书上给的样例数据，能够把程序的每一个地方，有意义地跑完，不然靠脑补效果还是没有单步调试看得到效果的好。这次的数据里就没有“所有的属性均已经考虑完了,还没有分尽”的相关情况，导致带有label这个单词的函数大部分没有起作用。【虽然函数功能已经很明显】
还有一个比较有趣的公式P57的4-1，可以求编码的比特数，不知道这个比特数是不是满足哈夫曼树编码出来的最优条件，如果是的话，那么熵也是个很interesting的度量公式。
还有个小疑问，其实gain可以不用算出来吧，把entropy(S,outlook)的最小值求出来其实也一样了，嘛，反正都算了entropy(S)了，多一步减而已。
其实整个算法的时间我觉得主要是耗费在查表上，一些简单的步骤省略也在注释中提供。