数据挖掘—概念学习Candidate-Elimination算法的C++实现

最新推荐文章于 2021-02-27 22:41:05 发布

转载最新推荐文章于 2021-02-27 22:41:05 发布 · 586 阅读

大数据专栏收录该内容

9 篇文章

订阅专栏

本文介绍了一种概念学习算法——候选消除算法，并详细展示了其在C++中的实现过程及测试案例。该算法能够输出与训练样本一致的所有概念，适用于概念挖掘任务。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Candidate-Elimination算法是数据挖掘中的一种概念学习算法，部分解决Find-S的不足，可以输出所有与训练样本一致的概念，同时利用概念间偏序关系来指导搜索，其伪代码描述如下

[cpp]view plaincopy 
   
 Initialize Gto the set of most-general hypotheses in H  
 Initialize Sto the set of most-specific hypotheses in H  
 For each training example, d, do:  
 If dis a positive example then:  
 Remove from Gany hypotheses that do not match d  
 For each hypothesis sin Sthat does not match d  
 Remove sfrom S  
 Add to S all minimal generalizations, h, of ssuch that:  
 1) h matches d  
 2) some member of Gis more general than h  
 Remove fromSany hthat is more general than another hypothesis in S  
 If dis a negative example then:  
 Remove from Sany hypotheses that match d  
 For each hypothesis gin Gthat matches d  
 Remove gfrom G  
 Add to G all minimal specializations, h, of gsuch that:  
 1) hdoes not match d  
 2) some member of Sis more specific than h  
 Remove fromG any hthat is more specific than another hypothesis in G  

花了几个小时的时间总算把这个算法用C++实现测试通过了，采用了STL中的二维Vector容器保存字符串，在调试部分浪费不少时间。主要是用VC++ 6.0查看STL容器中的变量值不太方便，比如Vector，需要在调式窗口里输入s._first,10 才可以看到全部的数据；也可以加输出的测试代码，但是比较麻烦。好了，废话不多说了，贴上代码，希望各位批评指正。

【概念挖掘需求】

基本的算法思想是，泛化边界初始化为全？，特化边界特化为全空集(@),

1、若遇到正实例,首先从G中删除不包含该实例的概念，然后对S删除与实例不相符的概念，同时做最小泛化，使其包含该实例

2、若遇到负实例,首先从S中删除包含了该实例的概念，然后对G删除包含了该实例的概念，同时做最小特化，注意最小特化集是与当前S一一枚举出可能的特化概念，删除那些符合该实例的概念

C++源码

[cpp]view plaincopy 
   
 #include <iostream>  
 #include <string>  
 #include <vector>  
 using namespace std;  
 #define MAXLEN 7//输入每行的数据个数  
   
 //每行为标号 outlook Temperature Humidity Wind PlayTennis Swiming  
 //采用二维动态数组Vector保存处理  
 vector <vector <string> > state;//实例集  
 vector <vector <string> >  S;//特化边界  
 vector <vector <string> >  G;//泛化边界  
 vector <string> item(MAXLEN);//对应一行实例集  
 string yes("Yes");  
 string all("?");  
 string white("@");//@表示空集  
 string end("end");//输入结束  
   
 //判定概念h是否包含实例d  
 bool checkInclude(vector <string> h, vector <string> d){  
     bool res = true;  
     for(int i = 1; i < MAXLEN-1; i++){  
         if(!h[i].compare(all) || !h[i].compare(d[i])) continue;  
         else if(!h[i].compare(white) || h[i].compare(d[i])) res = false;  
         else {  
             cerr<<"error in checkInclude"<<endl;  
             res = false;  
         }     
     }  
     return res;  
 }  
   
 //对概念S[offset]做最小泛化，使得S[offset]包含d  
 void doMinGeneral(int offset, vector <string> d){  
     for(int i = 1; i < MAXLEN-1; i++){  
         if(!S[offset][i].compare(white)) {  
             S[offset][i] = d[i];  
         }  
         else if(S[offset][i].compare(d[i])) S[offset][i] = all;  
         else continue;//包括相等和h[i]为all的情况  
     }  
 }  
   
 //对概念G[offset]做最小特化，使得G[offset]包不含d，注意最小特化来自于和S的组合  
 void doMinSpecific(int offset, vector <string> d, vector <string> s){  
     for(int i = 1; i < MAXLEN-1; i++){  
         if(G[offset][i].compare(s[i]) && !G[offset][i].compare(all)){  
             if(s[i].compare(d[i])){  
                 //产生新的泛化边界，注意不唯一  
                 vector<string> temp (G[offset]);//先拷贝一份G[offset]  
                 temp[i] = s[i];  
                 G.push_back(temp);  
             }  
         }  
         if(G[offset][i].compare(s[i]) && G[offset][i].compare(all)){  
             cerr<<"doMinSpecific: error in data"<<endl;//G[offset][i]不为？且其与s[i]不同的情况  
         }  
     }  
 }  
   
 void input(){  
     int i, j;  
     string s;  
     while(cin>>s,s.compare(end) != 0){//-1为输入结束  
         item[0] = s;  
         for( i = 1;i < MAXLEN; i++){  
             cin>>item[i];  
         }  
         state.push_back(item);  
     }  
   
     //初始化  
     for( i = 0;i < MAXLEN; i++){  
             item[i] = white;  
     }  
     S.push_back(item);  
     for( i = 0;i < MAXLEN; i++){  
             item[i] = all;  
     }  
     G.push_back(item);  
 }  
   
 void output(){  
     int i,j;  
     cout<<"The specific boundary is:"<<endl;  
     for(i = 0; i < S.size(); i++){  
         for(j = 1;j < MAXLEN-1; j++){  
             cout<<S[i][j]<<" ";  
         }  
         cout<<endl;  
     }  
     cout<<endl;  
     cout<<"The general boundary is:"<<endl;  
     for(i = 0; i < G.size(); i++){  
         for(j = 1;j < MAXLEN-1; j++){  
             cout<<G[i][j]<<" ";  
         }  
         cout<<endl;  
     }  
     cout<<endl;  
 }  
   
 void solve(){  
     int i,j;  
     for( i = 0; i < state.size(); i++){  
         if(!state[i][MAXLEN-1].compare(yes)){  
             //正实例的情况,首先从G中删除不包含该实例的概念，  
             //然后对S删除与实例不相符的概念，同时做最小泛化，使其包含该实例  
             for(j = 0; j < G.size(); j++){  
                 if(!checkInclude(G[j],state[i])) {  
                     G.erase(G.begin() + j);  
                 }  
             }  
             for(j = 0; j < S.size(); j++){  
                 doMinGeneral(j,state[i]);  
             }  
         }  
         else {//负实例的情况,首先从S中删除包含了该实例的概念，  
               //然后对G做最小特化，与当前S一一枚举出可能的特化概念，删除那些符合该实例的概念  
             for(j = 0; j < S.size(); j++){  
                 if(checkInclude(S[j],state[i])){  
                     S.erase(S.begin() + j);  
                 }  
             }  
             int orginGSize = G.size();//先记下G原始大小  
             for(j = 0; j < orginGSize; j++){  
                 doMinSpecific(j,state[i],S[0]);  
             }     
             G.erase(G.begin(), G.begin() + (orginGSize));//删除G[0]到G[orginGSize-1],但是要注意括号内的参数不同  
         }  
     }  
 }  
   
 int main(){  
     input();  
     solve();  
     output();  
     return 0;  
 }  

测试数据一

[plain]view plaincopy 
   
 0 Sunny Warm High Strong Warm Yes  
 1 Sunny Warm Normal Strong Warm No  
 2 Rainy Cold High Strong Warm No  
 3 Sunny Warm High Strong Cool Yes  
 end  

测试结果一

[plain]view plaincopy 
   
 The specific boundary is:  
 Sunny Warm High Strong ?  
   
 The general boundary is:  
 Sunny ? High ? ?  
 ? Warm High ? ?  

测试数据二

[plain]view plaincopy 
   
 0 Sunny Warm High Strong Warm Yes  
 1 Sunny Warm Normal Strong Warm No  
 2 Rainy Cold High Strong Warm No  
 3 Sunny Warm High Strong Cool Yes  
 end  

测试结果二

[plain]view plaincopy 
   
 The specific boundary is:  
 Sunny Warm High Strong ?  
   
 The general boundary is:  
 Sunny ? High ? ?  
 ? Warm High ? ?