Candidate-Elimination算法是数据挖掘中的一种概念学习算法,部分解决Find-S的不足,可以输出所有与训练样本一致的概念,同时利用概念间偏序关系来指导搜索,其伪代码描述如下
- Initialize Gto the set of most-general hypotheses in H
- Initialize Sto the set of most-specific hypotheses in H
- For each training example, d, do:
- If dis a positive example then:
- Remove from Gany hypotheses that do not match d
- For each hypothesis sin Sthat does not match d
- Remove sfrom S
- Add to S all minimal generalizations, h, of ssuch that:
- 1) h matches d
- 2) some member of Gis more general than h
- Remove fromSany hthat is more general than another hypothesis in S
- If dis a negative example then:
- Remove from Sany hypotheses that match d
- For each hypothesis gin Gthat matches d
- Remove gfrom G
- Add to G all minimal specializations, h, of gsuch that:
- 1) hdoes not match d
- 2) some member of Sis more specific than h
- Remove fromG any hthat is more specific than another hypothesis in G
花了几个小时的时间总算把这个算法用C++实现测试通过了,采用了STL中的二维Vector容器保存字符串,在调试部分浪费不少时间。主要是用VC++ 6.0查看STL容器中的变量值不太方便,比如Vector,需要在调式窗口里输入s._first,10 才可以看到全部的数据;也可以加输出的测试代码,但是比较麻烦。好了,废话不多说了,贴上代码,希望各位批评指正。
【概念挖掘需求】
基本的算法思想是,泛化边界初始化为全?,特化边界特化为全空集(@),
1、若遇到正实例,首先从G中删除不包含该实例的概念,然后对S删除与实例不相符的概念,同时做最小泛化,使其包含该实例
2、若遇到负实例,首先从S中删除包含了该实例的概念,然后对G删除包含了该实例的概念,同时做最小特化,注意最小特化集是与当前S一一枚举出可能的特化概念,删除那些符合该实例的概念
C++源码
- #include <iostream>
- #include <string>
- #include <vector>
- using namespace std;
- #define MAXLEN 7//输入每行的数据个数
- //每行为标号 outlook Temperature Humidity Wind PlayTennis Swiming
- //采用二维动态数组Vector保存处理
- vector <vector <string> > state;//实例集
- vector <vector <string> > S;//特化边界
- vector <vector <string> > G;//泛化边界
- vector <string> item(MAXLEN);//对应一行实例集
- string yes("Yes");
- string all("?");
- string white("@");//@表示空集
- string end("end");//输入结束
- //判定概念h是否包含实例d
- bool checkInclude(vector <string> h, vector <string> d){
- bool res = true;
- for(int i = 1; i < MAXLEN-1; i++){
- if(!h[i].compare(all) || !h[i].compare(d[i])) continue;
- else if(!h[i].compare(white) || h[i].compare(d[i])) res = false;
- else {
- cerr<<"error in checkInclude"<<endl;
- res = false;
- }
- }
- return res;
- }
- //对概念S[offset]做最小泛化,使得S[offset]包含d
- void doMinGeneral(int offset, vector <string> d){
- for(int i = 1; i < MAXLEN-1; i++){
- if(!S[offset][i].compare(white)) {
- S[offset][i] = d[i];
- }
- else if(S[offset][i].compare(d[i])) S[offset][i] = all;
- else continue;//包括相等和h[i]为all的情况
- }
- }
- //对概念G[offset]做最小特化,使得G[offset]包不含d,注意最小特化来自于和S的组合
- void doMinSpecific(int offset, vector <string> d, vector <string> s){
- for(int i = 1; i < MAXLEN-1; i++){
- if(G[offset][i].compare(s[i]) && !G[offset][i].compare(all)){
- if(s[i].compare(d[i])){
- //产生新的泛化边界,注意不唯一
- vector<string> temp (G[offset]);//先拷贝一份G[offset]
- temp[i] = s[i];
- G.push_back(temp);
- }
- }
- if(G[offset][i].compare(s[i]) && G[offset][i].compare(all)){
- cerr<<"doMinSpecific: error in data"<<endl;//G[offset][i]不为?且其与s[i]不同的情况
- }
- }
- }
- void input(){
- int i, j;
- string s;
- while(cin>>s,s.compare(end) != 0){//-1为输入结束
- item[0] = s;
- for( i = 1;i < MAXLEN; i++){
- cin>>item[i];
- }
- state.push_back(item);
- }
- //初始化
- for( i = 0;i < MAXLEN; i++){
- item[i] = white;
- }
- S.push_back(item);
- for( i = 0;i < MAXLEN; i++){
- item[i] = all;
- }
- G.push_back(item);
- }
- void output(){
- int i,j;
- cout<<"The specific boundary is:"<<endl;
- for(i = 0; i < S.size(); i++){
- for(j = 1;j < MAXLEN-1; j++){
- cout<<S[i][j]<<" ";
- }
- cout<<endl;
- }
- cout<<endl;
- cout<<"The general boundary is:"<<endl;
- for(i = 0; i < G.size(); i++){
- for(j = 1;j < MAXLEN-1; j++){
- cout<<G[i][j]<<" ";
- }
- cout<<endl;
- }
- cout<<endl;
- }
- void solve(){
- int i,j;
- for( i = 0; i < state.size(); i++){
- if(!state[i][MAXLEN-1].compare(yes)){
- //正实例的情况,首先从G中删除不包含该实例的概念,
- //然后对S删除与实例不相符的概念,同时做最小泛化,使其包含该实例
- for(j = 0; j < G.size(); j++){
- if(!checkInclude(G[j],state[i])) {
- G.erase(G.begin() + j);
- }
- }
- for(j = 0; j < S.size(); j++){
- doMinGeneral(j,state[i]);
- }
- }
- else {//负实例的情况,首先从S中删除包含了该实例的概念,
- //然后对G做最小特化,与当前S一一枚举出可能的特化概念,删除那些符合该实例的概念
- for(j = 0; j < S.size(); j++){
- if(checkInclude(S[j],state[i])){
- S.erase(S.begin() + j);
- }
- }
- int orginGSize = G.size();//先记下G原始大小
- for(j = 0; j < orginGSize; j++){
- doMinSpecific(j,state[i],S[0]);
- }
- G.erase(G.begin(), G.begin() + (orginGSize));//删除G[0]到G[orginGSize-1],但是要注意括号内的参数不同
- }
- }
- }
- int main(){
- input();
- solve();
- output();
- return 0;
- }
测试数据一
- 0 Sunny Warm High Strong Warm Yes
- 1 Sunny Warm Normal Strong Warm No
- 2 Rainy Cold High Strong Warm No
- 3 Sunny Warm High Strong Cool Yes
- end
测试结果一
- The specific boundary is:
- Sunny Warm High Strong ?
- The general boundary is:
- Sunny ? High ? ?
- ? Warm High ? ?
测试数据二
- 0 Sunny Warm High Strong Warm Yes
- 1 Sunny Warm Normal Strong Warm No
- 2 Rainy Cold High Strong Warm No
- 3 Sunny Warm High Strong Cool Yes
- end
- The specific boundary is:
- Sunny Warm High Strong ?
- The general boundary is:
- Sunny ? High ? ?
- ? Warm High ? ?
具体预测的方法是
1、如果给定的实例符合概念集合S,则一定去游泳
2、如果给定的实例不符合概念集合G,则一定不去游泳
3、如果给定的实例不符合概念集合S,但是符合概念集合G,则可能去游泳