#include"stdafx.h"
#include "stdlib.h"
#include<iostream>
#include<string>
#include<vector>
#include<map>
#include<algorithm>
#include<cmath>
#include<fstream>
using namespace std;
#define MAXLEN 6//定义的属性个数
ifstream fin("data.txt");
ofstream fout("result.txt");
//自定义的决策树节点
struct Node
{
string attribute;//属性值
string arrived_value;//到达的属性值
vector<Node *> childs;//所有的分差节点,vector装入
Node()
{
// 初始化的时候均设为空值
attribute = "";
arrived_value = "";
}
}*root;
vector <vector <string> > state;//保存整个实例集
vector <string> item(MAXLEN);//对应实例集中的某一行
vector <string> attribute_row;//保存首行即属性行数据
string attrName[MAXLEN] = { "RID", "model", "perfume", "price", "income", "buyaction" };
map<string, vector <string> > map_attribute_values;//存储属性对应的所有的值
int tree_size = 0;
// 将数据以文件流的形式输入
void DataInit()
{
string s;
int i, j;
while (fin >> s, s.compare("end") != 0)
{
//当输入结束时s.compare("end")的值为-1
item[0] = s; //第一个值保存的是RID
for (int i = 1; i < MAXLEN; i++)
{
fin >> item[i];
}
state.push_back(item);
}
for (j = 0; j < MAXLEN; j++)
{
attribute_row.push_back(attrName[j]); // 初始化属性栏值
}
fin.close();
}
// log2为底通过换底公式换成自然数底数
double lg2(double n)
{
return log(n) / log(2);
}
//根据数据实例存储属性对应的所有的值
void MapAttributeValue()
{
unsigned int i, j, k;
bool exited = false;
vector<string> values;
//按照列遍历(每个属性对应一列)
for (i = 1; i < MAXLEN - 1; i++)
{
for (j = 0; j < state.size(); j++)
{
for (k = 0; k < values.size(); k++)
{
//如果该属性值已经存在则退出循环
if (!values[k].compare(state[j][i]))
{
exited = true;
break;
}
}
//如果循环后的exited值仍为false说明该属性值并未出现过
//将其加入到暂时存放的向量中
if (!exited)
{
values.push_back(state[j][i]);
}
exited = false; //注意此处重置值
}
map_attribute_values[attrName[i]] = values; //相对应的放入
values.clear(); //为了存放下一次的属性值清空
}
}
//根据具体属性和值来计算熵
double Entropy(vector <vector<string> > remain_state, string attribute, string value, bool ifparent)
{
vector<int> count(2, 0);
int i, j;
bool flag = false;
for (j = 1; j < MAXLEN; j++)
{
if (flag) break;
if (!attribute_row[j].compare(attribute))
{
for (i = 1; i < remain_state.size(); i++)
{
if ((!ifparent&&!remain_state[i][j].compare(value)) || ifparent)
{//ifparent记录是否算父节点
if (!remain_state[i][MAXLEN - 1].compare("yes"))
{
count[0]++; //count[0]记录实例中yes的值
}
else count[1]++;//count[1]记录实例中no的值
}
}
flag = true;
}
}
//全部是正实例或者负实例
if (count[0] == 0 || count[1] == 0) return 0;
//具体计算熵
double sum = count[0] + count[1];
double entropy = -count[0] / sum * lg2((double)count[0] / sum)
- count[1] / sum * lg2((double)count[1] / sum);
return entropy;
}
//计算按照属性attribute划分当前剩余实例的信息增益
double Gain(vector <vector <string> > remain_state, string attribute)
{
unsigned int j, k, m;
//首先求不做划分时的熵,即根的熵值
double parent_entropy = Entropy(remain_state, attribute, "", true);
double children_entropy = 0;
//然后求做划分后各个值的熵
vector<string> values = map_attribute_values[attribute];
vector<double> ratio;//存放每一个属性值在实例中的期望
vector<int> count_values;
int tempint;//存放出现的次数
for (m = 0; m < values.size(); m++)
{
tempint = 0;
for (k = 1; k < MAXLEN - 1; k++)
{
if (!attribute_row[k].compare(attribute))
{
for (j = 1; j < remain_state.size(); j++)
{
if (!remain_state[j][k].compare(values[m]))
{
tempint++;
}
}
}
}
count_values.push_back(tempint);
}
for (j = 0; j < values.size(); j++)
{
ratio.push_back((double)count_values[j] / (double)(remain_state.size() - 1));
}
double temp_entropy;
for (j = 0; j < values.size(); j++)
{
temp_entropy = Entropy(remain_state, attribute, values[j], false);
children_entropy += ratio[j] * temp_entropy; //权值的累加
}
return (parent_entropy - children_entropy);
}
//根据属性名查找到对应的AttriNum
int FindAttriNumByName(string attri)
{
for (int i = 0; i < MAXLEN; i++)
{
if (!attrName[i].compare(attri)) return i;
}
cerr << "can't find the numth of attribute" << endl;
return 0;
}
//找出样例中占多数的正/负性
string MostCommonLabel(vector <vector <string> > remain_state)
{
int y = 0, n = 0;
int i;
for (i = 0; i < remain_state.size(); i++)
{
if (!remain_state[i][MAXLEN - 1].compare("yes")) y++;
else n++;
}
if (y >= n) return "yes";
else return "no";
}
//若子集只含有单个属性,则分支为叶子节点
//判断样例是否正负性都一致,即是否到达叶子节点
bool IsReachLeaveNode(vector <vector <string> > remain_state, string label)
{
int count = 0, i;
for (i = 0; i < remain_state.size(); i++)
{
if (!remain_state[i][MAXLEN - 1].compare(label)) count++;
}
if (count == remain_state.size() - 1) return true;
else return false;
}
//生成决策树的方法
Node * BulidDecisionTree(Node * p, vector <vector <string> > remain_state,
vector <string> remain_attribute)
{
int i;
if (p == NULL)
p = new Node();
//先看搜索到树叶的情况
if (IsReachLeaveNode(remain_state, "yes"))
{
p->attribute = "yes";
return p;
}
if (IsReachLeaveNode(remain_state, "no"))
{
p->attribute = "no";
return p;
}
//所有的属性均已经考虑完了,还没有分尽
if (remain_attribute.size() == 0)
{
string label = MostCommonLabel(remain_state);
p->attribute = label;
return p;
}
double max_gain = 0; //记录最大的熵值
double temp_gain;
//信息增益算出来不一定大于0
//max_it应该初始化为remain_attribute.begin()
vector <string>::iterator max_it = remain_attribute.begin();
vector <string>::iterator ittmp = remain_attribute.begin();
for (; ittmp < remain_attribute.end(); ittmp++)
{
temp_gain = Gain(remain_state, (*ittmp)); //求出指针对应属性的熵值
if (temp_gain > max_gain)
{//若找到更佳的属性则转化
max_gain = temp_gain;
max_it = ittmp;
}
}
//下面根据max_it指向的属性来划分当前样例,更新样例集和属性集
vector <string> new_attribute;//保存去除最佳属性后的剩余属性
vector <vector <string> > new_state;//保存剩余的实例
vector <string>::iterator it2 = remain_attribute.begin();
for (; it2 < remain_attribute.end(); it2++)
{
if ((*it2).compare(*max_it)) new_attribute.push_back(*it2);
}
//确定了最佳划分属性,注意保存
p->attribute = *max_it;
vector <string> values; //存放最佳属性的属性值
values = map_attribute_values[*max_it];
int attribue_num = FindAttriNumByName(*max_it);
new_state.push_back(attribute_row);//将第一行的属性先包括
vector <string>::iterator it3 = values.begin();
for (; it3 < values.end(); it3++)
{
for (i = 1; i < remain_state.size(); i++)
{
if (!remain_state[i][attribue_num].compare(*it3))
{
new_state.push_back(remain_state[i]);
}
}
Node * new_node = new Node();
new_node->arrived_value = *it3;
if (new_state.size() == 0)
{//表示当前没有这个分支的样例,当前的new_node为叶子节点
new_node->attribute = MostCommonLabel(remain_state);
}
else
BulidDecisionTree(new_node, new_state, new_attribute);
p->childs.push_back(new_node);//将新结点加入父节点孩子容器
//注意先清空new_state中的前一个取值的样例,准备遍历下一个取值样例
new_state.erase(new_state.begin() + 1, new_state.end());
}
return p;
}
//打印树的函数
void PrintTree(Node *p, int depth)
{
for (int i = 0; i < depth; i++)
{
fout << '\t';//按照树的深度先输出tab
cout << '\t';//按照树的深度先输出tab
}
if (!p->arrived_value.empty())
{
fout << p->arrived_value << endl;
cout << p->arrived_value << endl;
for (int i = 0; i < depth + 1; i++)
{
fout << '\t';//按照树的深度先输出tab
cout << '\t';//按照树的深度先输出tab
}
}
fout << p->attribute << endl;
cout << p->attribute << endl;
for (vector<Node*>::iterator it = p->childs.begin(); it != p->childs.end(); it++)
{
PrintTree(*it, depth + 1); //递归
}
}
//对单条数据,输出最后的结果
string TestClassifiter(string test[])
{
string ans = "";
Node* arrived_attibute = root; //初始化为root
bool isFind;
while (true)
{
isFind = false;
if (arrived_attibute->childs.size() == 1) break; //此时说明已经达到叶子节点
vector<Node*>::iterator it = arrived_attibute->childs.begin();
//循环扫描分叉中的值
for (; it != arrived_attibute->childs.end(); it++)
{
for (int k = 0; k < 4; k++)
{
if ((*it)->arrived_value == test[k])
{
isFind = true;
break;
}
}
if (isFind) break;
}
if (!isFind) break;
arrived_attibute = *it;
}
//叶子节点输出
return arrived_attibute->attribute;
}
int main()
{
int i;
DataInit(); //数据初始化
//remain_attribute存放需要划分的属性(一开始是所有属性)
vector <string> remain_attribute;
for (i = 1; i < MAXLEN - 1; i++)
remain_attribute.push_back(attrName[i]);
//remain_state存放实例集(一开始是整个实例集)
vector <vector <string> > remain_state;
for (unsigned int i = 0; i < state.size(); i++)
{
remain_state.push_back(state[i]);
}
MapAttributeValue();//将属性值映射到对应的属性
root = BulidDecisionTree(root, remain_state, remain_attribute);
fout << "the decision tree is :" << endl;
PrintTree(root, 0);
//以下是数据测试
string test[] = { "small", "strong", "expensive", "low" };
cout << "the test result is: " << TestClassifiter(test) << endl;
system("pause");
return 0;
}
data数据如下:
1 small strong expensive high no
2 small strong expensive a_bit_high no
3 small strong expensive common yes
4 small strong expensive low no
5 small strong a_litle_expensive high yes
6 small strong a_litle_expensive a_bit_high no
7 small strong a_litle_expensive common yes
8 small strong a_litle_expensive low no
9 small strong affordable high yes
10 small strong affordable a_bit_high yes
11 small strong affordable common yes
12 small strong affordable low no
13 small strong substantial high yes
14 small strong substantial a_bit_high no
15 small strong substantial common no
16 small strong substantial low no
17 small strong cheap high no
18 small strong cheap a_bit_high no
19 small strong cheap common no
20 small strong cheap low no
21 small elegant expensive high no
22 small elegant expensive a_bit_high no
23 small elegant expensive common yes
24 small elegant expensive low no
25 small elegant a_litle_expensive high no
26 small elegant a_litle_expensive a_bit_high yes
27 small elegant a_litle_expensive common no
28 small elegant a_litle_expensive low no
29 small elegant affordable high yes
30 small elegant affordable a_bit_high yes
31 small elegant affordable common no
32 small elegant affordable low no
33 small elegant substantial high yes
34 small elegant substantial a_bit_high yes
35 small elegant substantial common no
36 small elegant substantial low no
37 small elegant cheap high yes
38 small elegant cheap a_bit_high yes
39 small elegant cheap common no
40 small elegant cheap low no
41 small slight expensive high no
42 small slight expensive a_bit_high yes
43 small slight expensive common no
44 small slight expensive low no
45 small slight a_litle_expensive high yes
46 small slight a_litle_expensive a_bit_high no
47 small slight a_litle_expensive common yes
48 small slight a_litle_expensive low no
49 small slight affordable high no
50 small slight affordable a_bit_high no
51 small slight affordable common no
52 small slight affordable low no
53 small slight substantial high yes
54 small slight substantial a_bit_high no
55 small slight substantial common no
56 small slight substantial low no
57 small slight cheap high yes
58 small slight cheap a_bit_high no
59 small slight cheap common no
60 small slight cheap low no
61 smart strong expensive high yes
62 smart strong expensive a_bit_high no
63 smart strong expensive common no
64 smart strong expensive low no
65 smart strong a_litle_expensive high no
66 smart strong a_litle_expensive a_bit_high yes
67 smart strong a_litle_expensive common no
68 smart strong a_litle_expensive low no
69 smart strong affordable high yes
70 smart strong affordable a_bit_high yes
71 smart strong affordable common no
72 smart strong affordable low no
73 smart strong substantial high no
74 smart strong substantial a_bit_high no
75 smart strong substantial common no
76 smart strong substantial low no
77 smart strong cheap high no
78 smart strong cheap a_bit_high no
79 smart strong cheap common yes
80 smart strong cheap low no
81 smart elegant expensive high yes
82 smart elegant expensive a_bit_high no
83 smart elegant expensive common yes
84 smart elegant expensive low no
85 smart elegant a_litle_expensive high yes
86 smart elegant a_litle_expensive a_bit_high yes
87 smart elegant a_litle_expensive common yes
88 smart elegant a_litle_expensive low no
89 smart elegant affordable high yes
90 smart elegant affordable a_bit_high yes
91 smart elegant affordable common yes
92 smart elegant affordable low no
93 smart elegant substantial high yes
94 smart elegant substantial a_bit_high yes
95 smart elegant substantial common no
96 smart elegant substantial low no
97 smart elegant cheap high yes
98 smart elegant cheap a_bit_high no
99 smart elegant cheap common yes
100 smart elegant cheap low no
101 smart slight expensive high yes
102 smart slight expensive a_bit_high yes
103 smart slight expensive common no
104 smart slight expensive low no
105 smart slight a_litle_expensive high no
106 smart slight a_litle_expensive a_bit_high yes
107 smart slight a_litle_expensive common no
108 smart slight a_litle_expensive low no
109 smart slight affordable high no
110 smart slight affordable a_bit_high yes
111 smart slight affordable common yes
112 smart slight affordable low no
113 smart slight substantial high yes
114 smart slight substantial a_bit_high yes
115 smart slight substantial common yes
116 smart slight substantial low no
117 smart slight cheap high yes
118 smart slight cheap a_bit_high no
119 smart slight cheap common yes
120 smart slight cheap low no
121 middle strong expensive high yes
122 middle strong expensive a_bit_high no
123 middle strong expensive common yes
124 middle strong expensive low no
125 middle strong a_litle_expensive high no
126 middle strong a_litle_expensive a_bit_high no
127 middle strong a_litle_expensive common yes
128 middle strong a_litle_expensive low no
129 middle strong affordable high yes
130 middle strong affordable a_bit_high yes
131 middle strong affordable common no
132 middle strong affordable low no
133 middle strong substantial high no
134 middle strong substantial a_bit_high no
135 middle strong substantial common no
136 middle strong substantial low no
137 middle strong cheap high no
138 middle strong cheap a_bit_high yes
139 middle strong cheap common yes
140 middle strong cheap low no
141 middle elegant expensive high yes
142 middle elegant expensive a_bit_high no
143 middle elegant expensive common yes
144 middle elegant expensive low no
145 middle elegant a_litle_expensive high no
146 middle elegant a_litle_expensive a_bit_high yes
147 middle elegant a_litle_expensive common no
148 middle elegant a_litle_expensive low no
149 middle elegant affordable high no
150 middle elegant affordable a_bit_high yes
151 middle elegant affordable common yes
152 middle elegant affordable low no
153 middle elegant substantial high yes
154 middle elegant substantial a_bit_high no
155 middle elegant substantial common no
156 middle elegant substantial low no
157 middle elegant cheap high no
158 middle elegant cheap a_bit_high no
159 middle elegant cheap common yes
160 middle elegant cheap low no
161 middle slight expensive high yes
162 middle slight expensive a_bit_high yes
163 middle slight expensive common no
164 middle slight expensive low no
165 middle slight a_litle_expensive high yes
166 middle slight a_litle_expensive a_bit_high no
167 middle slight a_litle_expensive common no
168 middle slight a_litle_expensive low no
169 middle slight affordable high no
170 middle slight affordable a_bit_high yes
171 middle slight affordable common no
172 middle slight affordable low no
173 middle slight substantial high yes
174 middle slight substantial a_bit_high yes
175 middle slight substantial common yes
176 middle slight substantial low no
177 middle slight cheap high no
178 middle slight cheap a_bit_high yes
179 middle slight cheap common yes
180 middle slight cheap low no
181 large strong expensive high yes
182 large strong expensive a_bit_high no
183 large strong expensive common no
184 large strong expensive low no
185 large strong a_litle_expensive high no
186 large strong a_litle_expensive a_bit_high no
187 large strong a_litle_expensive common no
188 large strong a_litle_expensive low no
189 large strong affordable high no
190 large strong affordable a_bit_high no
191 large strong affordable common no
192 large strong affordable low no
193 large strong substantial high no
194 large strong substantial a_bit_high no
195 large strong substantial common no
196 large strong substantial low no
197 large strong cheap high yes
198 large strong cheap a_bit_high no
199 large strong cheap common yes
200 large strong cheap low no
201 large elegant expensive high yes
202 large elegant expensive a_bit_high yes
203 large elegant expensive common yes
204 large elegant expensive low no
205 large elegant a_litle_expensive high yes
206 large elegant a_litle_expensive a_bit_high no
207 large elegant a_litle_expensive common yes
208 large elegant a_litle_expensive low no
209 large elegant affordable high yes
210 large elegant affordable a_bit_high no
211 large elegant affordable common yes
212 large elegant affordable low no
213 large elegant substantial high no
214 large elegant substantial a_bit_high yes
215 large elegant substantial common no
216 large elegant substantial low no
217 large elegant cheap high yes
218 large elegant cheap a_bit_high no
219 large elegant cheap common yes
220 large elegant cheap low no
221 large slight expensive high no
222 large slight expensive a_bit_high no
223 large slight expensive common no
224 large slight expensive low no
225 large slight a_litle_expensive high yes
226 large slight a_litle_expensive a_bit_high yes
227 large slight a_litle_expensive common no
228 large slight a_litle_expensive low no
229 large slight affordable high yes
230 large slight affordable a_bit_high yes
231 large slight affordable common yes
232 large slight affordable low no
233 large slight substantial high yes
234 large slight substantial a_bit_high no
235 large slight substantial common yes
236 large slight substantial low no
237 large slight cheap high no
238 large slight cheap a_bit_high yes
239 large slight cheap common yes
240 large slight cheap low no
end