题目:编程实现基于信息熵进行划分选择的决策树算法,并为西瓜数据集3.0上(P84表4.3)中的数据生成一棵决策树;
代码:
% Entry script: load the watermelon 3.0 dataset, encode the six discrete
% attributes (color, root, knock, texture, navel, touch) plus the label as
% numbers, then grow the decision tree over both discrete and continuous
% feature matrices.
clc;
clear;  % FIX: 'clear' is sufficient; 'clear all' also purges caches/breakpoints
[num,txt] = xlsread('D:\机器学习\WaterMelon_3.0.xlsx');
data = txt(2:end,[2:7,10]);      % six discrete attribute columns + class label
[rows,cols] = size(data);
D_value = zeros(rows,cols);      % FIX: preallocate instead of growing in the loop
for i = 1:rows
    for j = 1:cols
        % discrete value: map each category word to 0/1/2
        D_value(i,j) = string2num(data(i,j));
    end
end
% continuous values: density and sugar content (cols 8,9), plus the label
C_value = [num(:,[8,9]),D_value(:,7)];
make_tree(D_value,C_value);
%%
% make_tree: recursively partition the dataset and print every split.
%   data  - discrete feature matrix, one row per sample; the LAST column is
%           the class label (0/1 as produced by string2num).
%   data1 - continuous feature matrix (density, sugar content); its last
%           column is the same class label.
% The tree is not returned or stored anywhere — every node and data subset
% is reported via disp as a side effect.
function make_tree(data,data1)
[m,n] = size(data);
[m1,n1] = size(data1);
disp('/');
disp('待分数据集');
disp(data);
disp(data1);
label = data(:,n);
% count of samples sharing the first sample's label
same_class_num = length(find(data(:,n) == label(1,1)));
% stop recursion: all samples belong to one class, or only the label
% columns remain in both matrices (no features left to split on)
if same_class_num == m ||( n == 1 && n1 == 1)
disp('划分后的数据子集');
disp(data);
disp(data1);
return;
end
[best_feature,midle_data]= choose_bestfeature(data,data1);
% feature indices 1..6 denote discrete features; 7.. denote continuous
% ones (offset 6 = number of discrete attributes in the full dataset)
if best_feature<=6
disp("待分数据集的最佳特征序号为");
disp(best_feature);
disp("属于待分离散数据集");
elseif best_feature>6
disp("待分数据集的最佳特征序号为");
disp(best_feature-6);
disp("属于待分连续数据集");
disp("最佳增益时的二分点值");
disp(midle_data);
end
% chosen feature is DISCRETE: recurse once per distinct feature value
% (dummy C2D_value — it is not consulted for the discrete branch)
if best_feature<=6
C2D_value=ones(size(data1,1),1);
featvalue = unique(data(:,best_feature));
featvalue_num = length(featvalue);
for i=1:featvalue_num
[subdata,subdata1] = splitData1(data,data1,best_feature,featvalue(i,1),C2D_value);
make_tree(subdata,subdata1);
end
end
% chosen feature is CONTINUOUS: binarise against the best threshold and
% recurse on each side of the split
if best_feature>6
C2D_value=C2Dtranlate(data1,best_feature-6,midle_data);
featvalue = unique(C2D_value);
featvalue_num = length(featvalue);
for i=1:featvalue_num
[subdata,subdata1] = splitData1(data,data1,best_feature,featvalue(i,1),C2D_value);
make_tree(subdata,subdata1);
end
end
end
%%
% choose_bestfeature: pick the feature with the highest information gain.
%   data  - discrete features, last column = class label
%   data1 - continuous features, last column = class label
% Returns:
%   best_feature - index of the winner; 1..n-1 for discrete columns, or
%                  i+6 for continuous column i (offset past the six
%                  discrete attributes of the full dataset)
%   midle_data   - binary-split threshold for the winning CONTINUOUS
%                  feature (0 when the winner is discrete)
function [best_feature,midle_data]= choose_bestfeature(data,data1)
[rows, cols] = size(data);
base_entropy = calc_entropy(data);
midle_data = 0;
best_gain = 0;
best_feature = 0;
% discrete candidates: gain = H(D) - sum_v |Dv|/|D| * H(Dv)
for col = 1:cols-1
    values = unique(data(:,col));
    cond_entropy = 0;
    for v = 1:length(values)
        part = splitData(data, col, values(v,1));
        cond_entropy = cond_entropy + size(part,1)/rows * calc_entropy(part);
    end
    gain = base_entropy - cond_entropy;
    if gain > best_gain
        best_gain = gain;
        best_feature = col;
    end
end
% continuous candidates (skipped when only the label column remains)
if size(data1,2) >= 2
    for i = 1:size(data1,2)-1
        [C_best_gain, C_midle] = C_value_bestgain(data1,i);
        if C_best_gain > best_gain
            best_gain = C_best_gain;
            best_feature = i+6;
            % FIX: only commit the threshold when this feature actually
            % wins; the original overwrote midle_data on every iteration,
            % returning the LAST feature's threshold even when an earlier
            % continuous feature was the best one.
            midle_data = C_midle;
        end
    end
end
end
%%
% C_value_bestgain: best information gain for continuous column j of
% C_value (last column = class label), scanning every midpoint between
% consecutive sorted values as a candidate binary-split threshold.
% Returns the best gain and the midpoint achieving it.
function [best_gain,midle_data]= C_value_bestgain(C_value,j)
[m,n] = size(C_value);
C_value_sort = sortrows(C_value,j);
best_gain = 0;
midle_data = 0;                        % FIX: output was unassigned (runtime
                                       % error) when no midpoint beat gain 0
baseentropy = calc_entropy(C_value);   % loop-invariant: hoisted out of the loop
for i = 1:m-1
    % candidate threshold: midpoint of two adjacent sorted values
    midpoint = (C_value_sort(i,j)+C_value_sort(i+1,j))/2;
    C2D_value = C2Dtranlate(C_value,j,midpoint);
    C2D_value1 = [C2D_value,C_value(:,n)];
    m1 = size(C2D_value1,1);
    feature_value = unique(C2D_value1(:,j));
    new_entropy = 0;
    for t = 1:length(feature_value)
        subdata = splitData(C2D_value1, j, feature_value(t,1));
        new_entropy = new_entropy + size(subdata,1)/m1 * calc_entropy(subdata);
    end
    inf_gain = baseentropy - new_entropy;
    if inf_gain > best_gain
        best_gain = inf_gain;
        midle_data = midpoint;
    end
end
end
%%
% calc_entropy: Shannon entropy (base 2) of the class labels stored in the
% LAST column of data. Every class present has count >= 1, so no log2(0).
function [entropy]= calc_entropy(data)
labels = data(:,end);
classes = unique(labels);
% probability of each class
p = zeros(length(classes),1);
for k = 1:length(classes)
    p(k) = sum(labels == classes(k)) / numel(labels);
end
entropy = -sum(p .* log2(p));
end
%%
% splitData1: split BOTH the discrete matrix (data) and the continuous
% matrix (data1) in lock-step, keeping only rows that match `value`, and
% removing the chosen feature column from the matrix it belongs to.
%   j <= 6 : discrete feature j of data; c2d_value is ignored
%   j  > 6 : continuous feature j-6 of data1; rows are matched against the
%            binarised vector c2d_value
function [subdata,subdata1]= splitData1(data,data1,j,value,c2d_value)
if j <= 6
    keep = data(:,j) == value;        % rows matching the discrete value
    subdata = data(keep,:);
    subdata(:,j) = [];                % drop the consumed feature column
    subdata1 = data1(keep,:);
else
    jc = j - 6;
    rows1 = size(data1,1);
    keep = false(rows1,1);
    for i = 1:rows1
        keep(i) = c2d_value(i) == value;
    end
    subdata = data(keep,:);
    subdata1 = data1(keep,:);
    subdata1(:,jc) = [];              % drop the consumed continuous column
end
end
%%
% splitData: keep only the rows of data whose column j equals `value`,
% then drop column j from the result (the feature is consumed).
function [subdata]= splitData(data, j, value)
keep = data(:,j) == value;
subdata = data(keep,:);
subdata(:,j) = [];
end
%%
% C2Dtranlate: binarise continuous column j of C_value against the
% threshold midle_data: values <= threshold map to 0, values > threshold
% map to 1. The result keeps the m-by-j shape the callers index into
% (columns 1..j-1 are zero padding).
function [C2D_value]= C2Dtranlate(C_value,j,midle_data)
m = size(C_value,1);
% FIX: preallocate so every element is defined. The original only handled
% strict < and >, leaving values EQUAL to the threshold unassigned — ties
% are possible since thresholds are midpoints derived from the data.
C2D_value = zeros(m,j);
for k = 1:m
    if C_value(k,j) > midle_data
        C2D_value(k,j) = 1;
    end
end
end
%%
% string2num: map one category word (char vector or 1x1 cell, as sliced
% from the xlsread text output) to a numeric code:
%   0 - first value group (incl. 否/"no"), 1 - second group (incl. 是/"yes"),
%   2 - any other word (e.g. 乌黑, 蜷缩, 浊响, 清晰, 凹陷).
function num= string2num(string)
group0 = {'浅白','硬挺','清脆','模糊','平坦','软粘','否'};
group1 = {'青绿','稍蜷','沉闷','稍糊','稍凹','硬滑','是'};
if any(strcmp(string, group0))
    num = 0;
elseif any(strcmp(string, group1))
    num = 1;
else
    num = 2;
end
end
生成决策树:
西瓜数据集Excel文件到这里去找:
https://blog.youkuaiyun.com/macunshi/article/details/80756016
我的代码是参考这篇博客 https://www.cnblogs.com/Kermit-Li/p/4503427.html 中的代码改进的。原 ID3 算法只能处理色泽、根蒂、敲声、纹理等离散属性,不能处理密度、含糖率这样的连续属性;我在原代码的基础上增加了对连续值的二分划分处理。
上面的博客里有对离散值属性信息增益的计算过程,而对连续值的计算过程详解见这位的博客:
https://blog.youkuaiyun.com/leafage_m/article/details/80137305