数据预处理与信息增益计算-CSDN博客

本文链接：https://blog.csdn.net/qiujiahao123/article/details/62889289

import pandas
import numpy as np

# The file is read with explicit column names (`names=`). index_col=False tells
# pandas not to treat the first column (age) as the row index.
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "high_income",
]
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences ("\m" and "\i" are invalid escapes in a normal literal).
income = pandas.read_csv(
    r"D:\test\machineLearning\income.csv", names=columns, index_col=False
)
print(income.head(2))

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   

        marital_status        occupation    relationship    race    sex  \
0        Never-married      Adm-clerical   Not-in-family   White   Male   
1   Married-civ-spouse   Exec-managerial         Husband   White   Male   

   capital_gain  capital_loss  hours_per_week  native_country high_income  
0          2174             0              40   United-States       <=50K  
1             0             0              13   United-States       <=50K

# List the distinct values of the "workclass" column.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(income["workclass"].unique())

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc']

# Encode the "workclass" labels as integer category codes.
# The Categorical constructor is used because Categorical.from_array is not
# available in current pandas releases.
col = pandas.Categorical(income["workclass"])
print(col)
# Show the integer code assigned to each row.
print(col.codes)

[State-gov, Self-emp-not-inc, Private, Private, Private, ..., Private, Private, Private, Self-emp-not-inc, Private]
Length: 629
Categories (7, object): [?, Federal-gov, Local-gov, Private, Self-emp-inc, Self-emp-not-inc, State-gov]
[6 5 3 3 3 3 3 5 3 3 3 6 3 3 3 3 5 3 3 5 3 3 1 3 3 2 3 0 3 3 2 3 3 1 6 3 3
 3 3 5 3 5 3 3 3 1 3 3 6 3 3 3 3 1 4 3 3 3 3 3 3 0 3 3 3 3 3 3 4 0 3 3 5 3
 3 3 3 0 3 2 3 3 3 3 3 3 2 3 3 1 3 3 3 3 2 2 5 3 3 1 3 3 5 3 3 4 0 3 2 3 3
 3 5 3 3 3 4 2 3 3 3 3 3 6 3 3 3 3 0 3 3 3 5 3 3 1 5 3 3 3 4 3 3 3 3 3 3 3
 3 0 2 3 3 3 0 3 3 5 3 3 0 3 4 3 5 3 1 6 3 2 3 6 3 3 6 3 3 3 3 3 2 3 3 3 1
 3 5 0 3 6 3 3 2 3 1 3 3 1 3 5 3 0 3 2 6 3 3 3 4 3 2 3 3 3 4 3 3 3 3 3 3 0
 3 3 2 3 0 5 3 3 3 3 3 3 3 6 6 3 2 3 3 5 6 0 3 3 3 3 3 3 3 3 2 2 3 5 3 3 3
 3 3 3 3 3 3 4 0 3 3 3 3 3 5 2 3 3 3 3 3 3 3 2 3 3 3 1 3 3 3 3 3 3 2 3 3 3
 4 0 3 3 3 3 3 3 3 3 3 3 3 4 3 3 0 5 3 3 5 3 3 3 5 3 3 3 5 5 0 3 5 3 3 3 3
 2 3 3 3 3 3 3 3 3 6 3 6 5 0 0 3 3 3 3 3 2 0 1 3 3 3 5 3 3 3 3 3 3 5 3 3 3
 1 3 6 3 3 2 3 3 5 3 3 3 3 3 2 3 3 3 3 3 3 3 5 6 3 3 3 0 3 4 3 3 6 3 3 3 3
 5 0 3 2 3 3 3 6 3 3 3 2 6 3 3 6 3 3 3 1 3 4 2 0 0 5 3 3 3 2 2 3 2 3 3 1 3
 3 3 3 3 3 0 3 4 3 3 5 3 3 3 3 0 3 2 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3
 5 6 3 0 3 0 2 2 3 3 1 3 3 3 3 3 3 3 0 2 2 3 3 3 3 3 3 2 3 3 0 3 3 3 0 3 0
 0 3 4 3 2 2 3 3 3 3 5 5 3 3 3 3 3 2 3 3 4 0 3 3 3 3 3 3 3 2 3 3 5 5 3 3 1
 3 3 3 3 3 3 3 3 3 5 3 4 3 5 3 3 3 3 3 3 3 0 5 2 5 0 3 3 3 5 3 3 3 3 4 0 0
 3 3 3 3 0 5 3 3 3 3 3 2 5 3 3 3 2 3 3 3 3 2 3 3 3 5 4 3 3 3 3 3 3 3 3 5 3]

# Look up the integer code of the "Private" label before the labels are
# overwritten with codes. The code depends on which labels occur in the data,
# so it must not be hard-coded. Labels are stripped because the raw values
# carry a leading space (" Private"). Raises ValueError if the label is absent.
workclass_labels = [str(label).strip() for label in col.categories]
private_code = workclass_labels.index("Private")

income["workclass"] = col.codes
# Encode the remaining categorical columns the same way.
for name in ["education", "marital_status", "occupation", "relationship", "race", "sex", "native_country", "high_income"]:
    col = pandas.Categorical(income[name])
    income[name] = col.codes

# Split the rows into private-sector workers and everyone else.
private_income = income[income["workclass"] == private_code]
public_income = income[income["workclass"] != private_code]
print(private_income.head(2))
print(public_income.head(2))

    age  workclass  fnlwgt  education  education_num  marital_status  \
54   47          4  109832         11              9               0   
68   49          4  191681         15             10               2   

    occupation  relationship  race  sex  capital_gain  capital_loss  \
54           4             1     4    1             0             0   
68           4             0     4    1             0             0   

    hours_per_week  native_country  high_income  
54              60              26            0  
68              50              26            1  
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          6   77516          9             13               4   
1   50          5   83311          9             13               2   

   occupation  relationship  race  sex  capital_gain  capital_loss  \
0           1             1     4    1          2174             0   
1           4             0     4    1             0             0   

   hours_per_week  native_country  high_income  
0              40              26            0  
1              13              26            0

import math
import numpy as np
def calc_entropy(column):
    """Return the base-2 Shannon entropy of the values in ``column``.

    ``column`` is handed directly to ``np.bincount``, so it must contain
    non-negative integers (for example category codes).
    """
    # Number of occurrences of every integer from 0 up to the largest value.
    occurrences = np.bincount(column)
    total = float(len(column))
    # Relative frequency of each integer value.
    frequencies = occurrences / total
    # Values that never occur are skipped: log(0) is undefined.
    weighted_logs = sum(p * math.log(p, 2) for p in frequencies if p > 0)
    return -weighted_logs

# Quick check on a small sample containing three 1s and two 0s.
calc_entropy([1,1,0,0,1])

0.97095059445466858

# Information gain obtained by splitting the rows at the median age.
high_entropy = calc_entropy(income["high_income"])
# The median age is the split threshold.
median_age = income["age"].median()
left_age = income[income["age"] <= median_age]
right_age = income[income["age"] > median_age]

# float() forces true division. Under Python 2, `int / int` truncates, which
# turns both branch weights into 0 and makes the "gain" equal the parent entropy.
total_rows = float(len(income))
info_gain = high_entropy - (
    left_age.shape[0] / total_rows * calc_entropy(left_age["high_income"])
    + right_age.shape[0] / total_rows * calc_entropy(right_age["high_income"])
)

print(info_gain)

0.756141116271

# np.bincount counts the occurrences of every integer from 0 up to the largest
# value: here 0 appears twice and 1 appears three times.
print(np.bincount([1,1,0,0,1,3,3,5]))

[2 3 0 2 0 1]

def calc_information_gain(data, split_name, target_name):
    """Return the information gain of splitting ``data`` at a column's median.

    Args:
        data: DataFrame holding both the split column and the target column.
        split_name: Name of the column to split on. Rows with a value
            ``<=`` the column median go left, the rest go right.
        target_name: Name of the column whose entropy is measured; it is
            passed to ``calc_entropy``.

    Returns:
        Entropy of the target column minus the size-weighted entropy of the
        two subsets.
    """
    # Work on the `data` argument, not on a module-level DataFrame, so the
    # function gives the right answer for whatever frame the caller passes in.
    parent_entropy = calc_entropy(data[target_name])
    median = data[split_name].median()
    left = data[data[split_name] <= median]
    right = data[data[split_name] > median]

    # float() keeps the weights fractional under Python 2 integer division.
    total_rows = float(len(data))
    weighted_entropy = 0
    for subset in (left, right):
        weight = subset.shape[0] / total_rows
        weighted_entropy += weight * calc_entropy(subset[target_name])
    return parent_entropy - weighted_entropy

# Information gain of the median split on "age".
gain = calc_information_gain(income, "age", "high_income")
print(gain)

# Candidate split columns; the information gain of each one is collected in order.
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]
information_gains = []
for sub in columns:
    gain = calc_information_gain(income, sub, "high_income")
    information_gains.append(gain)

# Position of the largest information gain.
index = information_gains.index(max(information_gains))

print(information_gains[index])

0.0501271848501
0.125533153029

def find_best_column(data, target_name, columns):
    """Return the largest information gain among the candidate columns.

    Args:
        data: DataFrame passed through to ``calc_information_gain``.
        target_name: Name of the target column.
        columns: Iterable of column names to evaluate as split candidates.

    Returns:
        The maximum value returned by ``calc_information_gain`` over
        ``columns``.

    Raises:
        ValueError: If ``columns`` is empty (``max`` of an empty list).

    NOTE(review): despite the name, this returns the gain value, not the
    column name. The return value is kept as-is for existing callers; confirm
    whether the column name was intended.
    """
    # Collect into a local list. Appending to a module-level list would carry
    # results over between calls and mix in gains from other data or columns.
    gains = [calc_information_gain(data, name, target_name) for name in columns]
    return max(gains)

# Example call; the result is neither stored nor printed here.
find_best_column(income,"high_income",columns)