import pandas
import numpy as np
# Column labels handed to read_csv through `names`.
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "high_income",
]
# Raw string: the previous literal depended on "\m" and "\i" being passed
# through as unrecognised escapes, which newer Python versions warn about.
# The resulting path is unchanged.
income = pandas.read_csv(r"D:\test\machineLearning\income.csv", names=columns)
print(income.head(2))
age workclass fnlwgt education education_num \
0 39 State-gov 77516 Bachelors 13
1 50 Self-emp-not-inc 83311 Bachelors 13
marital_status occupation relationship race sex \
0 Never-married Adm-clerical Not-in-family White Male
1 Married-civ-spouse Exec-managerial Husband White Male
capital_gain capital_loss hours_per_week native_country high_income
0 2174 0 40 United-States <=50K
1 0 0 13 United-States <=50K
# Show the distinct raw workclass labels. The call form of print works on
# both Python 2 and Python 3 for a single argument.
print(income["workclass"].unique())
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
' ?' ' Self-emp-inc']
# Encode workclass as a categorical. The plain constructor replaces
# Categorical.from_array, which current pandas releases no longer provide.
col = pandas.Categorical(income["workclass"])
print(col)
# .codes holds one integer per row: the position of that row's label within
# col.categories.
print(col.codes)
[State-gov, Self-emp-not-inc, Private, Private, Private, ..., Private, Private, Private, Self-emp-not-inc, Private]
Length: 629
Categories (7, object): [?, Federal-gov, Local-gov, Private, Self-emp-inc, Self-emp-not-inc, State-gov]
[6 5 3 3 3 3 3 5 3 3 3 6 3 3 3 3 5 3 3 5 3 3 1 3 3 2 3 0 3 3 2 3 3 1 6 3 3
3 3 5 3 5 3 3 3 1 3 3 6 3 3 3 3 1 4 3 3 3 3 3 3 0 3 3 3 3 3 3 4 0 3 3 5 3
3 3 3 0 3 2 3 3 3 3 3 3 2 3 3 1 3 3 3 3 2 2 5 3 3 1 3 3 5 3 3 4 0 3 2 3 3
3 5 3 3 3 4 2 3 3 3 3 3 6 3 3 3 3 0 3 3 3 5 3 3 1 5 3 3 3 4 3 3 3 3 3 3 3
3 0 2 3 3 3 0 3 3 5 3 3 0 3 4 3 5 3 1 6 3 2 3 6 3 3 6 3 3 3 3 3 2 3 3 3 1
3 5 0 3 6 3 3 2 3 1 3 3 1 3 5 3 0 3 2 6 3 3 3 4 3 2 3 3 3 4 3 3 3 3 3 3 0
3 3 2 3 0 5 3 3 3 3 3 3 3 6 6 3 2 3 3 5 6 0 3 3 3 3 3 3 3 3 2 2 3 5 3 3 3
3 3 3 3 3 3 4 0 3 3 3 3 3 5 2 3 3 3 3 3 3 3 2 3 3 3 1 3 3 3 3 3 3 2 3 3 3
4 0 3 3 3 3 3 3 3 3 3 3 3 4 3 3 0 5 3 3 5 3 3 3 5 3 3 3 5 5 0 3 5 3 3 3 3
2 3 3 3 3 3 3 3 3 6 3 6 5 0 0 3 3 3 3 3 2 0 1 3 3 3 5 3 3 3 3 3 3 5 3 3 3
1 3 6 3 3 2 3 3 5 3 3 3 3 3 2 3 3 3 3 3 3 3 5 6 3 3 3 0 3 4 3 3 6 3 3 3 3
5 0 3 2 3 3 3 6 3 3 3 2 6 3 3 6 3 3 3 1 3 4 2 0 0 5 3 3 3 2 2 3 2 3 3 1 3
3 3 3 3 3 0 3 4 3 3 5 3 3 3 3 0 3 2 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3
5 6 3 0 3 0 2 2 3 3 1 3 3 3 3 3 3 3 0 2 2 3 3 3 3 3 3 2 3 3 0 3 3 3 0 3 0
0 3 4 3 2 2 3 3 3 3 5 5 3 3 3 3 3 2 3 3 4 0 3 3 3 3 3 3 3 2 3 3 5 5 3 3 1
3 3 3 3 3 3 3 3 3 5 3 4 3 5 3 3 3 3 3 3 3 0 5 2 5 0 3 3 3 5 3 3 3 3 4 0 0
3 3 3 3 0 5 3 3 3 3 3 2 5 3 3 3 2 3 3 3 3 2 3 3 3 5 4 3 3 3 3 3 3 3 3 5 3]
# Look up the integer code of the "Private" label while `col` still holds the
# workclass categories (the loop below rebinds `col`). The raw labels carry
# leading whitespace, hence the strip().
# NOTE(review): this used to filter on a hard-coded 4, but in the category
# listing recorded above position 4 is "Self-emp-inc" and "Private" is
# position 3, so the private/public frames were split on the wrong class.
# The sample rows recorded after this cell predate the fix.
private_code = [str(label).strip() for label in col.categories].index("Private")
income["workclass"] = col.codes

# Replace each remaining text column with its integer category codes.
for name in ["education", "marital_status", "occupation", "relationship",
             "race", "sex", "native_country", "high_income"]:
    col = pandas.Categorical(income[name])
    income[name] = col.codes

private_income = income[income["workclass"] == private_code]
public_income = income[income["workclass"] != private_code]
print(private_income.head(2))
print(public_income.head(2))
age workclass fnlwgt education education_num marital_status \
54 47 4 109832 11 9 0
68 49 4 191681 15 10 2
occupation relationship race sex capital_gain capital_loss \
54 4 1 4 1 0 0
68 4 0 4 1 0 0
hours_per_week native_country high_income
54 60 26 0
68 50 26 1
age workclass fnlwgt education education_num marital_status \
0 39 6 77516 9 13 4
1 50 5 83311 9 13 2
occupation relationship race sex capital_gain capital_loss \
0 1 1 4 1 2174 0
1 4 0 4 1 0 0
hours_per_week native_country high_income
0 40 26 0
1 13 26 0
import math
import numpy as np
def calc_entropy(column):
    """Return the base-2 Shannon entropy of a column of class codes.

    Args:
        column: One-dimensional sequence of non-negative integers, one class
            code per row (a list, NumPy array or pandas Series).

    Returns:
        The entropy in bits. ``0.0`` for an empty column or a column that
        holds a single class.

    Raises:
        ValueError: If ``column`` contains negative values (raised by
            ``np.bincount``).
    """
    values = np.asarray(column)
    if values.size == 0:
        # Nothing to count; also avoids dividing by a zero length.
        return 0.0

    counts = np.bincount(values)
    probabilities = counts / float(len(values))

    entropy = 0.0
    for probability in probabilities:
        # Codes that never occur contribute nothing, and log(0) is undefined.
        if probability > 0:
            # Subtracting as we go (instead of negating at the end) keeps a
            # single-class column at 0.0 rather than -0.0.
            entropy -= probability * math.log(probability, 2)
    return entropy
# Notebook-style bare expression: an interactive session displays the result,
# but when run as a script statement the value is simply discarded.
calc_entropy([1 ,1 ,0 ,0 ,1 ])
0.97095059445466858
# Information gain of splitting the rows at the median age, worked by hand.
high_entropy = calc_entropy(income["high_income"])
median_age = income["age"].median()
left_age = income[income["age"] <= median_age]
right_age = income[income["age"] > median_age]

# float() forces true division. Under the Python 2 print-statement syntax this
# file was written in, int / int truncated both weights to 0, so the printed
# "gain" collapsed to the parent entropy. That is presumably why the figure
# recorded after this cell is far larger than what calc_information_gain
# reports for the same split.
total_rows = float(len(income))
info_gain = high_entropy - (
    left_age.shape[0] / total_rows * calc_entropy(left_age["high_income"])
    + right_age.shape[0] / total_rows * calc_entropy(right_age["high_income"])
)
print(info_gain)
0.756141116271
# np.bincount demo: the result has one slot per integer from 0 up to the
# largest input value, holding how often that integer occurs (0 if absent).
print(np.bincount([1, 1, 0, 0, 1, 3, 3, 5]))
[2 3 0 2 0 1]
def calc_information_gain(data, split_name, target_name):
    """Return the information gain of a median split of ``data`` on a column.

    Rows are divided into those whose ``split_name`` value is at or below the
    column median and those above it. The gain is the entropy of
    ``target_name`` over all rows minus the size-weighted entropy of the two
    groups.

    Args:
        data: pandas DataFrame containing both columns.
        split_name: Name of the column whose median defines the split.
        target_name: Name of the column of integer class codes passed to
            ``calc_entropy``.

    Returns:
        The information gain as a float; ``0.0`` when ``data`` has no rows.
    """
    total_rows = len(data)
    if total_rows == 0:
        # No rows means nothing to split (and no length to divide by).
        return 0.0

    # Everything below reads from the `data` argument; the earlier version
    # silently used the module-level `income` frame instead.
    parent_entropy = calc_entropy(data[target_name])
    median = data[split_name].median()
    at_or_below = data[data[split_name] <= median]
    above = data[data[split_name] > median]

    weighted_entropy = 0.0
    for subset in (at_or_below, above):
        # float() keeps this a true division on Python 2 as well.
        weight = subset.shape[0] / float(total_rows)
        weighted_entropy += weight * calc_entropy(subset[target_name])
    return parent_entropy - weighted_entropy
gain = calc_information_gain(income, "age", "high_income")
print(gain)

# Candidate split columns. This rebinds `columns`, replacing the CSV header
# list defined at the top of the file.
columns = ["age", "workclass", "education_num", "marital_status",
           "occupation", "relationship", "race", "sex", "hours_per_week",
           "native_country"]
information_gains = []
for sub in columns:
    gain = calc_information_gain(income, sub, "high_income")
    information_gains.append(gain)

# Position of the largest gain; information_gains[index] is that gain and
# columns[index] is the column that produced it.
index = information_gains.index(max(information_gains))
print(information_gains[index])
0.0501271848501
0.125533153029
def find_best_column(data, target_name, columns):
    """Return the largest information gain among the candidate columns.

    Despite the name, the value returned is the winning gain itself, not the
    name of the column that produced it; this is kept as-is for existing
    callers.

    Args:
        data: pandas DataFrame passed through to ``calc_information_gain``.
        target_name: Name of the target column.
        columns: Iterable of column names to evaluate as split candidates.

    Returns:
        The maximum value of ``calc_information_gain`` over ``columns``.

    Raises:
        ValueError: If ``columns`` is empty.
    """
    # Gains are computed per call. The earlier version appended to the
    # module-level `information_gains` list, so results from previous runs
    # (possibly for other columns or data) leaked into the maximum.
    return max(
        calc_information_gain(data, name, target_name) for name in columns
    )
# Notebook-style bare expression: an interactive session displays the returned
# value, but when run as a script statement it is discarded.
find_best_column(income,"high_income" ,columns)
0.12553315302923063