def splitDataSet(dataSet, axis, value):
    """Return the rows of dataSet whose feature at *axis* equals *value*,
    with that feature column removed.

    Args:
        dataSet: list of rows; each row is a list of feature values
                 (typically ending with a class label).
        axis: index of the feature column to split on.
        value: feature value a row must match to be kept.

    Returns:
        A new list of reduced rows; input rows are not mutated.
    """
    result = []
    for row in dataSet:
        if row[axis] == value:
            # copy the row minus the split column so the original
            # dataSet stays intact
            reduced = row[:axis] + row[axis + 1:]
            result.append(reduced)
    return result
splitDataSet(myDat, 0, 1)
[[1, 'yes'], [1, 'yes'], [0, 'no']]
# np.shape can get the shape of a nested (2-D) Python list
np.shape([[1,2],[2,3]])
(2, 2)
[[1,2],[2,2]].shape()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-163-db3819727eb2> in <module>()
----> 1 [[1,2],[2,2]].shape()
AttributeError: 'list' object has no attribute 'shape'
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index with the largest information gain (ID3).

    Information gain: H(D) - H(D|A), where the conditional entropy is
    H(D|A) = sum_i P_i * H(D | A = a_i).

    Args:
        dataSet: list of rows; every column but the last is a feature,
                 the last column is the class label.

    Returns:
        Index of the best feature to split on, or -1 if no split
        yields a positive information gain.
    """
    # number of feature columns: total columns minus the label column
    numFeatures = np.shape(dataSet)[1] - 1
    baseEntropy = calcShannonEnt(dataSet)
    # number of rows
    numDataSet = np.shape(dataSet)[0]
    bestInfoGain = 0.0
    bestFeature = -1
    # traverse every feature column
    for i in range(numFeatures):
        # collect this column's values and its distinct levels
        featureList = [example[i] for example in dataSet]
        uniqueVals = set(featureList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = np.shape(subDataSet)[0] / float(numDataSet)
            # conditional entropy: H(Y|X) = sum_i P_i * H(Y | X = x_i)
            newEntropy += prob * calcShannonEnt(subDataSet)
        # information gain: H(D) - H(D|A)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        dataSet: list of rows; feature columns followed by a class label.
        labels: feature names aligned with the feature columns.
                NOTE: this list is mutated (the chosen feature's label is
                deleted) — pass a copy if the caller needs it intact.

    Returns:
        A class label (leaf node) or a nested dict of the form
        {featureLabel: {featureValue: subtree_or_label, ...}}.
    """
    classList = [example[-1] for example in dataSet]
    # base case 1: every row has the same class -> leaf
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # base case 2: all features consumed (only the label column left)
    # -> majority-vote leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # column index of the best feature to split on
    bestNumFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatureLabel = labels[bestNumFeat]
    myTree = {bestFeatureLabel: {}}
    del labels[bestNumFeat]
    # distinct values of the chosen feature
    featValues = [example[bestNumFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # copy so sibling branches see an unmodified label list
        subLabels = labels[:]
        # e.g. {'no surfacing': {0: 'no', 1: {...}}} — a branch whose
        # subset is pure (base case 1) collapses to a single label
        myTree[bestFeatureLabel][value] = createTree(
            splitDataSet(dataSet, bestNumFeat, value), subLabels)
    return myTree
# In Python 3, dict.keys() returns a view object, not a list
key = myTree.keys()
print(type(key))
#can use list()
keylist = list(myTree.keys())
print(keylist)
#can use [x for x in dic.keys()]
[key for key in myTree.keys()]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-179-3445e9f9fb6a> in <module>()
1 myDat, labels = createDataSet()
----> 2 labels.index(key)
ValueError: 'key' is not in list
def classify(inputTree, featLabels, testVec):
    """Classify a feature vector by walking a tree built by createTree.

    Args:
        inputTree: nested dict {featureLabel: {value: subtree_or_label}}.
        featLabels: feature names, used to map the node's feature label
                    to an index into testVec.
        testVec: feature values ordered like featLabels.

    Returns:
        The predicted class label, or None when testVec's value for the
        split feature does not appear among the node's branches
        (previously this raised UnboundLocalError).
    """
    # the feature this node splits on
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # index into featLabels/testVec (both parallel lists)
    featIndex = featLabels.index(firstStr)
    classLabel = None  # bug fix: was unbound when no branch matched
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                # internal node -> recurse down the matching branch
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                # leaf -> the class label itself
                classLabel = secondDict[key]
    return classLabel
classify(myTree, labels, [1,0])
'no'
def storeTree(inputTree, filename):
    """Serialize a decision tree to *filename* with pickle.

    Args:
        inputTree: the (nested-dict) tree to store.
        filename: path of the file to write.
    """
    import pickle
    # must open in 'wb' (binary), not 'w': pickle writes bytes
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
    # no explicit close needed — the with-block closes the file
def grabTree(filename):
    """Load and return a pickled decision tree from *filename*.

    Args:
        filename: path of a file previously written by storeTree.

    Returns:
        The unpickled tree object.
    """
    import pickle
    # must open in 'rb' (binary), not 'r': pickle data is bytes
    with open(filename, 'rb') as fr:
        return pickle.load(fr)