机器学习实战-决策树笔记

matrix1 = np.array([[2,3],[1,2]])
#dic also has len()
len(matrix1)
2
np.log2(np.array([10,10]))
array([ 3.32192809,  3.32192809])
type([[1,2],[3,4]])
list
dic = {'key':2}
key = 'key'
#cannot write dic.setdefault(key,0) += 1 (a function call is not an assignment target)
dic[key] = dic.setdefault(key,0) + 1
dic
{'key': 3}
from math import log
#type(dataSet) is list
def calcShannonEnt(dataSet):
    """Return the Shannon entropy, in bits, of the class labels in dataSet.

    Args:
        dataSet: a list of rows; the last element of every row is taken as
            that row's class label.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the observed label
        frequencies, as a float. An empty dataSet yields 0.0.
    """
    # Imported locally so the function does not depend on a module-level
    # numpy import being executed before it is called.
    from collections import Counter
    from math import log2

    numEntries = len(dataSet)
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log2(prob)
    return shannonEnt
def createDataSet():
    """Build the small demo data set used throughout these notes.

    Returns:
        A ``(dataSet, labels)`` pair. Each row of dataSet holds two 0/1
        feature values followed by a 'yes'/'no' class label; labels names
        the two feature columns in order.
    """
    feature_names = ['no surfacing', 'flippers']
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return samples, feature_names
myDat, labels = createDataSet()
calcShannonEnt(myDat)
0.97095059445466858
import copy
myData = copy.deepcopy(myDat)
myData[0][-1] = 'maybe'
myDat
[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
calcShannonEnt(myData)
1.3709505944546687
def splitDataSet(dataSet, axis, value):
    """Select the rows whose column ``axis`` equals ``value``, minus that column.

    Args:
        dataSet: a list of rows, each row a list.
        axis: index of the column to test and then drop.
        value: the value a row must have in that column to be kept.

    Returns:
        A new list of new row lists; the rows of dataSet are not modified.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]
splitDataSet(myDat, 0, 1)
[[1, 'yes'], [1, 'yes'], [0, 'no']]
#np.shape can get 2-D list shape
np.shape([[1,2],[2,3]])
(2, 2)
[[1,2],[2,2]].shape()
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-163-db3819727eb2> in <module>()
----> 1 [[1,2],[2,2]].shape()


AttributeError: 'list' object has no attribute 'shape'
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature column with the largest information gain.

    Args:
        dataSet: a non-empty list of rows; the last element of each row is
            the class label and every earlier element is a feature value.

    Returns:
        The column index of the best feature, or -1 when no feature gives
        a strictly positive information gain.
    """
    # Number of feature columns: the row width minus the trailing class label.
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    # Number of rows (samples).
    numDataSet = len(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Distinct values taken by feature column i.
        uniqueVals = {example[i] for example in dataSet}
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(numDataSet)
            # Conditional entropy: H(Y|X) = sum_i P(X = x_i) * H(Y | X = x_i)
            newEntropy += prob * calcShannonEnt(subDataSet)
        # Information gain: H(D) - H(D|A)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
chooseBestFeatureToSplit(myDat)
0
def majorityCnt(classList):
    """Return the class label that occurs most often in classList.

    When several labels share the highest count, the one that appears
    first in classList is returned. An empty classList raises IndexError.
    """
    tally = {}
    for vote in classList:
        tally[vote] = tally.get(vote, 0) + 1
    # sorted() is stable, so equally frequent labels keep first-seen order.
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    winner = ranked[0]
    return winner[0]
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    Args:
        dataSet: a non-empty list of rows; the last element of each row is
            the class label and every earlier element is a feature value.
        labels: feature names, one per feature column, in column order.
            This list is not modified.

    Returns:
        Either a class label (a leaf) or a dict of the form
        ``{featureName: {featureValue: subtree, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # Every sample has the same class: this branch is a pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left, so no feature remains to split on:
    # fall back to the majority class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # Column index of the feature chosen for this split.
    bestNumFeat = chooseBestFeatureToSplit(dataSet)
    # -1 means no feature gives positive information gain. Using it as an
    # index would split on the class column, so stop with the majority class.
    if bestNumFeat == -1:
        return majorityCnt(classList)
    bestFeatureLabel = labels[bestNumFeat]
    myTree = {bestFeatureLabel: {}}
    # Build the reduced label list as a new object so the caller's list
    # stays usable afterwards (e.g. for classify).
    subLabels = labels[:bestNumFeat] + labels[bestNumFeat + 1:]
    uniqueVals = {example[bestNumFeat] for example in dataSet}
    for value in uniqueVals:
        # One subtree per observed value of the chosen feature, e.g.
        # {'no surfacing': {0: createTree(...), 1: createTree(...)}}
        myTree[bestFeatureLabel][value] = createTree(
            splitDataSet(dataSet, bestNumFeat, value), subLabels)
    return myTree
myDat, labels = createDataSet()
print(myDat)
print(labels)
myTree = createTree(myDat, labels)
print(myTree)
[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
['no surfacing', 'flippers']
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
#in Python 3, dict.keys() returns a view object (dict_keys), not a list
key = myTree.keys()
print(type(key))
#can use list()
keylist = list(myTree.keys())
print(keylist)
#can use [x for x in dic.keys()]
[key for key in myTree.keys()]
<class 'dict_keys'>
['no surfacing']





['no surfacing']
myDat, labels = createDataSet()
labels.index(key)
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-179-3445e9f9fb6a> in <module>()
      1 myDat, labels = createDataSet()
----> 2 labels.index(key)


ValueError: 'key' is not in list
def classify(inputTree, featLabels, testVec):
    """Classify testVec by walking a nested-dict decision tree.

    Args:
        inputTree: a tree of the form ``{featureName: {value: subtree}}``
            where a subtree is either another such dict or a class label.
        featLabels: feature names; the position of a name in this list is
            the position of that feature's value in testVec.
        testVec: the feature values of the sample to classify.

    Returns:
        The class label stored at the leaf reached by testVec.

    Raises:
        ValueError: if the tree's feature name is not in featLabels, or if
            the tree has no branch for the sample's value of that feature.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # Position of this node's feature in both featLabels and testVec.
    featIndex = featLabels.index(firstStr)
    featureValue = testVec[featIndex]
    if featureValue not in secondDict:
        # Fail with a clear message instead of returning an unbound local.
        raise ValueError(
            'no branch for value %r of feature %r' % (featureValue, firstStr))
    branch = secondDict[featureValue]
    # A dict is an inner node, so keep descending; anything else is a leaf.
    if isinstance(branch, dict):
        return classify(branch, featLabels, testVec)
    return branch
classify(myTree, labels, [1,0])
'no'
#store myTree
def storeTree(inputTree, filename):
    """Serialize inputTree to the file at filename using pickle.

    Args:
        inputTree: the object to store (here, a nested-dict decision tree).
        filename: path of the file to write; it is created or overwritten.
    """
    import pickle
    # pickle writes bytes, so the file must be opened as 'wb', not 'w'.
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)

def grabTree(filename):
    """Load and return the object pickled in the file at filename.

    NOTE(review): unpickling can execute arbitrary code, so only open
    files that come from a trusted source.
    """
    import pickle
    # pickle reads bytes, so the file must be opened as 'rb', not 'r'.
    with open(filename, 'rb') as source:
        tree = pickle.load(source)
    return tree
storeTree(myTree, 'classifierStorage.txt')
grabTree('classifierStorage.txt')
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
import numpy as np
import pandas as pd
filedir = '/Users/apple/Documents/test.txt'
df = pd.read_csv(filedir, sep=' ',header = None, encoding = 'utf-8')
df
012
0123211.0
112NaN
249NaN
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值