机器学习实战-决策树笔记

最新推荐文章于 2022-11-23 15:53:11 发布

eclipSYcn

最新推荐文章于 2022-11-23 15:53:11 发布

阅读量523

点赞数

CC 4.0 BY-SA版权

分类专栏： python 机器学习文章标签：机器学习 python 决策树

本文链接：https://blog.youkuaiyun.com/Eclipsesy/article/details/77507901

python 同时被 2 个专栏收录

21 篇文章

订阅专栏

机器学习

5 篇文章

订阅专栏

matrix1 = np.array([[2,3],[1,2]])

#dic also has len()
len(matrix1)

np.log2(np.array([10,10]))

array([ 3.32192809,  3.32192809])

type([[1,2],[3,4]])

list

# counting idiom: setdefault returns the stored count, or the default 0 when
# the key is absent
dic = {'key':2}
key = 'key'
# "dic.setdefault(key,0) += 1" is not valid (a call result cannot be the
# target of +=), so read the value and assign the incremented result back
dic[key] = dic.setdefault(key,0) + 1
dic

{'key': 3}

from math import log
#type(dataSet) is list
def calcShannonEnt(dataSet):
    """Return the Shannon entropy, in bits, of the class labels in dataSet.

    dataSet is a sequence of rows; the last element of every row is used as
    that row's class label. The entropy is -sum(p * log2(p)) over the
    relative frequency p of each distinct label. An empty dataSet gives 0.0.
    """
    # Imported locally so the function works no matter which module-level
    # imports have already been executed.
    from collections import Counter
    from math import log2

    numEntries = len(dataSet)
    # label -> number of rows carrying that label
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log2(prob)
    return shannonEnt

def createDataSet():
    """Build the five-row toy data set and the names of its two features.

    Each row holds two 0/1 feature values followed by a 'yes'/'no' class
    label. Returns (dataSet, labels); both are freshly created lists.
    """
    feature_names = ['no surfacing', 'flippers']
    raw_rows = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (1, 0, 'no'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    # rows must be lists so that callers can slice and extend them
    samples = [list(row) for row in raw_rows]
    return samples, feature_names

# build the toy data set and its feature names
myDat, labels = createDataSet()

# entropy of the class labels (last column) of the toy data set
calcShannonEnt(myDat)

0.97095059445466858

import copy
# deep-copy so that editing a row of myData leaves the rows of myDat untouched
myData = copy.deepcopy(myDat)
# introduce a third class label in the copy only
myData[0][-1] = 'maybe'
# display the original to confirm it still holds only 'yes'/'no' labels
myDat

[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]

calcShannonEnt(myData)

1.3709505944546687

def splitDataSet(dataSet, axis, value):
    """Select the rows whose column `axis` equals `value`, dropping that column.

    Returns a new list of new rows; the rows of dataSet are not modified.
    """
    matches = []
    for row in dataSet:
        if row[axis] != value:
            continue
        # everything before the split column, then everything after it
        reduced = row[:axis]
        reduced.extend(row[axis + 1:])
        matches.append(reduced)
    return matches

splitDataSet(myDat, 0, 1)

[[1, 'yes'], [1, 'yes'], [0, 'no']]

# np.shape also works on a plain nested (2-D) list
np.shape([[1,2],[2,3]])

(2, 2)

[[1,2],[2,2]].shape()

---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-163-db3819727eb2> in <module>()
----> 1 [[1,2],[2,2]].shape()


AttributeError: 'list' object has no attribute 'shape'

def chooseBestFeatureToSplit(dataSet):
    """Return the column index of the feature with the largest information gain.

    dataSet is a list of rows whose last element is the class label. For
    every feature column the conditional entropy H(D|A) is computed from the
    subsets produced by splitDataSet and subtracted from the base entropy
    H(D). The first feature with the strictly largest gain wins; -1 is
    returned when no feature has a gain greater than zero.
    """
    # number of columns, minus the trailing class-label column
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    # number of rows
    numDataSet = len(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    # iterate over the feature columns
    for i in range(numFeatures):
        # distinct values taken by feature i
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(numDataSet)
            # conditional entropy:
            # H(Y|X) = sum over i of P(X = xi) * H(Y | X = xi)
            newEntropy += prob * calcShannonEnt(subDataSet)
        # information gain: H(D) - H(D|A)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

chooseBestFeatureToSplit(myDat)

def majorityCnt(classList):
    """Return the label that occurs most often in classList.

    When several labels share the top count, the one seen first wins.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # stable descending sort on the count keeps first-seen order among ties
    ranked = list(tally.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    winner = ranked[0]
    return winner[0]

def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    dataSet is a list of rows whose last element is the class label; labels
    holds the feature name for each remaining column. The result is either a
    class label (leaf) or a dict of the form
    {feature_name: {feature_value: subtree_or_label, ...}}.

    The labels list passed in is not modified.
    """
    classList = [example[-1] for example in dataSet]
    # every sample has the same class -> leaf
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # only the label column is left -> fall back to the majority class
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # column index of the most informative feature
    bestNumFeat = chooseBestFeatureToSplit(dataSet)
    # -1 means no feature gave a positive information gain; using it as an
    # index would split on the class-label column, so return a leaf instead
    if bestNumFeat == -1:
        return majorityCnt(classList)
    bestFeatureLabel = labels[bestNumFeat]
    myTree = {bestFeatureLabel: {}}
    # feature names for the sub-problems, built as a new list so that the
    # caller's labels stay intact
    subLabels = labels[:bestNumFeat] + labels[bestNumFeat + 1:]
    uniqueVals = set(example[bestNumFeat] for example in dataSet)
    for value in uniqueVals:
        # e.g. {'no surfacing': {0: createTree(...), 1: createTree(...)}}
        myTree[bestFeatureLabel][value] = createTree(
            splitDataSet(dataSet, bestNumFeat, value), subLabels)
    return myTree

# build a fresh data set and label list, show both, then grow and print the tree
myDat, labels = createDataSet()
print(myDat)
print(labels)
myTree = createTree(myDat, labels)
print(myTree)

[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
['no surfacing', 'flippers']
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

# in Python 3, dict.keys() returns a dict_keys view object, not a list
key = myTree.keys()
print(type(key))
# wrap it in list() to get a real list
keylist = list(myTree.keys())
print(keylist)
# a list comprehension over dic.keys() gives a list as well
[key for key in myTree.keys()]

<class 'dict_keys'>
['no surfacing']





['no surfacing']

myDat, labels = createDataSet()
labels.index(key)

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-179-3445e9f9fb6a> in <module>()
      1 myDat, labels = createDataSet()
----> 2 labels.index(key)


ValueError: 'key' is not in list

def classify(inputTree, featLabels, testVec):
    """Classify testVec by walking the nested-dict decision tree inputTree.

    inputTree has the form {feature_name: {feature_value: subtree_or_label}}.
    featLabels lists the feature names in the same order as the values in
    testVec, so the name stored in the tree can be mapped to a position.

    Returns the class label stored at the leaf that is reached.
    Raises ValueError if a feature name of the tree is missing from
    featLabels, or if testVec holds a value the tree has no branch for.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # position of this node's feature inside featLabels / testVec
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]
    if featValue not in secondDict:
        # without this check the function would fail with an unrelated
        # "local variable referenced before assignment" error
        raise ValueError('no branch for value %r of feature %r'
                         % (featValue, firstStr))
    branch = secondDict[featValue]
    # a dict is an inner node: keep descending; anything else is a leaf label
    if isinstance(branch, dict):
        return classify(branch, featLabels, testVec)
    return branch

classify(myTree, labels, [1,0])

'no'

# store myTree on disk
def storeTree(inputTree, filename):
    """Serialize inputTree with pickle and write it to the file `filename`.

    An existing file of that name is overwritten.
    """
    import pickle
    # pickle writes bytes, so the file must be opened as 'wb', not 'w';
    # the with-statement closes the file, so no explicit close() is needed
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)

def grabTree(filename):
    """Load and return the object that was pickled into the file `filename`."""
    import pickle
    # pickle data is binary: the file has to be opened as 'rb', not 'r'
    with open(filename, mode='rb') as source:
        restored = pickle.load(source)
    return restored

# round-trip the tree through pickle: write it to disk, then read it back
storeTree(myTree, 'classifierStorage.txt')
grabTree('classifierStorage.txt')

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

import numpy as np

import pandas as pd

# read a space-separated text file that has no header row; with header=None
# the columns of the resulting DataFrame are numbered 0, 1, 2, ...
filedir = '/Users/apple/Documents/test.txt'
df = pd.read_csv(filedir, sep=' ',header = None, encoding = 'utf-8')

df

	0	1	2
0	123	21	1.0
1	1	2	NaN
2	4	9	NaN