Topic: Naive Bayes
Category: personal machine learning notes (the mathematical derivation is in the uploaded handwritten PDF)
Book references: 《机器学习实战》, 《统计学习》, 《机器学习》
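Quick recap of the rule implemented below (the full derivation is in the handwritten PDF): naive Bayes assumes the attributes are conditionally independent given the class, so for a test sample x = (x_1, ..., x_d) it picks the class with the largest prior-times-likelihood product,

$$ h(x) = \arg\max_{c} \; P(c) \prod_{i=1}^{d} P(x_i \mid c). $$

The Plist values returned by the functions below are exactly these unnormalized products, one entry per class.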
import numpy as np
import pandas as pd
import math
Create the dataset:
def createDataXG20():
data = np.array([['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
, ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑']
, ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
, ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑']
, ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
, ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘']
, ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘']
, ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑']
, ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑']
, ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘']
, ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑']
, ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘']
, ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑']
, ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑']
, ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘']
, ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑']
, ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑']])
label = np.array(['是', '是', '是', '是', '是', '是', '是', '是', '否', '否', '否', '否', '否', '否', '否', '否', '否'])
feature = np.array(['色泽', '根蒂', '敲声', '纹理', '脐部', '触感'])
return data,label,feature
testdata = ['青绿','蜷缩','浊响','清晰','凹陷','硬滑']
data,label,feature = createDataXG20()
len_feature = len(feature); len_data = len(label)
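The class-conditional counts in the function below rely on NumPy boolean masking; a quick sanity check on the data just created (the commented values are what the dataset above actually contains):

good = data[label == '是']        # sub-array of the rows labelled 是 (good melon)
print(good.shape)                 # (8, 6): 8 good melons, 6 discrete attributes
print((good == '青绿').sum())      # 3: number of good melons with 色泽 = 青绿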
# Naive Bayes function for discrete attributes
def discreteFunction(data, label, features, testdata):
    classification = np.unique(label).tolist()
    Plist = list(0 for i in range(len(classification)))  # pre-allocate one slot per class
    for i in classification:
        Pclassification = len(label[label == i]) / len(label)  # class prior P(c)
        multi = 1
        temp_data = data[label == i]  # rows belonging to class i
        for j in testdata:
            # P(feature value j | class i): frequency of j among the class-i rows.
            # Matching against the whole 2-D subset works here because every
            # attribute value appears in only one column of this dataset.
            c = len(temp_data[temp_data == j]) / len(temp_data)
            multi *= c
        multi *= Pclassification
        ind = classification.index(i)
        Plist[ind] = multi
    return Plist, classification
discreteFunction(data,label,feature,testdata)
([0.0008607009333225744, 0.03392836626838235], ['否', '是'])
The (unnormalized) probability for '是' (good melon) is far larger than for '否', so the test sample is predicted to be a good melon.
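To turn the returned list into a label automatically, one can take the argmax over Plist; a minimal sketch (the helper name predictNB is mine, not from the original notes):

def predictNB(data, label, features, testdata):
    # pick the class whose unnormalized posterior product is largest
    Plist, classes = discreteFunction(data, label, features, testdata)
    return classes[int(np.argmax(Plist))]

print(predictNB(data, label, feature, testdata))  # -> '是'

One caveat: an attribute value that never co-occurs with a class makes its count 0 and zeroes out the whole product; the standard remedy (Laplace smoothing) is sketched at the end of these notes.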
def createDataXG():
data = np.array([['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑',0.697,0.460],
['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑',0.774,0.376],
['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑',0.634,0.264],
['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑',0.608,0.318],
['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑',0.556,0.215],
['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘',0.403,0.237],
['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘',0.481,0.149],
['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑',0.437,0.211],
['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑',0.666,0.091],
['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘',0.243,0.267],
['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑',0.245,0.057],
['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘',0.343,0.099],
['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑',0.639,0.161],
['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑',0.657,0.198],
['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘',0.360,0.370],
['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑',0.593,0.042],
['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑',0.719,0.103]])
label = np.array(['好瓜','好瓜','好瓜','好瓜','好瓜','好瓜','好瓜','好瓜','坏瓜','坏瓜','坏瓜','坏瓜','坏瓜','坏瓜','坏瓜','坏瓜','坏瓜'])
feature = np.array(['色泽', '根蒂', '敲声', '纹理', '脐部', '触感','密度', '含糖量'])
return data,label,feature
testdata = ['青绿','蜷缩','浊响','清晰','凹陷','硬滑',0.697,0.460]
data,label,feature = createDataXG()
len_feature = len(feature); len_data = len(label)
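For the continuous columns the likelihood is modeled with a class-conditional Gaussian, so the only statistics needed are the per-class mean and standard deviation. Because np.array stores this mixed table as strings, the numeric columns have to be converted back to float. For example, for 密度 among the good melons (note that np.std defaults to the population standard deviation, ddof=0, which is what the function below also uses):

density_good = data[label == '好瓜'][:, 6].astype(float)  # 密度 column of the good melons
print(density_good.mean())  # ≈ 0.574
print(density_good.std())   # population std (ddof=0); pass ddof=1 for the sample std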
def NFunction(x, miu, sigma):
    # Gaussian density N(mu, sigma^2), used as the likelihood for continuous attributes
    return (1 / math.sqrt(2 * math.pi) / sigma) * math.exp(-(x - miu) ** 2 / (2 * sigma ** 2))
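In formula form, what NFunction computes is the class-conditional density

$$ p(x_i \mid c) = \frac{1}{\sqrt{2\pi}\,\sigma_{c,i}} \exp\!\left(-\frac{(x_i - \mu_{c,i})^2}{2\sigma_{c,i}^2}\right), $$

with \mu_{c,i} and \sigma_{c,i} estimated from the class-c rows of column i.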
def continuousFunction(data, label, features, testdata):
    classification = np.unique(label).tolist()
    Plist = list(0 for i in range(len(classification)))  # pre-allocate one slot per class
    for i in classification:
        Pclassification = len(label[label == i]) / len(label)  # class prior P(c)
        multi = 1
        temp_data = data[label == i]  # rows belonging to class i
        for j in testdata:
            if type(j) == str:
                # discrete attribute: same frequency estimate as in discreteFunction
                c = len(temp_data[temp_data == j]) / len(temp_data)
                multi *= c
            else:
                # continuous attribute: Gaussian likelihood with class-conditional mean/std
                jindex = testdata.index(j)
                jdata = np.array([float(k) for k in temp_data[:, jindex]])
                miu = np.mean(jdata)
                sigma = np.std(jdata)
                Np = NFunction(j, miu, sigma)
                multi *= Np
        multi *= Pclassification
        ind = classification.index(i)
        Plist[ind] = multi
    return Plist, classification
continuousFunction(data,label,feature,testdata)
([4.365876684002464e-05, 0.044552310283477066], ['坏瓜', '好瓜'])
Again the value for '好瓜' dominates by several orders of magnitude, so the prediction is: good melon!
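Two standard refinements are not shown above: (1) with raw frequency estimates, an attribute value that never co-occurs with a class drives the whole product to zero, which Laplace (add-one) smoothing avoids; (2) multiplying many small probabilities risks floating-point underflow, so summing log-probabilities is common practice. A minimal sketch of both ideas for the discrete columns (the helper name and its details are my own, not from the notes above):

def discreteFunctionLog(data, label, features, testdata):
    # Laplace-smoothed, log-space variant of discreteFunction; only the first
    # len(testdata) columns of data are used, so it also works on the discrete
    # columns of the continuous dataset.
    classification = np.unique(label).tolist()
    logPlist = []
    for i in classification:
        temp_data = data[label == i]  # rows belonging to class i
        # smoothed log-prior: (N_c + 1) / (N + number of classes)
        logp = math.log((len(temp_data) + 1) / (len(label) + len(classification)))
        for col, j in enumerate(testdata):
            n_values = len(np.unique(data[:, col]))   # possible values of this attribute
            count = np.sum(temp_data[:, col] == j)    # matches in the corresponding column
            logp += math.log((count + 1) / (len(temp_data) + n_values))  # smoothed likelihood
        logPlist.append(logp)
    return logPlist, classification

logP, classes = discreteFunctionLog(data, label, feature, ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'])
print(classes[int(np.argmax(logP))])  # -> '好瓜', consistent with the unsmoothed result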