数据集:西瓜集3.0(来自西瓜书《机器学习 周志华》84页)
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import math
# 参考:https://blog.csdn.net/weixin_66845445/article/details/138135601
class Bayes():
    """Naive Bayes scorer over the built-in watermelon 3.0 data set.

    The first six features are treated as discrete (relative-frequency
    estimates, no smoothing); the last two, "密度" and "含糖率", are treated
    as continuous and modelled with a per-class Gaussian.
    """

    # Number of leading columns that hold discrete attributes.
    _DISCRETE_COUNT = 6
    # Names of the continuous attributes, in the order they are multiplied in.
    _CONTINUOUS_FEATURES = ("密度", "含糖率")

    def __init__(self):
        """Load the training rows, the default sample and the class counts."""
        # Each row: six discrete attributes, density, sugar content, label.
        self.dataSet = dataSet = [
            ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460, '好瓜'],
            ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.774, 0.376, '好瓜'],
            ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.634, 0.264, '好瓜'],
            ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.608, 0.318, '好瓜'],
            ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.556, 0.215, '好瓜'],
            ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.403, 0.237, '好瓜'],
            ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', 0.481, 0.149, '好瓜'],
            ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', 0.437, 0.211, '好瓜'],
            ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', 0.666, 0.091, '坏瓜'],
            ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', 0.243, 0.267, '坏瓜'],
            ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', 0.245, 0.057, '坏瓜'],
            ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', 0.343, 0.099, '坏瓜'],
            ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', 0.639, 0.161, '坏瓜'],
            ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', 0.657, 0.198, '坏瓜'],
            ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.360, 0.370, '坏瓜'],
            ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', 0.593, 0.042, '坏瓜'],
            ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', 0.719, 0.103, '坏瓜'],
        ]
        # Default sample, used by P() when no sample is passed explicitly.
        self.testSet = ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460]
        self.feature_names = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖率']
        # Descending sort puts '好瓜' at index 0 and '坏瓜' at index 1.
        self.label_names = sorted({item[-1] for item in dataSet}, reverse=True)
        self.countAll = len(dataSet)
        self.countG = sum(1 for item in dataSet if item[-1] == self.label_names[0])
        self.countB = sum(1 for item in dataSet if item[-1] == self.label_names[1])

    def mean_std(self, feature_name, label_name):
        """Return (mean, std) of a continuous feature within one class.

        Both values are rounded to 4 decimals.
        NOTE: np.std defaults to the population standard deviation (ddof=0).
        """
        column = self.feature_names.index(feature_name)
        data = [item[column] for item in self.dataSet if item[-1] == label_name]
        mean = round(np.mean(data), 4)
        std = round(np.std(data), 4)
        return mean, std

    def P(self, index, label_name, testSet=None):
        """Return the conditional probability P(x_index | label_name).

        Args:
            index: column index of the discrete attribute.
            label_name: class label to condition on.
            testSet: sample whose attribute value is looked up; falls back
                to ``self.testSet`` when omitted, so existing two-argument
                calls keep working.

        Returns:
            Relative frequency of the sample's value among the rows of the
            given class, rounded to 4 decimals.
        """
        sample = self.testSet if testSet is None else testSet
        matches = sum(
            1
            for item in self.dataSet
            if item[-1] == label_name and item[index] == sample[index]
        )
        class_total = self.countG if label_name == self.label_names[0] else self.countB
        return round(matches / class_total, 4)

    @staticmethod
    def _gaussian(x, mean, std):
        """Evaluate the normal density N(mean, std**2) at x."""
        return (1 / (math.sqrt(2 * np.pi) * std)) * np.exp(
            -1 * (((x - mean) ** 2) / (2 * (std ** 2)))
        )

    def _score(self, testSet, label_name, class_count):
        """Multiply prior, discrete and continuous likelihoods for one class."""
        # Prior P(c), rounded to 4 decimals.
        score = round(class_count / self.countAll, 4)
        # Discrete attributes are looked up on the sample that was passed in,
        # not on the built-in default sample.
        for index in range(self._DISCRETE_COUNT):
            score = score * self.P(index, label_name, testSet)
        for feature_name in self._CONTINUOUS_FEATURES:
            mean, std = self.mean_std(feature_name, label_name)
            value = testSet[self.feature_names.index(feature_name)]
            score = score * self._gaussian(value, mean, std)
        return score

    def fit(self, testSet):
        """Score a sample against both classes.

        Args:
            testSet: feature values in ``feature_names`` order.

        Returns:
            ``(isGood, isBad)``: the unnormalised products
            P(c) * prod P(x_i | c) for ``label_names[0]`` and
            ``label_names[1]`` respectively.
        """
        isGood = self._score(testSet, self.label_names[0], self.countG)
        isBad = self._score(testSet, self.label_names[1], self.countB)
        return isGood, isBad
if __name__ == '__main__':
    # Demo: score one sample against both classes and print the prediction.
    testSet = ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460]  # sample to classify
    by = Bayes()
    # One-row frame, used only to pretty-print the sample with its column names.
    test_df = pd.DataFrame([testSet], columns=by.feature_names, index=None)
    print("=======================待测样本========================")
    print(f"待测集:\n{test_df}")
    isGood, isBad = by.fit(testSet)
    print("=======================后验概率========================")
    print("后验概率:")
    print(f"P(好瓜|xi) = {isGood}")
    # Fixed label: this line reports the bad-melon score, not the good-melon one.
    print(f"P(坏瓜|xi) = {isBad}")
    print("=======================预测结果========================")
    # The class with the larger score wins; a tie falls through to '坏瓜'.
    print("predict ---> 好瓜" if (isGood > isBad) else "predict ---> 坏瓜")
输出结果:
=======================待测样本========================
待测集:
色泽 根蒂 敲声 纹理 脐部 触感 密度 含糖率
0 青绿 蜷缩 浊响 清晰 凹陷 硬滑 0.697 0.46
=======================后验概率========================
后验概率:
P(好瓜|xi) = 0.04461081360529604
P(好瓜|xi) = 4.350445841831158e-05
=======================预测结果========================
predict ---> 好瓜
参考:https://blog.csdn.net/weixin_66845445/article/details/138135601