朴素贝叶斯分类器及Python手写实现（详细）

bo_hai

已于 2024-11-08 14:40:31 修改

阅读量345

点赞数 3

文章标签： python 机器学习

于 2024-11-08 11:21:17 首次发布

本文链接：https://blog.youkuaiyun.com/bo_hai/article/details/143619054

版权

数据集：西瓜数据集 3.0（来自周志华《机器学习》（“西瓜书”）第 84 页）

# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import math

# 参考：https://blog.youkuaiyun.com/weixin_66845445/article/details/138135601
class Bayes():
    """Hand-written naive Bayes scorer for the embedded watermelon data set.

    Each training row holds six discrete attributes, two continuous
    attributes (density and sugar content) and a class label in the last
    column. Discrete likelihoods are plain relative frequencies (no
    smoothing is applied, so an unseen attribute value yields a score of 0).
    Continuous likelihoods use a normal density whose mean and standard
    deviation are estimated per class.

    Intermediate probabilities, means and standard deviations are rounded
    to 4 decimals, exactly as the original implementation did, so the
    published example output is reproduced.
    """

    # Features modelled with a Gaussian density; every other feature is
    # treated as discrete.
    _CONTINUOUS_FEATURES = ("密度", "含糖率")

    def __init__(self):
        """Load the training rows, the default sample and the class counts."""
        data_set = [
            ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460, '好瓜'],
            ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.774, 0.376, '好瓜'],
            ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.634, 0.264, '好瓜'],
            ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.608, 0.318, '好瓜'],
            ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.556, 0.215, '好瓜'],
            ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.403, 0.237, '好瓜'],
            ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', 0.481, 0.149, '好瓜'],
            ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', 0.437, 0.211, '好瓜'],
            ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', 0.666, 0.091, '坏瓜'],
            ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', 0.243, 0.267, '坏瓜'],
            ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', 0.245, 0.057, '坏瓜'],
            ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', 0.343, 0.099, '坏瓜'],
            ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', 0.639, 0.161, '坏瓜'],
            ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', 0.657, 0.198, '坏瓜'],
            ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.360, 0.370, '坏瓜'],
            ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', 0.593, 0.042, '坏瓜'],
            ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', 0.719, 0.103, '坏瓜'],
        ]
        self.dataSet = data_set

        # Default sample, used by P() when no explicit sample is given.
        self.testSet = ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460]
        # Column names, in the same order as the columns of each row.
        self.feature_names = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖率']
        # Reverse-sorted so that '好瓜' is index 0 and '坏瓜' is index 1.
        self.label_names = sorted({row[-1] for row in data_set}, reverse=True)

        self.countAll = len(data_set)
        self.countG = sum(1 for row in data_set if row[-1] == self.label_names[0])
        self.countB = sum(1 for row in data_set if row[-1] == self.label_names[1])

    def mean_std(self, feature_name, label_name):
        """Return (mean, std) of a continuous feature within one class.

        Args:
            feature_name: Name of the column, as listed in ``feature_names``.
            label_name: Class label whose rows are used.

        Returns:
            Tuple ``(mean, std)``, each rounded to 4 decimals. The standard
            deviation is ``np.std`` with its default arguments.
        """
        column = self.feature_names.index(feature_name)
        values = [row[column] for row in self.dataSet if row[-1] == label_name]
        return round(np.mean(values), 4), round(np.std(values), 4)

    def P(self, index, label_name, sample=None):
        """Return the conditional probability P(x_index | label_name).

        Args:
            index: Column index of the discrete attribute.
            label_name: Class label to condition on.
            sample: Sample whose attribute value is looked up. Defaults to
                ``self.testSet`` so existing two-argument calls still work.

        Returns:
            Relative frequency of ``sample[index]`` among the rows of the
            given class, rounded to 4 decimals.
        """
        if sample is None:
            sample = self.testSet
        value = sample[index]
        matches = sum(
            1 for row in self.dataSet
            if row[-1] == label_name and row[index] == value
        )
        class_count = self.countG if label_name == self.label_names[0] else self.countB
        return round(matches / class_count, 4)

    @staticmethod
    def _gaussian(x, mean, std):
        """Return the normal probability density N(mean, std**2) at x."""
        return (1 / (math.sqrt(2 * np.pi) * std)) * np.exp(-1 * (((x - mean) ** 2) / (2 * (std ** 2))))

    def _score(self, sample, label_name, class_count):
        """Return prior times all per-feature likelihoods for one class."""
        # Prior P(c).
        score = round(class_count / self.countAll, 4)
        # Multiply feature by feature in column order.
        for index, feature_name in enumerate(self.feature_names):
            if feature_name in self._CONTINUOUS_FEATURES:
                mean, std = self.mean_std(feature_name, label_name)
                score = score * self._gaussian(sample[index], mean, std)
            else:
                score = score * self.P(index, label_name, sample)
        return score

    def fit(self, testSet):
        """Score one sample against both classes.

        Despite its name this method does no training; it evaluates the
        naive Bayes product for the given sample.

        Args:
            testSet: Attribute values of the sample, ordered like
                ``feature_names``.

        Returns:
            Tuple ``(isGood, isBad)``: the unnormalised scores
            P(c) * prod_i P(x_i | c) for ``label_names[0]`` and
            ``label_names[1]`` respectively.
        """
        # The discrete likelihoods must come from the sample passed in,
        # not from the default sample stored on the instance.
        isGood = self._score(testSet, self.label_names[0], self.countG)
        isBad = self._score(testSet, self.label_names[1], self.countB)
        return isGood, isBad


if __name__ == '__main__':
    # Demo: score one sample and report which class gets the larger score.
    testSet = ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460]  # sample to classify
    by = Bayes()
    # One-row DataFrame, used only to pretty-print the sample.
    test_df = pd.DataFrame([testSet], columns=by.feature_names, index=None)
    print("=======================待测样本========================")
    print(f"待测集:\n{test_df}")
    isGood, isBad = by.fit(testSet)
    print("=======================后验概率========================")
    print("后验概率:")
    print(f"P(好瓜|xi) = {isGood}")
    # The second score belongs to the bad-melon class; the original label
    # repeated "好瓜" here.
    print(f"P(坏瓜|xi) = {isBad}")
    print("=======================预测结果========================")
    print("predict ---> 好瓜" if (isGood > isBad) else "predict ---> 坏瓜")

输出结果：

=======================待测样本========================
待测集:
   色泽  根蒂  敲声  纹理  脐部  触感     密度   含糖率
0  青绿  蜷缩  浊响  清晰  凹陷  硬滑  0.697  0.46
=======================后验概率========================
后验概率:
P(好瓜|xi) = 0.04461081360529604
P(坏瓜|xi) = 4.350445841831158e-05
=======================预测结果========================
predict ---> 好瓜