Naive Bayes: Principle and Code

1. Introduction

Naive Bayes is a classification method based on Bayes' theorem together with the assumption of conditional independence between features. Given a training set, it first learns the joint distribution of inputs and outputs under this independence assumption; then, for a new input x, it applies Bayes' theorem to find the output y with the largest posterior probability. Unlike many other classifiers, naive Bayes is built directly on probability theory. Its principle and implementation are both simple, and learning and prediction are efficient, which makes it a classic and widely used classification algorithm. The word "naive" refers to the strong independence assumption placed on the features of the dataset: correlations between features are not modelled.

2. Principle

Let the sample dataset be D = {d1, d2, ..., dn}, the corresponding feature set be X = {x1, x2, ..., xd}, and the class variable be Y = {y1, y2, ..., ym}, i.e. D can be divided into m classes. Assume that x1, x2, ..., xd are mutually independent. The prior probability of Y is Pprior = P(Y) and its posterior is Ppost = P(Y|X). By Bayes' theorem, the posterior can be computed from the prior P(Y), the evidence P(X) and the class-conditional probability P(X|Y):
P(Y|X) = P(Y) P(X|Y) / P(X)

Because naive Bayes assumes the features are mutually independent given the class y, the class-conditional probability factorizes as:
P(X|Y=y) = P(x1|Y=y) P(x2|Y=y) ... P(xd|Y=y)

Combining the two equations above, the posterior probability becomes:
P(Y|X) = P(Y) P(x1|Y) P(x2|Y) ... P(xd|Y) / P(X)

Since P(X) is the same for every class, only the numerator matters when comparing posteriors.
A sample is therefore assigned to the class yi that maximizes the following naive Bayes score:
y = argmax over yi of P(yi) P(x1|yi) P(x2|yi) ... P(xd|yi)
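
This decision rule translates directly into code. Below is a minimal sketch of it, independent of the Titanic implementation later in this post; the function names and the toy data at the bottom are made up for illustration, and no smoothing is applied.

from collections import Counter, defaultdict

def nb_train(samples, labels):
    # estimate the prior P(y) and the conditional frequencies P(x_j = v | y) by counting
    n = len(labels)
    label_counts = Counter(labels)
    prior = {y: c / n for y, c in label_counts.items()}
    counts = defaultdict(Counter)   # counts[(j, y)][v] = number of samples with x_j = v and label y
    for x, y in zip(samples, labels):
        for j, v in enumerate(x):
            counts[(j, y)][v] += 1
    cond = {(j, y): {v: c / label_counts[y] for v, c in ctr.items()}
            for (j, y), ctr in counts.items()}
    return prior, cond

def nb_predict(x, prior, cond):
    # choose the class with the largest P(y) * prod_j P(x_j | y); no smoothing, as in the code below
    best, best_score = None, -1.0
    for y, py in prior.items():
        score = py
        for j, v in enumerate(x):
            score *= cond.get((j, y), {}).get(v, 0.0)
        if score > best_score:
            best, best_score = y, score
    return best

# toy usage with made-up samples: two categorical features, binary label
X = [("green", "clear"), ("green", "blurry"), ("black", "clear"), ("white", "blurry")]
Y = [1, 1, 1, 0]
prior, cond = nb_train(X, Y)
print(nb_predict(("green", "clear"), prior, cond))   # prints 1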

3. Worked Example

The abstract formulas above are easier to follow with a concrete case, so we walk through the classic watermelon example.
Goal: classify the following sample.
[figure: the sample to be classified]

Calculation:
[figures: the priors of the two labels and the conditional probabilities of each feature value, computed on the training set]

First, compute the prior probabilities of labels 0 and 1 from the training set, then compute the conditional probability of each feature value given label 0 and given label 1.
To classify a new sample, multiply the prior of label 1 by the product of the conditional probabilities of the sample's feature values given label 1, and call the result P(1).
Likewise, multiply the prior of label 0 by the product of the conditional probabilities given label 0, and call the result P(0).
Whichever of P(1) and P(0) is larger determines the predicted label, as in the small numeric sketch below.
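
For concreteness, here is the comparison with made-up numbers (placeholders, not the values from the figures above): suppose the prior of label 1 is 0.4 with conditionals 0.5, 0.6 and 0.7 for the sample's three feature values, while the prior of label 0 is 0.6 with conditionals 0.3, 0.4 and 0.2.

p1 = 0.4 * 0.5 * 0.6 * 0.7          # P(1) = prior(1) * product of conditionals given label 1 = 0.084
p0 = 0.6 * 0.3 * 0.4 * 0.2          # P(0) = prior(0) * product of conditionals given label 0 = 0.0144
prediction = 1 if p1 >= p0 else 0   # P(1) > P(0), so the sample is predicted as label 1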

4. Code

import csv
import numpy as np
import matplotlib.pyplot as plt

# 0  PassengerId: passenger ID
# 1  Survived: whether the passenger survived (0 = no, 1 = yes)
# 2  Pclass: ticket class (1, 2 or 3)
# 3  Name: passenger name
# 4  Sex: sex
# 5  Age: age
# 6  SibSp: number of siblings/spouses aboard
# 7  Parch: number of parents/children aboard
# 8  Ticket: ticket number
# 9  Fare: ticket fare
# 10 Cabin: cabin number
# 11 Embarked: port of embarkation
# From the data analysis: the median age is 28, so missing ages are filled with 28,
# and age is split into three bins at 25 and 31.
# sibsp & parch are reduced to two classes based on whether the passenger has family aboard.
# 342 passengers survived; the mean fare, overall and among survivors, is about 32.
# 126 survivors paid a fare above 32.
# 549 passengers died; 464 of them paid a fare below 32.
# 680 passengers paid a fare below 32, with a death rate of 0.68.
# 773 passengers paid a fare below 64, of whom 512 died, so 64 is chosen as the fare split.


def loadDataset(filename):
    with open(filename, 'r') as f:
        lines = csv.reader(f)
        data_set = list(lines)
    if filename != 'titanic.csv':
        for i in range(len(data_set)):
            del(data_set[i][0])
    # drop the unused columns and merge SibSp with Parch
    for i in range(len(data_set)):
        del(data_set[i][0])                  # PassengerId
        del(data_set[i][2])                  # Name
        data_set[i][4] += data_set[i][5]     # note: values are still strings here, so SibSp and Parch are concatenated
        del(data_set[i][5])                  # Parch
        del(data_set[i][5])                  # Ticket
        del(data_set[i][6])                  # Cabin
        del(data_set[i][-1])                 # Embarked

    category = data_set[0]

    del(data_set[0])
    # convert the remaining fields to numbers
    for data in data_set:
        data[0] = int(data[0])
        data[1] = int(data[1])
        if data[3] != '':
            data[3] = float(data[3])
        else:
            data[3] = None
        data[4] = float(data[4])
        data[5] = float(data[5])
    # fill missing values, recode and discretize
    for data in data_set:
        if data[3] is None:
            data[3] = 28
        # male: 1, female: 0
        if data[2] == 'male':
            data[2] = 1
        else:
            data[2] = 0
        # age < 25 -> 0, 25 <= age < 60 -> 1, age >= 60 -> 2
        # (the analysis suggested 31 as the upper boundary, but a split at 60 tested best)
        if data[3] < 25:
            data[3] = 0
        elif data[3] >= 21 and data[3] < 60:
            data[3] = 1
        else:
            data[3] = 2
        # sibsp & parch: 0 if the combined value is below 2, else 1
        if data[4] < 2:
            data[4] = 0
        else:
            data[4] = 1
        # fare: split at 64
        if data[-1] < 64:
            data[-1] = 0
        else:
            data[-1] = 1
    return data_set, category


class NaiveBayes:
    def __init__(self):
        pass

    def train(self, data):
        length = len(data)
        p = []
        for i in range(len(data[0])):
            p.append([])
        # prior probabilities P(survived) and P(died)
        sur = 0
        for i in data:
            if i[0] == 1:
                sur += 1

        death = length - sur
        p1 = sur/length
        p0 = 1 - p1
        p[0].append(p1)
        p[0].append(p0)

        print(p)

        # conditional probabilities for pclass
        a1 = 0
        a0 = 0
        b1 = 0
        b0 = 0
        c1 = 0
        c0 = 0
        for i in data:
            if i[0] == 1 and i[1] == 1:
                a1 += 1
            elif i[0] == 0 and i[1] == 1:
                a0 += 1
            elif i[0] == 1 and i[1] == 2:
                b1 += 1
            elif i[0] == 0 and i[1] == 2:
                b0 += 1
            elif i[0] == 1 and i[1] == 3:
                c1 += 1
            elif i[0] == 0 and i[1] == 3:
                c0 += 1
        p[1].append(a1 / sur)
        p[1].append(a0 / death)
        p[1].append(b1 / sur)
        p[1].append(b0 / death)
        p[1].append(c1 / sur)
        p[1].append(c0 / death)

        # conditional probabilities for sex
        m1 = 0
        m0 = 0
        f1 = 0
        f0 = 0
        for i in data:
            if i[0] == 1 and i[2] == 1:
                m1 += 1
            elif i[0] == 0 and i[2] == 1:
                m0 += 1
            elif i[0] == 1 and i[2] == 0:
                f1 += 1
            elif i[0] == 0 and i[2] == 0:
                f0 += 1
        p[2].append(m1 / sur)
        p[2].append(m0 / death)
        p[2].append(f1 / sur)
        p[2].append(f0 / death)

        # conditional probabilities for age (categories 0, 1, 2 from loadDataset)
        a1 = 0
        a0 = 0
        b1 = 0
        b0 = 0
        c1 = 0
        c0 = 0
        for i in data:
            if i[0] == 1 and i[3] == 0:
                a1 += 1
            elif i[0] == 0 and i[3] == 0:
                a0 += 1
            elif i[0] == 1 and i[3] == 1:
                b1 += 1
            elif i[0] == 0 and i[3] == 1:
                b0 += 1
            elif i[0] == 1 and i[3] == 2:
                c1 += 1
            elif i[0] == 0 and i[3] == 2:
                c0 += 1
        p[3].append(a1 / sur)
        p[3].append(a0 / death)
        p[3].append(b1 / sur)
        p[3].append(b0 / death)
        p[3].append(c1 / sur)
        p[3].append(c0 / death)

        # conditional probabilities for sibsp & parch
        m1 = 0
        m0 = 0
        f1 = 0
        f0 = 0
        for i in data:
            if i[0] == 1 and i[4] == 1:
                m1 += 1
            elif i[0] == 0 and i[4] == 1:
                m0 += 1
            elif i[0] == 1 and i[4] == 0:
                f1 += 1
            elif i[0] == 0 and i[4] == 0:
                f0 += 1
        p[4].append(m1 / sur)
        p[4].append(m0 / death)
        p[4].append(f1 / sur)
        p[4].append(f0 / death)

        # conditional probabilities for fare
        m1 = 0
        m0 = 0
        f1 = 0
        f0 = 0
        for i in data:
            if i[0] == 1 and i[5] == 1:
                m1 += 1
            elif i[0] == 0 and i[5] == 1:
                m0 += 1
            elif i[0] == 1 and i[5] == 0:
                f1 += 1
            elif i[0] == 0 and i[5] == 0:
                f0 += 1
        p[5].append(m1 / sur)
        p[5].append(m0 / death)
        p[5].append(f1 / sur)
        p[5].append(f0 / death)

        return p

    def predict(self, t_data, p):
        result = []
        pp = []
        for data in t_data:
            p1 = p[0][0]
            p0 = p[0][1]
            # pclass
            if data[1] == 1:
                p1 *= p[1][0]
                p0 *= p[1][1]
            elif data[1] == 2:
                p1 *= p[1][2]
                p0 *= p[1][3]
            elif data[1] == 3:
                p1 *= p[1][4]
                p0 *= p[1][5]
            # sex
            if data[2] == 1:
                p1 *= p[2][0]
                p0 *= p[2][1]
            else:
                p1 *= p[2][2]
                p0 *= p[2][3]

            # age
            if data[3] == 0:
                p1 *= p[3][0]
                p0 *= p[3][1]
            elif data[3] == 1:
                p1 *= p[3][2]
                p0 *= p[3][3]
            else:
                p1 *= p[3][4]
                p0 *= p[3][5]

            # sibsp & parch
            if data[4] == 1:
                p1 *= p[4][0]
                p0 *= p[4][1]
            else:
                p1 *= p[4][2]
                p0 *= p[4][3]

            # fare
            if data[5] == 1:
                p1 *= p[5][0]
                p0 *= p[5][1]
            else:
                p1 *= p[5][2]
                p0 *= p[5][3]

            pp.append(p1)   # unnormalized score for class 1, used later for the ROC curve

            if p1 >= p0:
                result.append(1)
            else:
                result.append(0)
        return pp, result


def roc_auc(p, test, result):
    tp = 0
    tn = 0
    # tp/tn here are the totals of positive and negative test samples (ROC denominators)
    for i in test:
        if i[0] == 1:
            tp += 1
        else:
            tn += 1
    # bundle score, true label and prediction for each test sample
    for i in range(len(test)):
        test[i] = test[i][0]
        p[i] = [p[i]]
        p[i].append(test[i])
        p[i].append(result[i])

    p = sorted(p, reverse=True)
    print(p)
    tpr = [0]
    fpr = [0]
    length = len(p)

    # walk through the samples from highest to lowest score,
    # stepping up for positives and right for negatives
    for i in range(length):
        if p[i][1] == 1:
            fpr.append(fpr[-1])
            tpr.append(tpr[-1] + 1/tp)
        elif p[i][1] == 0:
            fpr.append(fpr[-1] + 1/tn)
            tpr.append(tpr[-1])

    tpr.append(1)
    fpr.append(1)

    print(fpr)
    print(tpr)

    # AUC by the trapezoidal rule
    auc = 0
    for i in range(len(fpr)-1):
        auc += (fpr[i+1]-fpr[i])*(tpr[i]+tpr[i+1])
    auc = auc/2
    print(auc)

    fpr = np.array(fpr)
    tpr = np.array(tpr)

    plt.title("ROC curve of %s (AUC = %.4f)" % ('naive Bayes', auc))
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

    plt.plot(fpr, tpr)
    plt.show()


if __name__ == "__main__":
    # The initial exploratory analysis that produced the thresholds used in loadDataset (kept commented out):
    # train_set, category = loadDataset('titanic.csv')
    # print(category)
    # print(train_set)
    # k = [0, 0, 0, 0, 0, 0]
    # min = train_set[0][3]
    # max = 0
    # age = []
    # for data in train_set:
    #     for i in range(len(category)):
    #         if data[i] is None:
    #             k[i] += 1
    #     if data[3] is not None:
    #         if data[3] < min:
    #             min = data[3]
    #         if data[3] > max:
    #             max = data[3]
    #         age.append(data[3])
    #
    # age = sorted(age)
    #
    # print(category)
    # print(k)
    # print(min)
    # print(max)
    # print(age)
    # print((len(train_set)-k[3])/2)            # 357
    # print(age[int((len(train_set)-k[3])/2)])  # 28
    # print(len(age)/3)
    # print(age[297])       # 25
    # print(age[297*2])     # 31
    # for data in train_set:
    #     if data[3] is None:
    #         data[3] = 28
    # print(train_set)
    # s = 0
    # for data in train_set:
    #     if data[-1] < 64:
    #         s += 1
    #
    # print(s)
    # End of the initial data analysis
    train_set, category = loadDataset('titanic_train.csv')
    test_set, category = loadDataset('titanic_test.csv')

    print(category)
    print(train_set)

    bayes = NaiveBayes()

    p = bayes.train(train_set)
    print(p)

    pp, result = bayes.predict(test_set, p)
    print(result)
    count = 0
    for i in range(len(result)):
        if test_set[i][0] == result[i]:
            count += 1

    accuracy = count/len(test_set)
    print(accuracy)



    # # Ten-fold cross-validation (commented out)
    # data_set, category = loadDataset('titanic.csv')
    #
    # length = len(data_set) // 10
    # print(length)
    # data = []
    # for i in range(10):
    #     data.append(data_set[i * length:(i + 1) * length])
    # data.append(data_set[10 * length:])
    #
    # sum = 0
    #
    # for i in range(9):
    #     p = bayes.train(data[i])
    #     print(p)
    #
    #     pp, result = bayes.predict(data[9], p)
    #     print(result)
    #     count = 0
    #     for i in range(len(result)):
    #         if test_set[i][0] == result[i]:
    #             count += 1
    #     sum += count/len(data[9])
    #
    # print(sum/10)

    # Bootstrap sampling
    data_set, category = loadDataset('titanic.csv')
    bootstrapping = []
    for i in range(len(data_set)):
         bootstrapping.append(np.floor(np.random.random()*len(data_set)))

    test = []
    for i in range(len(data_set)):
         test.append(data_set[int(bootstrapping[i])])

    p = bayes.train(data_set)
    print(p)

    pp, result = bayes.predict(test, p)
    print(result)
    count = 0
    for i in range(len(result)):
        if test[i][0] == result[i]:
            count += 1

    accuracy = count / len(test)
    print(accuracy)

    # F1 score on the bootstrap sample
    tp = 0
    tn = 0
    for i in range(len(result)):
        if test[i][0] == result[i] == 1:
            tp += 1
        if test[i][0] == result[i] == 0:
            tn += 1
    print(tp)
    print(tn)

    f1 = (2*tp)/(len(test)+tp-tn)   # equals 2*TP / (2*TP + FP + FN), since len(test) = TP + TN + FP + FN
    print(f1)

    roc_auc(pp, test, result)