信息内容安全-朴素贝叶斯邮件过滤(多项式模型)实验
代码
import os
import jieba
import random
from collections import Counter
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import confusion_matrix
def get_email():
    """Collect every e-mail file name under the ham and spam directories.

    Walks ``ham_path`` and then ``spam_path`` recursively (both are
    module-level globals) and appends one ``[file_name, label]`` entry per
    file to the global ``email_list``: label ``1`` for files found under
    ``ham_path`` and ``0`` for files found under ``spam_path``.  The global
    counter ``email_num`` is incremented once per file.

    Only the bare file name is recorded, not the directory it was found in.
    """
    global email_list
    global email_num
    # Ham first, then spam, so the resulting list order is ham entries
    # followed by spam entries.
    for folder, label in ((ham_path, 1), (spam_path, 0)):
        for _root, _dirs, names in os.walk(folder):
            for name in names:
                email_list.append([name, label])
                email_num += 1
def get_training_set(email_list, test_size=10):
    """Split the e-mails into a training set and a randomly drawn test set.

    A shallow copy of ``email_list`` becomes the global ``training_set``;
    ``test_size`` entries are then picked at random, removed from
    ``training_set`` and appended to the global ``test_set``.  For every
    entry moved, the matching global counter is decremented
    (``ham_training_num`` when the entry's label is ``1``, otherwise
    ``spam_training_num``) together with ``email_training_num``, which starts
    from the global ``email_num``.  Finally the test set is printed.

    The globals ``test_set``, ``ham_training_num``, ``spam_training_num`` and
    ``email_num`` must already exist when this is called.
    NOTE(review): the two ``*_training_num`` counters are presumably
    initialised to the total ham / spam counts by the caller -- confirm.

    Args:
        email_list: Sequence of ``[file_name, label]`` entries.  It is not
            modified.
        test_size: Number of entries to move into the test set (default 10,
            the value that used to be hard-coded).  If fewer entries are
            available, all of them are moved.
    """
    global training_set
    global test_set
    global ham_training_num
    global spam_training_num
    global email_training_num
    training_set = list(email_list)
    email_training_num = email_num
    # Clamp the number of draws so a small corpus cannot trigger an
    # IndexError by drawing from an already-empty training set.
    for _ in range(min(test_size, len(training_set))):
        # randrange() never returns its upper bound, whereas
        # int(random.uniform(0, n)) can yield n because uniform() may return
        # the end-point, which would index past the end of the list.
        picked = training_set.pop(random.randrange(len(training_set)))
        if picked[1] == 1:
            ham_training_num -= 1
        else:
            spam_training_num -= 1
        email_training_num -= 1
        test_set.append(picked)
    print("测试集:")
    print(test_set)
def get_priori_possibility():
global py1
global py2
py1 = round( (ham_training_num + a) / (email_training_num + 2 * a), 2)
py2 = round( (spam_training_num + a) / (email_training_num +