# Dataset row format: label (ham = legitimate message, spam = junk message), message text
#ham,I will reach ur home in # minutes
#spam,100 dating service cal;l 09064012103 box334sk38ch
#spam,Congratulations U can claim 2 VIP row A Tickets 2 C Blu in concert in November or Blu gift guaranteed Call
#ham,How much r u willing to pay?
#ham,THING R GOOD THANX GOT EXAMS IN MARCH IVE DONE NO REVISION? IS FRAN STILL WITH BOYF? IVE GOTTA INTERVIW 4 EXETER BIT WORRIED!x
#ham,Shall i ask one thing if you dont mistake me.
# #####################################################
# Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
print('*************************\nNaive Bayes\n*************************')

# Label strings found in the data file, mapped to the integer classes fed to
# the classifier (0 = legitimate message, 1 = spam).
LABEL_IDS = {"ham": 0, "spam": 1}


def split_sms_lines(lines, test_start_line=5550):
    """Parse ``label,text`` rows and split them into train and test sets.

    Line 1 is treated as the column header and skipped.  Rows on lines
    numbered above ``test_start_line`` (1-based, header included) go to the
    test set; all other rows go to the training set.  The two sets are
    disjoint, so the model is never trained on the rows it is asked to
    classify.

    Args:
        lines: Iterable of text lines, with or without trailing newlines.
        test_start_line: Last line number that still belongs to the
            training set.

    Returns:
        A tuple ``(train_texts, train_labels, test_texts, test_labels)`` of
        four lists.  Texts and labels of the same set always have the same
        length.
    """
    train_texts, train_labels = [], []
    test_texts, test_labels = [], []
    for line_no, line in enumerate(lines, start=1):
        if line_no == 1:
            # The first line holds the column names, not data.
            continue
        # Split on the FIRST comma only: the message itself may contain
        # commas and must not be truncated.
        raw_label, comma, text = line.partition(",")
        label_id = LABEL_IDS.get(raw_label.strip())
        if not comma or label_id is None:
            # Blank/malformed row or unknown label: skip it entirely so
            # texts and labels cannot get out of step.
            continue
        text = text.rstrip("\r\n")
        if line_no > test_start_line:
            test_texts.append(text)
            test_labels.append(label_id)
        else:
            train_texts.append(text)
            train_labels.append(label_id)
    return train_texts, train_labels, test_texts, test_labels


def main(path="./sms_spam.txt", test_start_line=5550):
    """Train a multinomial Naive Bayes spam filter and classify held-out rows.

    Reads the UTF-8 file at ``path``, builds bag-of-words count features,
    fits the classifier on the training rows and prints one verdict per
    test row.

    Args:
        path: Location of the ``label,text`` data file.
        test_start_line: Rows on later lines are used as the test set.

    Raises:
        ValueError: If the file yields no usable training rows.
    """
    # newline="\n" keeps line numbering identical to reading the file in
    # binary mode: a stray "\r" inside a message does not start a new line.
    with open(path, encoding="utf-8", newline="\n") as data_file:
        train_texts, train_labels, test_texts, _test_labels = split_sms_lines(
            data_file, test_start_line
        )
    if not train_texts:
        raise ValueError("no training rows found in %r" % (path,))

    # Bag-of-words feature extraction: word order is ignored, every distinct
    # token becomes one column (the vocabulary), and each message is
    # represented by the count of each vocabulary token it contains.
    vectorizer = CountVectorizer()
    fea_train = vectorizer.fit_transform(train_texts)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when available and fall back for older releases.
    feature_names = getattr(vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    print("vectorizer.get_feature_names is ", list(feature_names()))
    # NOTE: toarray() materialises the full dense matrix, which is large for
    # the whole corpus; it is kept only for this diagnostic print.
    print("fea_train is ", fea_train.toarray())

    # alpha=1 is Laplace (add-one) smoothing of the per-class word counts.
    clf = MultinomialNB(alpha=1)
    clf.fit(fea_train, train_labels)

    if not test_texts:
        # Nothing was held out; predict() would reject an empty matrix.
        return

    # Encode the test messages with the vocabulary learned from the training
    # set; tokens never seen in training are ignored.
    fea_test = vectorizer.transform(test_texts)
    for predicted in clf.predict(fea_test):
        if predicted == 0:
            print("正常邮件")
        else:
            print("垃圾邮件")


if __name__ == '__main__':
    main()