# -*- coding: utf-8 -*-
"""
Created on Sat Oct 13 13:12:52 2018
@author: fengjuan
"""
# Naive Bayes models are widely used in practice, especially for text
# classification.
# Import the newsgroup data loader from sklearn.datasets.
from sklearn.datasets import fetch_20newsgroups

# Request the complete dataset (subset="all") and keep it in `news`;
# the later steps read news.data, news.target and news.target_names.
news = fetch_20newsgroups(subset="all")

# Show how many posts were loaded, then the raw text of the first post,
# each on its own line.
print(len(news.data), news.data[0], sep="\n")
'''Result (excerpt):
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu
I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game. PENS RULE!!!'''
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Split the posts and their labels into training and test sets, holding out
# 25% for testing. random_state is fixed so that repeated runs produce the
# same split.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

# Report the number of labels in each partition.
print(y_train.shape)
print(y_test.shape)
'''Result:
(14134,)
(4712,)'''
# Import the text feature extractor CountVectorizer from
# sklearn.feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()

# Fit the vectorizer on the training posts and convert them in the same step;
# the test posts are only transformed with the already-fitted vectorizer, so
# nothing is fitted on the test set. The right-hand side is evaluated left to
# right, so fitting happens before the test posts are transformed.
X_train, X_test = vec.fit_transform(X_train), vec.transform(X_test)
# Import the multinomial naive Bayes model from sklearn.naive_bayes and the
# evaluation helpers from sklearn.metrics.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Initialise the naive Bayes model with its default configuration, train it
# on the vectorized training posts and predict labels for the test posts.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_predict = mnb.predict(X_test)

# Score the predictions computed above instead of calling
# mnb.score(X_test, y_test), which would run predict() over the whole test
# set a second time to obtain the same accuracy value.
print('Accuracy of Naive_Bayes is:', accuracy_score(y_test, y_predict))

# Per-class precision / recall / F1, labelled with the newsgroup names.
print(classification_report(y_test, y_predict,
                            target_names=news.target_names))
# Result:
'''Accuracy of Naive_Bayes is: 0.8397707979626485
precision recall f1-score support
alt.atheism 0.86 0.86 0.86 201
comp.graphics 0.59 0.86 0.70 250
comp.os.ms-windows.misc 0.89 0.10 0.17 248
comp.sys.ibm.pc.hardware 0.60 0.88 0.72 240
comp.sys.mac.hardware 0.93 0.78 0.85 242
comp.windows.x 0.82 0.84 0.83 263
misc.forsale 0.91 0.70 0.79 257
rec.autos 0.89 0.89 0.89 238
rec.motorcycles 0.98 0.92 0.95 276
rec.sport.baseball 0.98 0.91 0.95 251
rec.sport.hockey 0.93 0.99 0.96 233
sci.crypt 0.86 0.98 0.91 238
sci.electronics 0.85 0.88 0.86 249
sci.med 0.92 0.94 0.93 245
sci.space 0.89 0.96 0.92 221
soc.religion.christian 0.78 0.96 0.86 232
talk.politics.guns 0.88 0.96 0.92 251
talk.politics.mideast 0.90 0.98 0.94 231
talk.politics.misc 0.79 0.89 0.84 188
talk.religion.misc 0.93 0.44 0.60 158
avg / total 0.86 0.84 0.82 4712'''