朴素贝叶斯
条件概率
P(A|B)=P(B|A)*P(A)/P(B)
scikit-learn 在 sklearn.naive_bayes 模块中提供了多种朴素贝叶斯分类器,这里介绍其中三种:
- 多项式朴素贝叶斯(MultinomialNB)
- 伯努利朴素贝叶斯(BernoulliNB)
- 高斯朴素贝叶斯(GaussianNB)
示例:预测文本分类
# Text classification on the 20 newsgroups corpus, comparing Bernoulli and
# multinomial naive Bayes on hashed bag-of-words features.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

# Headers, footers and quoted replies are stripped from every post before use.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# alpha is the additive smoothing parameter of both estimators.
Bernoulli = BernoulliNB(alpha=0.01)
Multinomial = MultinomialNB(alpha=0.01)

# One vectorizer per model: raw term counts for the multinomial model,
# presence/absence indicators (binary=True) for the Bernoulli model.
# alternate_sign=False replaces the removed non_negative=True option
# (deprecated in scikit-learn 0.19, removed in 0.21); it keeps the hashed
# features non-negative, which MultinomialNB requires.
multinomial_hashing_trick = HashingVectorizer(stop_words='english', binary=False, norm=None, alternate_sign=False)
binary_hashing_trick = HashingVectorizer(stop_words='english', binary=True, norm=None, alternate_sign=False)

# HashingVectorizer is stateless, so transform() is used directly without fit().
Multinomial.fit(multinomial_hashing_trick.transform(newsgroups_train.data), newsgroups_train.target)
Bernoulli.fit(binary_hashing_trick.transform(newsgroups_train.data), newsgroups_train.target)

# Score each model on the held-out test split, using its own vectorizer.
for m, h in [(Bernoulli, binary_hashing_trick), (Multinomial, multinomial_hashing_trick)]:
    predictions = m.predict(h.transform(newsgroups_test.data))
    accuracy = accuracy_score(y_true=newsgroups_test.target, y_pred=predictions)
    print('Accuracy for %s: %.3f' % (m, accuracy))
高斯贝叶斯:
# Gaussian naive Bayes on the Boston housing data: the continuous target is
# binned into 4 ordinal classes so that a classifier can be fitted to it.
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.naive_bayes import GaussianNB

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this snippet only runs on scikit-learn < 1.2. Switching to another
# dataset would change the printed result, so it is left as-is here.
boston = load_boston()

Gaussian = GaussianNB()

# bins=4 splits the target range into 4 equal-width intervals; labels=False
# returns the integer bin index of each value instead of interval labels.
y_ord = pd.cut(boston.target, bins=4, labels=False)
Gaussian.fit(boston.data, y_ord)

# Pearson correlation between the predicted bin and the original continuous
# target, computed on the same data the model was fitted on (no hold-out set).
print(np.corrcoef(Gaussian.predict(boston.data), boston.target)[0, 1])