1原理
朴素贝叶斯
贝叶斯:根据贝叶斯定理p(y|x) = p(y)p(x|y)/p(x),选择p(y|x)最大的类别作为x的类别。可知朴素贝叶斯是监督学习的生成模型(先学习联合概率分布p(x,y),再由它得到后验概率分布p(y|x))。选择p(y|x)最大的类别时,分母p(x)对所有类别相同,所以简化为比较p(y)p(x|y)的大小。
朴素: 计算p(x|y)的概率时,假设x是n维向量,每一维特征有sn个可能取值,则需要估计 类别数×(sn的n次方) 量级的参数,过于复杂。因此假设样本的各个特征之间相互独立,所以叫朴素。则p(x|y) = p(xi|y)的连乘积,i=1,2,…,n。
为了防止p(xi|y)连乘时出现某个概率为0,通常采用拉普拉斯平滑。
2 三要素
模型:p(y|x) = p(y)p(x|y)/p(x),正比于p(y)p(x|y) = p(y)∏p(xi|y),i=1,2,…,n
策略:01损失函数
算法:极大似然估计,贝叶斯估计(加拉普拉斯平滑)
应用
假设特征相互独立,常用于垃圾邮件过滤,文本分类等词袋模型。
python实现
class NB(object):
    """Naive Bayes classifier for continuous features.

    Each feature column is discretized into ``self.n`` equal-width bins,
    then class priors p(y) and per-feature conditional probabilities
    p(x_i | y) are estimated by (Laplace-smoothed) counting.
    """

    def __init__(self):
        # per-feature list of observed (discretized) values
        self.x = []
        # list of distinct class labels
        self.y = []
        # class prior probabilities p(y)
        self.py = defaultdict(float)
        # conditional probabilities, indexed as pxy[label][feature] -> list
        self.pxy = defaultdict(lambda: defaultdict(list))
        # number of equal-width discretization bins per feature
        self.n = 5

    def preProcess(self, X):
        """Return a copy of X with every column discretized into self.n bins.

        Works on a float copy so the caller's array is never mutated
        (the original modified X in place).
        """
        X = np.array(X, dtype=float)
        for i in range(X.shape[1]):
            X[:, i] = self.step(X[:, i], self.n)
        return X

    def step(self, x, n):
        """Map each value of the 1-D array x to its bin index in 1..n.

        Equal-width binning between min(x) and max(x); bin edges overlap at
        the boundary, where the lower bin wins (``break`` on first match).
        """
        ma = np.max(x)
        mi = np.min(x)
        stepN = (ma - mi) / float(n)
        for i in range(len(x)):
            for j in range(n):
                a = mi + j * stepN
                b = mi + (j + 1) * stepN
                if a <= x[i] <= b:
                    x[i] = j + 1
                    break
        return x

    def prob(self, element, arr):
        """Relative frequency of element in the array arr."""
        # np.sum (not builtin sum) so a scalar bool from a list comparison
        # cannot sneak in; arr is expected to be a numpy array.
        return np.sum(arr == element) / float(len(arr))

    def fit(self, X, y):
        """Estimate priors and conditional probabilities from (X, y)."""
        X = self.preProcess(X)
        y = np.asarray(y)  # boolean masking below requires an ndarray
        self.y = list(set(y))
        for i in range(X.shape[1]):
            self.x.append(list(set(X[:, i])))
        for yi in self.y:
            # BUG FIX: the prior must be the frequency of yi in the label
            # vector y; the original passed self.y (the deduplicated label
            # list), which is both wrong and crashes on list == int.
            self.py[yi] = self.prob(yi, y)
            for i in range(X.shape[1]):
                samples = X[y == yi, i]
                # Laplace smoothing (add-one), as promised in the write-up:
                # avoids zero probabilities and log(0) at prediction time.
                self.pxy[yi][i] = [
                    (np.sum(samples == xi) + 1.0) / (len(samples) + len(self.x[i]))
                    for xi in self.x[i]
                ]

    def predict_one(self, test_x):
        """Return the most probable label for one discretized sample."""
        # BUG FIX: start at -inf. Log-probabilities are <= 0, so the
        # original initial value of 0 made the update never fire and the
        # method always returned self.y[0].
        prob = -np.inf
        label = self.y[0]
        for yi in self.y:
            tempProb = np.log(self.py[yi])
            for i in range(len(test_x)):
                try:
                    idx = self.x[i].index(test_x[i])
                except ValueError:
                    # bin value never seen for this feature during fit:
                    # skip the feature instead of raising (robustness).
                    continue
                tempProb += np.log(self.pxy[yi][i][idx])
            if tempProb > prob:
                prob = tempProb
                label = yi
        return label

    def predict(self, samples):
        """Return predicted labels for a 2-D array of samples."""
        ylabels = []
        samples = self.preProcess(samples)
        for i in range(samples.shape[0]):
            # BUG FIX: the original indexed the global ``test_x`` instead of
            # the ``samples`` parameter (NameError outside the demo script).
            label = self.predict_one(samples[i, :])
            ylabels.append(label)
        return np.array(ylabels)

    def score(self, test_x, y):
        """Return classification accuracy on (test_x, y)."""
        ylabels = self.predict(test_x)
        num = 0
        for i in range(len(ylabels)):
            if ylabels[i] == y[i]:
                num += 1
        return num / float(len(test_x))
测试
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import defaultdict
# Demo: hand-rolled NB on the iris dataset (continuous features get binned).
X,y = load_iris(return_X_y=True)
# 70/30 train/test split; note the split is random, so the score varies per run
train_x,test_x,train_y,test_y = train_test_split(X,y,test_size=0.3)
nb = NB()
nb.fit(train_x,train_y)
nb.score(test_x,test_y)  # accuracy on the held-out 30%
由于iris数据集的特征是连续值,更适用于高斯朴素贝叶斯估计,所以这里的准确率不高
朴素贝叶斯用于垃圾邮件过滤时,使用多项式朴素贝叶斯,相当于只有一个特征,该特征的取值范围是所有文本构成的词汇表。
python实现
import numpy as np
from collections import defaultdict
class NBClassify(object):
    """Multinomial naive Bayes for bag-of-words text classification.

    Word counts per class start at 1 (Laplace smoothing) and all scoring
    is done in log space to avoid underflow.
    """

    def createVocab(self, trainData):
        """Return the list of all distinct tokens across the corpus."""
        vocab = set()
        for document in trainData:
            vocab |= set(document)
        return list(vocab)

    def bagOfVocab(self, vocabLst, inputData):
        """Convert a token sequence into a bag-of-words count vector."""
        counts = np.zeros(len(vocabLst))
        for token in inputData:
            if token in vocabLst:
                counts[vocabLst.index(token)] += 1
        return counts

    def fit(self, X, y):
        """Estimate log priors and log conditional word probabilities."""
        self.vocabLst = self.createVocab(X)
        # per-class document counts, later replaced by log priors
        self.py = defaultdict(float)
        # per-class word counts initialized to ones (add-one smoothing),
        # later replaced by log conditional probabilities
        self.pxy = defaultdict(lambda: np.ones(len(self.vocabLst)))
        dataMatrix = np.array(
            [self.bagOfVocab(self.vocabLst, document) for document in X]
        )
        for row, label in zip(dataMatrix, y):
            self.py[label] += 1
            self.pxy[label] += row
        total = dataMatrix.shape[0]
        for label in self.py:
            self.py[label] = np.log(self.py[label] / total)
        for label in self.pxy:
            self.pxy[label] = np.log(self.pxy[label] / np.sum(self.pxy[label]))

    def predictOne(self, testData):
        """Return the label with the highest posterior log-score."""
        testVector = self.bagOfVocab(self.vocabLst, testData)
        bestProb, bestLabel = -np.inf, 0
        for label, logCondProbs in self.pxy.items():
            score = self.py[label] + np.sum(logCondProbs * testVector)
            if score > bestProb:
                bestProb, bestLabel = score, label
        return bestLabel
测试
# Toy corpus (from "Machine Learning in Action"): six short documents,
# labelled 1 for abusive content and 0 for normal content.
trainDocs = [
    ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
]
trainLabels = [0, 1, 0, 1, 0, 1]
classifier = NBClassify()
classifier.fit(trainDocs, trainLabels)
# Expected: 0 (normal) for the first sample, 1 (abusive) for the second.
classifier.predictOne(['love', 'my', 'dalmation'])
classifier.predictOne(['stupid', 'garbage'])
sklearn实现
sklearn.naive_bayes包含三种贝叶斯分类器:高斯朴素贝叶斯(GaussianNB)、多项式朴素贝叶斯(MultinomialNB)、伯努利朴素贝叶斯(BernoulliNB)。
1 GaussianNB:适用于特征是连续变量。条件概率分布p(x|y)符合高斯分布。例如load_iris。
均值和方差,从y=ck(某一类)且xi=xi的数据中求得。
# Demo: GaussianNB on iris — suited to continuous features.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
# NOTE(review): rebinds X, y and the split variables from the earlier demo.
X,y = load_iris(return_X_y=True)
train_x,test_x,train_y,test_y = train_test_split(X,y,test_size=0.3)
gnb = GaussianNB()
gnb.fit(train_x,train_y)# train
gnb.predict(test_x)# predict
gnb.score(test_x,test_y)# accuracy on the held-out set
2 MultinomialNB:适用于特征是离散变量,条件概率分布p(x|y)符合多项式分布。例如上例中的垃圾邮件过滤。
# Demo: MultinomialNB on 20-newsgroups word counts — suited to discrete features.
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# downloads the dataset on first use
newsdata = fetch_20newsgroups(subset='all')
cv = CountVectorizer()
# sparse document-term count matrix
newsdataVector = cv.fit_transform(newsdata.data)
train_x,test_x,train_y,test_y = train_test_split(newsdataVector,newsdata.target,test_size=0.3)
mnb = MultinomialNB()
mnb.fit(train_x,train_y)
mnb.predict(test_x)
mnb.score(test_x,test_y)
3 BernoulliNB :适用于特征是离散变量,条件概率分布p(x|y)符合伯努利分布。即特征取值只有0,1.
垃圾邮件过滤中相当于文档向量不是词出现的词数,而是是否出现0或1.