1、原理是贝叶斯定理,利用已知条件概率,求逆条件概率。
2、需要数值型或者布尔型数据
#创建词汇表
def createVocab(dataset):
    """Build the vocabulary from a list of tokenized documents.

    Bug fix: the original returned a ``set``, but ``createwordsvec``
    calls ``vocabset.index(word)``, which requires an ordered sequence.
    Return a sorted list instead (sorted for a deterministic order
    across runs).
    """
    vocabset = set()
    for sets in dataset:
        # union in each document's words
        vocabset = vocabset | set(sets)
    return sorted(vocabset)
#创建文档向量
def createwordsvec(vocabset, inputset):
    """Build a set-of-words vector over *vocabset*.

    Each slot is 1 if the corresponding vocabulary word occurs in
    *inputset* (regardless of how many times), else 0. Unknown words
    are reported and skipped.
    """
    vec = [0 for _ in vocabset]
    for word in inputset:
        if word not in vocabset:
            print('the word not in vocab. ')
            continue
        vec[vocabset.index(word)] = 1
    return vec
材料准备完毕,创建分类器训练函数:
def trainNB(trainmatrix, traincategory):
    """Train a naive Bayes word-vector classifier.

    Parameters: *trainmatrix* is a numpy matrix of document word
    vectors, *traincategory* the parallel list of labels (1 = bad/spam,
    0 = normal).

    Returns ``(prob1, prob0, probofbad)`` where *prob1*/*prob0* are the
    per-word LOG conditional probabilities for each class and
    *probofbad* is the prior P(class=1).

    Fixes over the original (the note below this function describes
    both): counts start at 1 and denominators at 2 (Laplace smoothing)
    so an unseen word cannot zero out the whole product, and the
    returned conditionals are logs to avoid floating-point underflow —
    ``classifyNB`` sums them instead of multiplying.
    """
    from numpy import ones, log  # vectorized counts and elementwise log

    numtraindoc = len(trainmatrix)
    eachlinewords = len(trainmatrix[0])
    # prior probability of the "bad" (label 1) class
    probofbad = sum(traincategory) / float(numtraindoc)
    # Laplace smoothing: initialize counts to 1, denominators to 2.0
    p0 = ones(eachlinewords)
    p1 = ones(eachlinewords)
    p0sum = 2.0
    p1sum = 2.0
    # accumulate word counts per class (simple numpy matrix arithmetic)
    for i in range(numtraindoc):
        if traincategory[i] == 1:
            p1 += trainmatrix[i]
            p1sum += sum(trainmatrix[i])
        else:
            p0 += trainmatrix[i]
            p0sum += sum(trainmatrix[i])
    # log conditionals: many tiny probabilities multiplied underflow to
    # 0; summing logs preserves the comparison and stays numerically safe
    prob1 = log(p1 / p1sum)
    prob0 = log(p0 / p0sum)
    return prob1, prob0, probofbad
存在一个问题:当通过计算概率的乘积判断文档属于哪一分类时,若其中一个条件概率为0,则整个乘积为0。为了避免这种影响,可将分子计数初始值设置为1,分母初始为2(即拉普拉斯平滑);另外一个问题是,由于有很多个远小于1的小数相乘,乘积会发生下溢出(被舍入为0),程序得不到正确答案。惯例是对概率取自然对数,把连乘变成连加,既避免下溢出又不改变大小比较的结果。
# Laplace smoothing: initialize word counts to 1 and denominators to 2.0
# so that a single unseen word cannot zero out the class probability.
p0 = ones(eachlinewords); p1 = ones(eachlinewords)
p0sum = 2.0; p1sum = 2.0
接着创建贝叶斯分类器:
def classifyNB(vec2classify, prob1, prob0, probofbad):
    """Classify a word vector by comparing class log-posteriors.

    *prob1*/*prob0* are per-word log conditional probabilities and
    *probofbad* the prior P(class=1); summing logs is equivalent to
    multiplying the raw probabilities. Returns 1 (bad) or 0 (normal).
    """
    score_bad = sum(vec2classify * prob1) + log(probofbad)
    score_ok = sum(vec2classify * prob0) + log(1.0 - probofbad)
    return 1 if score_bad > score_ok else 0
以上在计算概率的时候,只计算了词汇在表中有无出现,并未计算累加次数,书上称为词集模式,当然在有些时候我们是要按照词汇出现频次计算的概率,可以反馈出某些词汇带来的不同信息,像是信贷中常用到的运营商数据,其中的某个标签若出现的概率极高,则可以反映出用户某一方面的行为,对于之前创建的向量表函数稍作改动:
def createwordsvec(vocabset, inputset):
    """Build a bag-of-words vector over *vocabset*.

    Each slot counts how many times the corresponding vocabulary word
    occurs in *inputset*; unknown words are reported and skipped.
    """
    counts = [0] * len(vocabset)
    for word in inputset:
        try:
            counts[vocabset.index(word)] += 1
        except ValueError:
            # word is not in the vocabulary
            print('the word not in vocab. ')
    return counts
在读取文本后,经常有一些乱七八糟的符号,所以先要进行文本解析,正则表达式刚好用得上,书上的栗子是这样的:
def textparse(bigstring):
    """Tokenize raw text into lowercase words longer than 2 characters.

    Bug fix: the original split on ``r'\w*'`` — i.e. ON the word
    characters — which returns the punctuation/empty delimiters instead
    of the words. Splitting on one-or-more NON-word characters
    (``r'\W+'``) yields the actual tokens.
    """
    import re
    listoftokens = re.split(r'\W+', bigstring)
    # drop short tokens (articles, stray fragments) and normalize case
    return [tok.lower() for tok in listoftokens if len(tok) > 2]
def spamtest():
    """Hold-out test of the naive Bayes spam filter on the email corpus.

    Reads 25 spam and 25 ham messages (email/spam/1.txt .. 25.txt and
    email/ham/1.txt .. 25.txt), trains on a random 40 of the 50
    documents and prints the error rate on the remaining 10.

    Fixes over the original: broken string literals ('%d.txt' % i),
    file indices run 1..25 (50 documents, matching trainset = 50),
    the vocabulary builder is named createVocab, the hold-out loop
    stored the position instead of the document index (and Python 3
    ranges don't support del), trainNB returns (prob1, prob0, prior)
    in that order, plus assorted missing parens/colons/commas and the
    'porbofbad' typo.
    """
    import random
    from numpy import array

    doclist = []; classlist = []; fulltext = []
    # 25 spam + 25 ham messages, files named 1.txt .. 25.txt
    for i in range(1, 26):
        wordlist = textparse(open('email/spam/%d.txt' % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)
        wordlist = textparse(open('email/ham/%d.txt' % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)
    vocablist = createVocab(doclist)
    # hold-out split: randomly move 10 of the 50 document indices into
    # the test set (80/20 train/test)
    trainset = list(range(50)); testset = []
    for i in range(10):
        randindex = int(random.uniform(0, len(trainset)))
        testset.append(trainset[randindex])
        del trainset[randindex]
    trainmat = []; trainclass = []
    for docindex in trainset:
        trainmat.append(createwordsvec(vocablist, doclist[docindex]))
        trainclass.append(classlist[docindex])
    # trainNB returns (prob1, prob0, prior) — keep the unpack in order
    prob1, prob0, probofbad = trainNB(array(trainmat), array(trainclass))
    errorcount = 0.0
    for docindex in testset:
        wordvec = createwordsvec(vocablist, doclist[docindex])
        if classifyNB(array(wordvec), prob1, prob0, probofbad) != classlist[docindex]:
            errorcount += 1
    print('the error rate is :', float(errorcount) / len(testset))