Principle:
If P(c1) > P(c0), classify the event as c1;
if P(c0) > P(c1), classify the event as c0.
We classify community posts into abusive and non-abusive posts. The samples are postingList and the labels are classVec, where 1 denotes an abusive post.
def loadDataSet():
    postingList = [
        ['my','dog','has','flea','problem','help','please'],
        ['maybe','not','take','him','to','dog','park','stupid'],
        ['my','dalmation','is','so','cute','i','love','him'],
        ['stop','posting','stupid','garbage'],
        ['mr','licks','age','my','steak','how','to','stop','him'],
        ['quit','buying','worthless','dog','food','stupid']
    ]
    classVec = [0,1,0,1,0,1]  # 0 = normal, 1 = abusive
    return postingList,classVec
Next, build the vocabulary: gather the words from all posts into a single list, remove duplicates, and return a vocabulary list containing every unique word.
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # set union accumulates every word once
    return list(vocabSet)  # all words collected into one list, deduplicated
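A quick check of the two functions so far (a set's iteration order varies between runs, so the exact word order is not fixed):

posts, classes = loadDataSet()
vocab = createVocabList(posts)
print(len(vocab))  # 32 unique words across the six posts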
Next, convert each post into a vector the length of the vocabulary: 1 means the corresponding vocabulary word appears in the post, 0 means it does not.
def setOfWords2Vec(vocabList,inputSet):  # vocabList: deduplicated vocabulary; inputSet: one post
    """
    Convert a post into a vector the length of the vocabulary:
    for each word in the input set that exists in the vocabulary,
    set that word's position to 1; all other positions stay 0.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print('the word: %s is not in my Vocabulary' % word)
    return returnVec
After processing, the six posts become:
[
[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
[1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
[1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]
]
According to Bayes' theorem:
P(c|w) = P(w|c)P(c)/P(w) = P(w1|c)P(w2|c)···P(wn|c) · P(c)/P(w), where w is the word vector: the probability that a post belongs to class c is the probability of that vector given c, times the prior probability of c, divided by P(w). The factorization P(w|c) = P(w1|c)P(w2|c)···P(wn|c) is the "naive" assumption that words are conditionally independent given the class. P(w) is the same for both classes, so it can be dropped from the comparison.
To judge whether a post such as ['you','stupid','dog'] is abusive, compute its probability under each of the two classes as follows:
- convert the post into a vector using the vocabulary built earlier;
- from the training set, compute the probability of each word appearing in abusive posts, and the overall proportion of abusive posts;
- apply Bayes' theorem to get the post's probability under each class.
Notes:
1. The formula is computed in log space to avoid numerical underflow.
2. Word counts are initialized to 1 and the denominators to 2.0, so that no conditional probability is ever zero (Laplace smoothing); otherwise a single unseen word would zero out the whole product.
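A quick illustration of both points (an illustrative snippet, not part of the book's code):

import math

p = 0.01
print(p ** 200)            # 0.0 -- the product of many small probabilities underflows
print(200 * math.log(p))   # about -921.0 -- the equivalent log-sum stays representable
# Laplace-style smoothing: (count + 1) / (total + 2) is never zero
print((0 + 1) / (10 + 2))  # ~0.083 instead of 0/10 = 0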
from numpy import ones, log, array

def trainNB0(trainMatrix,trainCategory):
    """
    trainMatrix: list of 0/1 word vectors, one per post;
    trainCategory: the labels, e.g. [0,1,0,1,0,1].
    Computes P(c1) and the per-word conditional probabilities
    needed for Bayes' theorem P(c|w) = P(w|c)P(c)/P(w).
    """
    numTrainDocs = len(trainMatrix)   # number of posts: 6
    numWords = len(trainMatrix[0])    # vocabulary size
    pAbusive = sum(trainCategory)/float(numTrainDocs)  # abusive posts / total posts
    p0Num = ones(numWords); p1Num = ones(numWords)     # counts start at 1 (Laplace smoothing)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)  # log P(w|c1), one entry per vocabulary word
    p0Vect = log(p0Num / p0Denom)  # log P(w|c0)
    return p0Vect,p1Vect,pAbusive  # pAbusive is the abusive-class prior
About the line p1 = sum(vec2Classify * p1Vec) + log(pClass1) below, note that:
- taking logarithms turns the product into a sum;
- p1Vec was already log-transformed in trainNB0;
- vec2Classify * p1Vec keeps exactly the log-probabilities of the words that occur in the post being classified.
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    """
    vec2Classify: word vector of the post to classify, produced by setOfWords2Vec()
    p0Vec: log P(w|c0)
    p1Vec: log P(w|c1)
    pClass1: proportion of abusive posts
    """
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
def testingNB():
    post,classes = loadDataSet()
    vocab = createVocabList(post)  # vocab now contains every word in the dataset
    trainMat = []                  # list of word vectors, one per post
    for postinDoc in post:
        trainMat.append(setOfWords2Vec(vocab,postinDoc))
    p0V,p1V,pAb = trainNB0(trainMat,classes)
    testEntry = ['love','my','dalmation']
    thisDoc = array(setOfWords2Vec(vocab,testEntry))
    print(testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb))
    testEntry = ['my','shit']
    thisDoc = array(setOfWords2Vec(vocab,testEntry))
    print(testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb))
Practice: filtering spam email
Goal: given any email, decide whether it is spam.
Steps:
- Read the spam and ham emails, parse each text into a word list, collect them in docList, and record the labels in classList.
- Build a training set and a test set; the training set is just a list of document indices to iterate over.
- Convert each text into a numeric vector of the form [0,1,0,0,1,0,0,1], marking whether each vocabulary word appears, and collect the training labels in trainClasses.
- Use trainNB0 to compute p0V, p1V, and pSpam, where p1V is the vector of per-word probabilities within spam.
- Use classifyNB to compute the probability of each test email under the two classes.
import random

def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1,26):
        # spam examples, labeled 1 (textParse is defined further below)
        wordList = textParse(open('MachineLearningPractice/ch04_Bayes/email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # ham examples, labeled 0
        wordList = textParse(open('MachineLearningPractice/ch04_Bayes/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50)); testSet = []
    for i in range(10):  # hold out 10 random documents for testing
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:',float(errorCount)/len(testSet))
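Because the held-out test set is chosen at random, the printed error rate fluctuates from run to run; repeating the experiment gives a feel for the average:

for _ in range(10):
    spamTest()  # each run prints the error rate for one random train/test split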
Practice: inferring regional tendencies from personal ads
Principle: use differences in word choice between ads posted in different regions to judge where the poster is from.
Steps:
- Pull personal ads from two RSS feeds, ny for New York and sf for the San Francisco Bay area; extract each entry from its entries list as a string and convert it into a word list with textParse(), producing a list of lists.
- Deduplicate with createVocabList() to get the word list vocabList.
- Remove the 30 most frequently used words, which lowers the error rate.
- Build a training set and a test set of document indices to iterate over.
- Build the training matrix trainMat and the test vectors.
- Convert each document into a vector (bag-of-words model, see bagOfWord2VectMN below), append it to the training matrix, and record its label.
- Use the training matrix and labels to compute the per-word probabilities for each region.
- Use classifyNB() to compute each test document's probability under the two regions and compare.
def bagOfWord2VectMN(vocabList,inputSet):
    """Bag-of-words model: count how many times each vocabulary word occurs."""
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # += 1 instead of = 1
    return returnVec
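The difference from setOfWords2Vec is that repeated words are counted rather than merely flagged; a small comparison:

vocab = ['my', 'dog', 'stupid']
post = ['my', 'dog', 'my', 'dog']
print(setOfWords2Vec(vocab, post))    # [1, 1, 0] -- presence only
print(bagOfWord2VectMN(vocab, post))  # [2, 2, 0] -- occurrence counts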
import re

def textParse(bigString):
    listOfTokens = re.split(r'\W+',bigString)  # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]  # drop tokens of length <= 2
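localWords below also calls calMostFreq, which does not appear in this section; a minimal sketch consistent with how it is used there (returning the 30 most frequent (word, count) pairs):

def calMostFreq(vocabList, fullText):
    """Return the 30 most frequent (word, count) pairs in fullText."""
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)  # occurrences of each vocabulary word
    sortedFreq = sorted(freqDict.items(), key=lambda pair: pair[1], reverse=True)
    return sortedFreq[:30]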
def localWords(feed1,feed0):
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        summary1 = feed1['entries'][i]['summary']
        wordList = textParse(summary1.replace('\n',''))
        docList.append(wordList)
        fullText.extend(wordList)  # extend on a string would append single characters, so extend the word list
        classList.append(1)        # 1 = NY feed
        summary0 = feed0['entries'][i]['summary']
        wordList = textParse(summary0.replace('\n',''))
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)        # 0 = SF feed
    vocabList = createVocabList(docList)
    top30Words = calMostFreq(vocabList,fullText)
    for pairW in top30Words:       # list of (word, count) pairs
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])  # remove high-frequency words to reduce the error rate
    trainingSet = list(range(2*minLen)); testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0,len(trainingSet)))  # random index in [0, len(trainingSet))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWord2VectMN(vocabList,docList[docIndex]))  # docList is a list of word lists
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWord2VectMN(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:',float(errorCount)/len(testSet))
    return vocabList,p0V,p1V
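localWords expects already-parsed RSS feeds; in the book these come from feedparser and the Craigslist personals feeds (the URLs may no longer be live):

import feedparser

ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = localWords(ny, sf)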
List the words that appear with high frequency in each region's ads:
def getTopWords(ny,sf):
    """
    Analyze the data: display region-specific word usage.
    """
    vocabList,p0V,p1V = localWords(ny,sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        # keep words whose log conditional probability exceeds the -5.0 threshold
        if p0V[i] > -5.0:
            topSF.append((vocabList[i],p0V[i]))
        if p1V[i] > -5.0:
            topNY.append((vocabList[i],p1V[i]))
    sortedSF = sorted(topSF,key=lambda pair: pair[1],reverse=True)  # descending order
    print("SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY,key=lambda pair: pair[1],reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
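Called with the same feed objects as above:

getTopWords(ny, sf)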