机器学习算法完整版见fenghaootong-github
垃圾邮件分类
在DATA/email/spam文件夹中有25封垃圾邮件,在DATA/email/ham中有25封正常邮件,将其进行垃圾邮件分类。
导入需要的库
from numpy import *
import re
import warnings
warnings.filterwarnings('ignore')
分词
将邮件内容划分成一个个单词的形式
def textParse(bigString):
    """Split raw email text into lowercase word tokens longer than 2 characters.

    Args:
        bigString: the full text of one email.

    Returns:
        list[str]: lowercase tokens of length > 2 (short words and the
        empty fragments produced at string edges are filtered out).
    """
    # Split on runs of non-word characters. NOTE: the original pattern was
    # r'\W*', which can match the empty string; re.split with such patterns
    # splits between every character on Python 3.7+ (and warned before that).
    # r'\W+' is the correct delimiter.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
re.split(r'\W*', bigString)
, 表示按除了数字、字母和下划线之外的符号进行划分,return 是一个列表推导式生成的列表,单词长度小于等于2的会被过滤掉,并且将其变成小写字母
textParse(open('../DATA/email/spam/1.txt').read())
['codeine',
'15mg',
'for',
'203',
'visa',
'only',
'codeine',
'methylmorphine',
'narcotic',
'opioid',
'pain',
'reliever',
'have',
'15mg',
'30mg',
'pills',
'15mg',
'for',
'203',
'15mg',
'for',
'385',
'15mg',
'for',
'562',
'visa',
'only']
textParse(open('../DATA/email/ham/1.txt').read())
['peter',
'with',
'jose',
'out',
'town',
'you',
'want',
'meet',
'once',
'while',
'keep',
'things',
'going',
'and',
'some',
'interesting',
'stuff',
'let',
'know',
'eugene']
生成词汇表
将所有的邮件进行分词后生成一个dataset,然后生成一个词汇表,这个词汇表是一个集合,每个单词出现一次
def createVocabList(dataSet):
    """Build the vocabulary: a list of every distinct word across all documents.

    Args:
        dataSet: iterable of documents, each a list of word tokens.

    Returns:
        list[str]: the distinct words (order is whatever set iteration yields).
    """
    vocab = set()
    for document in dataSet:
        # Accumulate the union of each document's word set.
        vocab |= set(document)
    return list(vocab)
# Demo: build a one-document corpus from the first spam email,
# then derive its vocabulary (the notebook cell displays the list).
doc_list=[]
word_list = textParse(open('../DATA/email/spam/1.txt').read())
doc_list.append(word_list)
createVocabList(doc_list)
['reliever',
'pills',
'562',
'pain',
'only',
'15mg',
'methylmorphine',
'for',
'codeine',
'30mg',
'have',
'narcotic',
'visa',
'opioid',
'385',
'203']
生成词向量
每一封邮件的单词都存在于词汇表中,因此可以将每一封邮件生成一个词向量,单词出现几次则对应位置的值为几,不存在的为零,例如上面的邮件中 15mg 出现了 5 次,其词向量对应位置的值为 5
def bagOfWords2Vec(vocabList, inputSet):
    """Convert a document into a bag-of-words count vector over a vocabulary.

    Args:
        vocabList: list of vocabulary words; defines the vector positions.
        inputSet: the document as a list of word tokens.

    Returns:
        list[int] of len(vocabList); element i is the number of times
        vocabList[i] occurs in inputSet. Unknown words are reported and
        skipped.
    """
    # Build the word -> index map once: O(1) lookups instead of the
    # O(len(vocabList)) cost of vocabList.index(word) on every token.
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] += 1
        else:
            print("the word is not in my vocabulry")
    return returnVec
bagOfWords2Vec(createVocabList(doc_list),doc_list[0])
[1, 1, 1, 1, 2, 5, 1, 4, 2, 1, 1, 1, 2, 1, 1, 2]
训练算法
伪代码如下:
计算每个类别中的文档数目
对每篇训练文档:
对每个类别:
如果词条出现在文档中——>增加该词条的计数值
增加所有词条的计数值
对每个类别:
对每个词条:
将该词条的数目除以总词条数目得到条件概率
返回每个类别的条件概率
#这里的trainMat是训练样本的词向量,其是一个矩阵,他的每一行为一个邮件的词向量
#trainGategory为与trainMat对应的类别,值为0,1表示正常,垃圾
def train(trainMat, trainGategory):
    """Train a naive Bayes spam model.

    Args:
        trainMat: 2-D array of bag-of-words vectors, one row per email.
        trainGategory: 0/1 labels aligned with trainMat rows (1 = spam).

    Returns:
        (p0Vec, p1Vec, pAbusive): log word-probability vectors for class 0
        and class 1, and the prior probability of class 1.
    """
    docCount = len(trainMat)
    vocabSize = len(trainMat[0])  # vocabulary length
    # Prior P(class = 1): fraction of spam documents.
    pAbusive = sum(trainGategory) / float(docCount)
    # Laplace smoothing: every word count starts at 1, each total at 2,
    # so no word gets probability zero (log would be -inf).
    hamCounts = ones(vocabSize)
    spamCounts = ones(vocabSize)
    hamTotal = 2.0
    spamTotal = 2.0
    for vec, label in zip(trainMat, trainGategory):
        if label == 1:
            spamCounts += vec           # per-word counts in the spam class
            spamTotal += sum(vec)       # total words seen in spam
        else:
            hamCounts += vec
            hamTotal += sum(vec)
    # Log-probabilities of each word given the class.
    p1Vec = log(spamCounts / spamTotal)
    p0Vec = log(hamCounts / hamTotal)
    return p0Vec, p1Vec, pAbusive
处理数据验证过程
# spam email classification: hold-out evaluation of the naive Bayes filter
def spamTest():
    """Run one random hold-out evaluation of the spam classifier.

    Reads 25 spam and 25 ham emails from ../DATA/email/, holds out 10
    randomly chosen documents as a test set, trains on the remaining 40,
    then prints each misclassified document and the error rate.
    """
    fullTest = []
    docList = []
    classList = []
    # Load the corpus: files are named 1.txt .. 25.txt in each folder.
    for i in range(1, 26):
        words = textParse(open('../DATA/email/spam/%d.txt' % i).read())
        docList.append(words)
        fullTest.extend(words)
        classList.append(1)   # 1 = spam
        words = textParse(open('../DATA/email/ham/%d.txt' % i).read())
        docList.append(words)
        fullTest.extend(words)
        classList.append(0)   # 0 = ham
    vocabList = createVocabList(docList)  # build the shared vocabulary
    # Randomly move 10 document indices from the training pool to the test set.
    trainSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainSet)))  # index into trainSet
        testSet.append(trainSet[randIndex])
        del trainSet[randIndex]
    # Vectorize the remaining 40 documents and train the model.
    trainMat = []
    trainClass = []
    for docIndex in trainSet:
        trainMat.append(bagOfWords2Vec(vocabList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0, p1, pSpam = train(array(trainMat), array(trainClass))
    # Score the held-out documents.
    errCount = 0
    for docIndex in testSet:
        wordVec = bagOfWords2Vec(vocabList, docList[docIndex])
        if classfy(array(wordVec), p0, p1, pSpam) != classList[docIndex]:
            errCount += 1
            print(("classfication error"), docList[docIndex])
    print(("the error rate is "), float(errCount)/len(testSet))
# decision rule: compare the two class log-posteriors
def classfy(vec2classfy, p0Vec, p1Vec, pClass1):
    """Classify a word-count vector: return 1 (spam) or 0 (ham).

    Args:
        vec2classfy: bag-of-words count vector for the document.
        p0Vec, p1Vec: log word-probability vectors from train().
        pClass1: prior probability of class 1.
    """
    # log P(doc | class) + log P(class), up to a shared normalizing constant.
    logSpam = sum(vec2classfy * p1Vec) + log(pClass1)
    logHam = sum(vec2classfy * p0Vec) + log(1 - pClass1)
    return 1 if logSpam > logHam else 0
# Repeat the random hold-out evaluation 10 times to gauge the average error rate.
for _ in range(10):
    spamTest()
classfication error ['scifinance', 'now', 'automatically', 'generates', 'gpu', 'enabled', 'pricing', 'risk', 'model', 'source', 'code', 'that', 'runs', '300x', 'faster', 'than', 'serial', 'code', 'using', 'new', 'nvidia', 'fermi', 'class', 'tesla', 'series', 'gpu', 'scifinance', 'derivatives', 'pricing', 'and', 'risk', 'model', 'development', 'tool', 'that', 'automatically', 'generates', 'and', 'gpu', 'enabled', 'source', 'code', 'from', 'concise', 'high', 'level', 'model', 'specifications', 'parallel', 'computing', 'cuda', 'programming', 'expertise', 'required', 'scifinance', 'automatic', 'gpu', 'enabled', 'monte', 'carlo', 'pricing', 'model', 'source', 'code', 'generation', 'capabilities', 'have', 'been', 'significantly', 'extended', 'the', 'latest', 'release', 'this', 'includes']
the error rate is 0.1
classfication error ['scifinance', 'now', 'automatically', 'generates', 'gpu', 'enabled', 'pricing', 'risk', 'model', 'source', 'code', 'that', 'runs', '300x', 'faster', 'than', 'serial', 'code', 'using', 'new', 'nvidia', 'fermi', 'class', 'tesla', 'series', 'gpu', 'scifinance', 'derivatives', 'pricing', 'and', 'risk', 'model', 'development', 'tool', 'that', 'automatically', 'generates', 'and', 'gpu', 'enabled', 'source', 'code', 'from', 'concise', 'high', 'level', 'model', 'specifications', 'parallel', 'computing', 'cuda', 'programming', 'expertise', 'required', 'scifinance', 'automatic', 'gpu', 'enabled', 'monte', 'carlo', 'pricing', 'model', 'source', 'code', 'generation', 'capabilities', 'have', 'been', 'significantly', 'extended', 'the', 'latest', 'release', 'this', 'includes']
the error rate is 0.1
the error rate is 0.0
the error rate is 0.0
classfication error ['experience', 'with', 'biggerpenis', 'today', 'grow', 'inches', 'more', 'the', 'safest', 'most', 'effective', 'methods', 'of_penisen1argement', 'save', 'your', 'time', 'and', 'money', 'bettererections', 'with', 'effective', 'ma1eenhancement', 'products', 'ma1eenhancement', 'supplement', 'trusted', 'millions', 'buy', 'today']
the error rate is 0.1
classfication error ['benoit', 'mandelbrot', '1924', '2010', 'benoit', 'mandelbrot', '1924', '2010', 'wilmott', 'team', 'benoit', 'mandelbrot', 'the', 'mathematician', 'the', 'father', 'fractal', 'mathematics', 'and', 'advocate', 'more', 'sophisticated', 'modelling', 'quantitative', 'finance', 'died', '14th', 'october', '2010', 'aged', 'wilmott', 'magazine', 'has', 'often', 'featured', 'mandelbrot', 'his', 'ideas', 'and', 'the', 'work', 'others', 'inspired', 'his', 'fundamental', 'insights', 'you', 'must', 'logged', 'view', 'these', 'articles', 'from', 'past', 'issues', 'wilmott', 'magazine']
classfication error ['yay', 'you', 'both', 'doing', 'fine', 'working', 'mba', 'design', 'strategy', 'cca', 'top', 'art', 'school', 'new', 'program', 'focusing', 'more', 'right', 'brained', 'creative', 'and', 'strategic', 'approach', 'management', 'the', 'way', 'done', 'today']
the error rate is 0.2
the error rate is 0.0
classfication error ['scifinance', 'now', 'automatically', 'generates', 'gpu', 'enabled', 'pricing', 'risk', 'model', 'source', 'code', 'that', 'runs', '300x', 'faster', 'than', 'serial', 'code', 'using', 'new', 'nvidia', 'fermi', 'class', 'tesla', 'series', 'gpu', 'scifinance', 'derivatives', 'pricing', 'and', 'risk', 'model', 'development', 'tool', 'that', 'automatically', 'generates', 'and', 'gpu', 'enabled', 'source', 'code', 'from', 'concise', 'high', 'level', 'model', 'specifications', 'parallel', 'computing', 'cuda', 'programming', 'expertise', 'required', 'scifinance', 'automatic', 'gpu', 'enabled', 'monte', 'carlo', 'pricing', 'model', 'source', 'code', 'generation', 'capabilities', 'have', 'been', 'significantly', 'extended', 'the', 'latest', 'release', 'this', 'includes']
the error rate is 0.1
the error rate is 0.0
classfication error ['yeah', 'ready', 'may', 'not', 'here', 'because', 'jar', 'jar', 'has', 'plane', 'tickets', 'germany', 'for']
classfication error ['home', 'based', 'business', 'opportunity', 'knocking', 'your', 'door', 'don抰', 'rude', 'and', 'let', 'this', 'chance', 'you', 'can', 'earn', 'great', 'income', 'and', 'find', 'your', 'financial', 'life', 'transformed', 'learn', 'more', 'here', 'your', 'success', 'work', 'from', 'home', 'finder', 'experts']
the error rate is 0.2