1. Get the raw text content
- def FileRead(self, filePath):
-     f = open(filePath)
-     raw = f.read()
-     return raw
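If the corpus files are not plain ASCII, a small hedged variant using codecs.open (Python 2) reads them as UTF-8 explicitly instead of relying on the sys.setdefaultencoding hack used later in the full listing; the assumption that the files are UTF-8 is mine, not the post's:
- import codecs
- def file_read_utf8(filePath):
-     # read the file with an explicit encoding instead of the interpreter default
-     f = codecs.open(filePath, 'r', encoding='utf-8')
-     raw = f.read()
-     f.close()
-     return raw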
2. Split the text into sentences
- def SenToken(self, raw):  # split the raw text into sentences
-     sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
-     sents = sent_tokenizer.tokenize(raw)
-     return sents
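A minimal usage sketch of the punkt sentence tokenizer on a made-up sample string; punkt is abbreviation-aware, so tokens like "Dr." do not end a sentence:
- import nltk
- sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
- sample = "Dr. Smith went to Washington. He arrived early and left late."
- print sent_tokenizer.tokenize(sample)
- # roughly: ['Dr. Smith went to Washington.', 'He arrived early and left late.']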
3. Clean each sentence: remove digits, punctuation, and other non-alphabetic characters
- def CleanLines(self, line):  # Python 2 str.translate with a deletechars argument
-     identify = string.maketrans('', '')
-     delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
-     cleanLine = line.translate(identify, delEStr)  # delete ASCII punctuation and digits
-     return cleanLine
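The two-argument form of str.translate used above only exists in Python 2. A rough Python 3 equivalent, assuming the same goal of deleting punctuation and digits, would build the table with the three-argument str.maketrans:
- import string
- def clean_line_py3(line):
-     # map nothing, replace nothing, delete punctuation and digits
-     table = str.maketrans('', '', string.punctuation + string.digits)
-     return line.translate(table)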
4. POS tagging with nltk.pos_tag
- def POSTagger(self, sents):  # note: nltk.pos_tag expects a list of tokens per sentence
-     taggedLine = [nltk.pos_tag(sent) for sent in sents]
-     return taggedLine
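For reference, nltk.pos_tag takes a list of tokens and returns (token, tag) pairs; a quick sketch, with the output hedged because the exact tags depend on the tagger model shipped with NLTK:
- import nltk
- tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")
- print nltk.pos_tag(tokens)
- # roughly: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ...]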
5. Tokenization with nltk.word_tokenize
- def WordTokener(self, sent):  # split a single sentence string into words
-     wordsInStr = nltk.word_tokenize(sent)
-     return wordsInStr
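A small sketch of what word_tokenize returns for one cleaned sentence:
- import nltk
- print nltk.word_tokenize("Natural language processing with NLTK is fun")
- # ['Natural', 'language', 'processing', 'with', 'NLTK', 'is', 'fun']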
6. Spell checking with enchant
- def WordCheck(self, words):  # spell check: show suggestions and let the user correct by hand
-     d = enchant.Dict("en_US")
-     checkedWords = ()
-     for word in words:
-         if not d.check(word):
-             print d.suggest(word)
-             word = raw_input()  # type a replacement (or re-enter the word) by hand
-         checkedWords = checkedWords + (word,)
-     return checkedWords
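A quick sketch of the pyenchant calls used above: check returns a boolean and suggest returns a list of candidate corrections (the exact suggestions depend on the installed dictionary):
- import enchant
- d = enchant.Dict("en_US")
- print d.check("Hello")   # True
- print d.check("Helo")    # False
- print d.suggest("Helo")  # e.g. ['Hello', 'Helot', ...]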
7. Remove stopwords, lowercase, and drop short words
- def CleanWords(self, wordsInStr):  # lowercase, drop stopwords and words shorter than 3 characters
-     cleanWords = []
-     stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
-     for words in wordsInStr:
-         cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
-     return cleanWords
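A self-contained sketch of the same filtering logic; the stopword set here is inlined for illustration only, since the real list is loaded from conf.PreConfig.ENSTOPWORDS, which the post does not show:
- # inline stopword set for illustration; the real list comes from a file, one word per line
- stopwords = {}.fromkeys(['the', 'is', 'at', 'of', 'on', 'and'])
- words = [['The', 'cat', 'sat', 'on', 'the', 'mat'], ['NLTK', 'is', 'useful']]
- cleanWords = [[w.lower() for w in sent if w.lower() not in stopwords and 3 <= len(w)]
-               for sent in words]
- print cleanWords  # [['cat', 'sat', 'mat'], ['nltk', 'useful']]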
8. Stemming with WordNet
- def StemWords(self, cleanWordsList):
-     stemWords = []
-     # porter = nltk.PorterStemmer()  # a PhD colleague said this stemmer does not work well, not very professional
-     # result = [porter.stem(t) for t in cleanTokens]
-     for words in cleanWordsList:
-         stemWords += [[wn.morphy(w) for w in words]]
-     return stemWords
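wn.morphy maps a word to its WordNet base form and returns None for words it does not know, which matters if the result is fed into further cleaning; a short sketch, including a fallback that keeps the original word:
- from nltk.corpus import wordnet as wn
- print wn.morphy('dogs')      # 'dog'
- print wn.morphy('churches')  # 'church'
- print wn.morphy('asdfg')     # None -- unknown words come back as None
- # keeping the original word as a fallback avoids None entries:
- words = ['dogs', 'churches', 'asdfg']
- print [wn.morphy(w) or w for w in words]  # ['dog', 'church', 'asdfg']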
9. Complete code
- #coding=utf-8
- '''
- Created on 2014-3-20
- English stemming and stopword removal
- @author: liTC
- '''
- import nltk
- # import enchant
- import string
- import re
- import os
- from config import Config as conf
- from nltk.corpus import wordnet as wn
- import sys
- reload(sys)
- sys.setdefaultencoding('utf-8')
- class EnPreprocess:
-     '''Overall pipeline:
-     Read the file:                 FileRead()    filepath to raw
-     Split into sentences:          SenToken()    raw to sents
-     (POS tagging):                 POSTagger()   sents to tagged sentences
-     Split a sentence into words:   WordTokener() sent to words[]
-     (Spell check):                 WordCheck()   drop wrong words or wait for manual correction
-     Strip punctuation and non-alphabetic content: CleanLines()  line to cleanLine
-     Drop words shorter than 3 chars, lowercase, remove stopwords: CleanWords()  words[] to cleanWords[]
-     Stemming:                      StemWords()   words to stemWords
-     Second cleaning pass:          run CleanWords() again to make the sentences cleaner
-     '''
-     def __init__(self):
-         print 'English token and stopwords remove...'
-     def FileRead(self, filePath):  # read the raw content
-         f = open(filePath)
-         raw = f.read()
-         return raw
-     def WriteResult(self, result, resultPath):
-         self.mkdir(str(resultPath).replace(str(resultPath).split('/')[-1], ''))
-         f = open(resultPath, "w")  # save the result to another file
-         f.write(str(result))
-         f.close()
-     def SenToken(self, raw):  # split into sentences
-         sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
-         sents = sent_tokenizer.tokenize(raw)
-         return sents
-     def POSTagger(self, sents):
-         taggedLine = [nltk.pos_tag(sent) for sent in sents]
-         return taggedLine
-     def WordTokener(self, sent):  # split a single sentence string into words
-         wordsInStr = nltk.word_tokenize(sent)
-         return wordsInStr
-     def WordCheck(self, words):  # spell check
-         d = enchant.Dict("en_US")
-         checkedWords = ()
-         for word in words:
-             if not d.check(word):
-                 print d.suggest(word)
-                 word = raw_input()  # type a manual correction
-             checkedWords = checkedWords + (word,)
-         return checkedWords
-     def CleanLines(self, line):
-         identify = string.maketrans('', '')
-         delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
-         # cleanLine = line.translate(identify, delEStr)  # delete ASCII punctuation and whitespace
-         cleanLine = line.translate(identify, delEStr)  # delete ASCII punctuation and digits
-         return cleanLine
-     def CleanWords(self, wordsInStr):  # lowercase, drop stopwords and words shorter than 3 characters
-         cleanWords = []
-         stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
-         for words in wordsInStr:
-             cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
-         return cleanWords
-     def StemWords(self, cleanWordsList):
-         stemWords = []
-         # porter = nltk.PorterStemmer()  # a PhD colleague said this stemmer does not work well, not very professional
-         # result = [porter.stem(t) for t in cleanTokens]
-         for words in cleanWordsList:
-             stemWords += [[wn.morphy(w) for w in words]]
-         return stemWords
-     def WordsToStr(self, stemWords):
-         strLine = []
-         for words in stemWords:
-             strLine += [w for w in words]
-         return strLine
-     def mkdir(self, path):
-         path = path.strip()       # strip leading/trailing whitespace
-         path = path.rstrip("\\")  # strip a trailing backslash
-         isExists = os.path.exists(path)  # does the path already exist?
-         if not isExists:
-             print path + ' created'
-             os.makedirs(path)     # create the directory
-             return True
-         else:
-             print path + ' already exists'  # the directory exists, do not create it again
-             return False
-     def EnPreMain(self, dir):
-         for root, dirs, files in os.walk(dir):
-             for eachfiles in files:
-                 croupPath = os.path.join(root, eachfiles)
-                 print croupPath
-                 resultPath = conf.PreConfig.NLTKRESULTPATH + croupPath.split('/')[-2] + '/' + croupPath.split('/')[-1]
-                 raw = self.FileRead(croupPath).strip()
-                 sents = self.SenToken(raw)
-                 # taggedLine = self.POSTagger(sents)  # POS tagging disabled for now
-                 cleanLines = [self.CleanLines(line) for line in sents]
-                 words = [self.WordTokener(cl) for cl in cleanLines]
-                 # checkedWords = self.WordCheck(words)  # spell check disabled for now
-                 cleanWords = self.CleanWords(words)
-                 stemWords = self.StemWords(cleanWords)
-                 # cleanWords = self.CleanWords(stemWords)  # second cleaning pass breaks (likely because wn.morphy can return None), disabled for now
-                 strLine = self.WordsToStr(stemWords)
-                 self.WriteResult(strLine, resultPath)  # each file is stored as a single line for now
-     def StandardTokener(self, raw):
-         result = ''
-         # not finished yet
-         return result
- enPre = EnPreprocess()
- enPre.EnPreMain(conf.PreConfig.ENCORUPPATH)
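The listing depends on a config module that the post does not show; conf.PreConfig.ENSTOPWORDS, NLTKRESULTPATH and ENCORUPPATH are only referenced. A hypothetical minimal config.py, purely to illustrate what those attributes might look like (all paths below are made up):
- # config.py -- hypothetical sketch; the real Config class is not shown in the post
- class Config:
-     class PreConfig:
-         ENCORUPPATH = './corpus/en/'             # root folder of the raw English corpus
-         ENSTOPWORDS = './stopwords/english.txt'  # stopword list, one word per line
-         NLTKRESULTPATH = './result/nltk/'        # where the preprocessed files are written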
PS: I still haven't managed to get the Stanford toolkit working properly; if anyone has used it, please show me how.