1. Get the raw text content
- def FileRead(self, filePath):
-     f = open(filePath)
-     raw = f.read()
-     return raw
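If the corpus files are not plain ASCII, a small hedged variant using codecs.open (Python 2) reads them as UTF-8 explicitly instead of relying on the sys.setdefaultencoding hack used later in the full listing; the assumption that the files are UTF-8 is mine, not the post's:
- import codecs
- def file_read_utf8(filePath):
-     # read the file with an explicit encoding instead of the interpreter default
-     f = codecs.open(filePath, 'r', encoding='utf-8')
-     raw = f.read()
-     f.close()
-     return raw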
2. Split the text into sentences
- def SenToken(self, raw):  # split the raw text into sentences
-     sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
-     sents = sent_tokenizer.tokenize(raw)
-     return sents
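A minimal usage sketch of the punkt sentence tokenizer on a made-up sample string; punkt is abbreviation-aware, so tokens like "Dr." do not end a sentence:
- import nltk
- sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
- sample = "Dr. Smith went to Washington. He arrived early and left late."
- print sent_tokenizer.tokenize(sample)
- # roughly: ['Dr. Smith went to Washington.', 'He arrived early and left late.']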
3. Clean each sentence: remove digits, punctuation, and other non-alphabetic characters
- def CleanLines(self, line):  # Python 2 str.translate with a deletechars argument
-     identify = string.maketrans('', '')
-     delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
-     cleanLine = line.translate(identify, delEStr)  # delete ASCII punctuation and digits
-     return cleanLine
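The two-argument form of str.translate used above only exists in Python 2. A rough Python 3 equivalent, assuming the same goal of deleting punctuation and digits, would build the table with the three-argument str.maketrans:
- import string
- def clean_line_py3(line):
-     # map nothing, replace nothing, delete punctuation and digits
-     table = str.maketrans('', '', string.punctuation + string.digits)
-     return line.translate(table)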
4. POS tagging with nltk.pos_tag
- def POSTagger(self, sents):  # note: nltk.pos_tag expects a list of tokens per sentence
-     taggedLine = [nltk.pos_tag(sent) for sent in sents]
-     return taggedLine
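For reference, nltk.pos_tag takes a list of tokens and returns (token, tag) pairs; a quick sketch, with the output hedged because the exact tags depend on the tagger model shipped with NLTK:
- import nltk
- tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")
- print nltk.pos_tag(tokens)
- # roughly: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ...]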
5. Tokenization with nltk.word_tokenize
- def WordTokener(self, sent):  # split a single sentence string into words
-     wordsInStr = nltk.word_tokenize(sent)
-     return wordsInStr
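A small sketch of what word_tokenize returns for one cleaned sentence:
- import nltk
- print nltk.word_tokenize("Natural language processing with NLTK is fun")
- # ['Natural', 'language', 'processing', 'with', 'NLTK', 'is', 'fun']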
6. Spell checking with enchant
- def WordCheck(self, words):  # spell check: show suggestions and let the user correct by hand
-     d = enchant.Dict("en_US")
-     checkedWords = ()
-     for word in words:
-         if not d.check(word):
-             print d.suggest(word)
-             word = raw_input()  # type a replacement (or re-enter the word) by hand
-         checkedWords = checkedWords + (word,)
-     return checkedWords
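A quick sketch of the pyenchant calls used above: check returns a boolean and suggest returns a list of candidate corrections (the exact suggestions depend on the installed dictionary):
- import enchant
- d = enchant.Dict("en_US")
- print d.check("Hello")   # True
- print d.check("Helo")    # False
- print d.suggest("Helo")  # e.g. ['Hello', 'Helot', ...]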
7. Remove stopwords, lowercase, and drop short words
- def CleanWords(self, wordsInStr):  # lowercase, drop stopwords and words shorter than 3 characters
-     cleanWords = []
-     stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
-     for words in wordsInStr:
-         cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
-     return cleanWords
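A self-contained sketch of the same filtering logic; the stopword set here is inlined for illustration only, since the real list is loaded from conf.PreConfig.ENSTOPWORDS, which the post does not show:
- # inline stopword set for illustration; the real list comes from a file, one word per line
- stopwords = {}.fromkeys(['the', 'is', 'at', 'of', 'on', 'and'])
- words = [['The', 'cat', 'sat', 'on', 'the', 'mat'], ['NLTK', 'is', 'useful']]
- cleanWords = [[w.lower() for w in sent if w.lower() not in stopwords and 3 <= len(w)]
-               for sent in words]
- print cleanWords  # [['cat', 'sat', 'mat'], ['nltk', 'useful']]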
8. Stemming with WordNet
- def StemWords(self, cleanWordsList):
-     stemWords = []
-     # porter = nltk.PorterStemmer()  # a PhD colleague said this stemmer does not work well, not very professional
-     # result = [porter.stem(t) for t in cleanTokens]
-     for words in cleanWordsList:
-         stemWords += [[wn.morphy(w) for w in words]]
-     return stemWords
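wn.morphy maps a word to its WordNet base form and returns None for words it does not know, which matters if the result is fed into further cleaning; a short sketch, including a fallback that keeps the original word:
- from nltk.corpus import wordnet as wn
- print wn.morphy('dogs')      # 'dog'
- print wn.morphy('churches')  # 'church'
- print wn.morphy('asdfg')     # None -- unknown words come back as None
- # keeping the original word as a fallback avoids None entries:
- words = ['dogs', 'churches', 'asdfg']
- print [wn.morphy(w) or w for w in words]  # ['dog', 'church', 'asdfg']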
9. Complete code
- #coding=utf-8
- '''
- Created on 2014-3-20
- English stemming and stopword removal
- @author: liTC
- '''
- import nltk
- # import enchant
- import string
- import re
- import os
- from config import Config as conf
- from nltk.corpus import wordnet as wn
- import sys
- reload(sys)
- sys.setdefaultencoding('utf-8')
- class EnPreprocess:
-     '''Overall pipeline:
-     Read the file:                 FileRead()    filepath to raw
-     Split into sentences:          SenToken()    raw to sents
-     (POS tagging):                 POSTagger()   sents to tagged sentences
-     Split a sentence into words:   WordTokener() sent to words[]
-     (Spell check):                 WordCheck()   drop wrong words or wait for manual correction
-     Strip punctuation and non-alphabetic content: CleanLines()  line to cleanLine
-     Drop words shorter than 3 chars, lowercase, remove stopwords: CleanWords()  words[] to cleanWords[]
-     Stemming:                      StemWords()   words to stemWords
-     Second cleaning pass:          run CleanWords() again to make the sentences cleaner
-     '''
-     def __init__(self):
-         print 'English token and stopwords remove...'
-     def FileRead(self, filePath):  # read the raw content
-         f = open(filePath)
-         raw = f.read()
-         return raw
-     def WriteResult(self, result, resultPath):
-         self.mkdir(str(resultPath).replace(str(resultPath).split('/')[-1], ''))
-         f = open(resultPath, "w")  # save the result to another file
-         f.write(str(result))
-         f.close()
-     def SenToken(self, raw):  # split into sentences
-         sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
-         sents = sent_tokenizer.tokenize(raw)
-         return sents
-     def POSTagger(self, sents):
-         taggedLine = [nltk.pos_tag(sent) for sent in sents]
-         return taggedLine
-     def WordTokener(self, sent):  # split a single sentence string into words
-         wordsInStr = nltk.word_tokenize(sent)
-         return wordsInStr
-     def WordCheck(self, words):  # spell check
-         d = enchant.Dict("en_US")
-         checkedWords = ()
-         for word in words:
-             if not d.check(word):
-                 print d.suggest(word)
-                 word = raw_input()  # type a manual correction
-             checkedWords = checkedWords + (word,)
-         return checkedWords
-     def CleanLines(self, line):
-         identify = string.maketrans('', '')
-         delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
-         # cleanLine = line.translate(identify, delEStr)  # delete ASCII punctuation and whitespace
-         cleanLine = line.translate(identify, delEStr)  # delete ASCII punctuation and digits
-         return cleanLine
-     def CleanWords(self, wordsInStr):  # lowercase, drop stopwords and words shorter than 3 characters
-         cleanWords = []
-         stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
-         for words in wordsInStr:
-             cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
-         return cleanWords
-     def StemWords(self, cleanWordsList):
-         stemWords = []
-         # porter = nltk.PorterStemmer()  # a PhD colleague said this stemmer does not work well, not very professional
-         # result = [porter.stem(t) for t in cleanTokens]
-         for words in cleanWordsList:
-             stemWords += [[wn.morphy(w) for w in words]]
-         return stemWords
-     def WordsToStr(self, stemWords):
-         strLine = []
-         for words in stemWords:
-             strLine += [w for w in words]
-         return strLine
-     def mkdir(self, path):
-         path = path.strip()       # strip leading/trailing whitespace
-         path = path.rstrip("\\")  # strip a trailing backslash
-         isExists = os.path.exists(path)  # does the path already exist?
-         if not isExists:
-             print path + ' created'
-             os.makedirs(path)     # create the directory
-             return True
-         else:
-             print path + ' already exists'  # the directory exists, do not create it again
-             return False
-     def EnPreMain(self, dir):
-         for root, dirs, files in os.walk(dir):
-             for eachfiles in files:
-                 croupPath = os.path.join(root, eachfiles)
-                 print croupPath
-                 resultPath = conf.PreConfig.NLTKRESULTPATH + croupPath.split('/')[-2] + '/' + croupPath.split('/')[-1]
-                 raw = self.FileRead(croupPath).strip()
-                 sents = self.SenToken(raw)
-                 # taggedLine = self.POSTagger(sents)  # POS tagging disabled for now
-                 cleanLines = [self.CleanLines(line) for line in sents]
-                 words = [self.WordTokener(cl) for cl in cleanLines]
-                 # checkedWords = self.WordCheck(words)  # spell check disabled for now
-                 cleanWords = self.CleanWords(words)
-                 stemWords = self.StemWords(cleanWords)
-                 # cleanWords = self.CleanWords(stemWords)  # second cleaning pass breaks (likely because wn.morphy can return None), disabled for now
-                 strLine = self.WordsToStr(stemWords)
-                 self.WriteResult(strLine, resultPath)  # each file is stored as a single line for now
-     def StandardTokener(self, raw):
-         result = ''
-         # not finished yet
-         return result
- enPre = EnPreprocess()
- enPre.EnPreMain(conf.PreConfig.ENCORUPPATH)
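The listing depends on a config module that the post does not show; conf.PreConfig.ENSTOPWORDS, NLTKRESULTPATH and ENCORUPPATH are only referenced. A hypothetical minimal config.py, purely to illustrate what those attributes might look like (all paths below are made up):
- # config.py -- hypothetical sketch; the real Config class is not shown in the post
- class Config:
-     class PreConfig:
-         ENCORUPPATH = './corpus/en/'             # root folder of the raw English corpus
-         ENSTOPWORDS = './stopwords/english.txt'  # stopword list, one word per line
-         NLTKRESULTPATH = './result/nltk/'        # where the preprocessed files are written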
PS: I still haven't managed to get the Stanford toolkit working properly; if anyone has used it, please show me how.