1.3.1 使用正则表达式替换单词
# 创建 replacers.py 文件,供后面的示例导入并调用
import re
# Contraction-expansion rules as (regex, replacement template) pairs.
# They are applied in list order, so the irregular forms come first: otherwise
# the generic "n't" rule would turn "can't" into "ca not" and "won't" into
# "wo not". The irregular rules capture the first letter so that a
# sentence-initial capital survives ("Can't" -> "Cannot"); all-caps input is
# not special-cased. Every template is a raw string because "\g" is not a
# valid escape sequence in an ordinary string literal.
# Note that "'s" is always expanded to "is" (possessives included) and "'d" is
# always expanded to "would".
replacement_patterns = [
    (r"\b([Ww])on't\b", r"\g<1>ill not"),
    (r"\b([Cc])an't\b", r"\g<1>annot"),
    (r"\b([Ii])'m\b", r"\g<1> am"),
    (r"\bain't\b", "is not"),
    (r"\bAin't\b", "Is not"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
]


class RegexpReplacer(object):
    """Rewrite text by applying an ordered list of regex substitutions."""

    def __init__(self, patterns=replacement_patterns):
        """Compile the substitution rules.

        Args:
            patterns: iterable of ``(regex, replacement)`` pairs; defaults to
                the module-level ``replacement_patterns``. The iterable is
                only read, never modified.
        """
        self.patterns = [
            (re.compile(regex), repl) for regex, repl in patterns
        ]

    def replace(self, text):
        """Return ``text`` after applying every rule, in order.

        Each rule operates on the output of the previous one, so the order of
        ``self.patterns`` matters.
        """
        for pattern, repl in self.patterns:
            text = pattern.sub(repl, text)
        return text
1.3.2 用其他文本替换文本的示例
# 调用替换方法
# Don't -> Do not
# (r'(\w+)n\'t', '\g<1> not')
import nltk
from replacers import RegexpReplacer
def main1():
    """Print a sample sentence with its contraction expanded."""
    sentence = "Don't hesitate to ask questions"
    expanded = RegexpReplacer().replace(sentence)
    print(expanded)


main1()
# 执行结果
Do not hesitate to ask questions
1.3.3 在执行切分前先执行替换操作
# 标识符替换操作可以在切分前执行,以避免在切分缩略词的过程中出现问题
import nltk
from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer
def main2():
    """Print the tokens of a sample sentence before and after replacement.

    The first line printed tokenizes the raw sentence; the second tokenizes
    the sentence after it has been passed through RegexpReplacer.
    """
    replacer = RegexpReplacer()
    sentence = "Don't hesitate to ask questions"
    raw_tokens = word_tokenize(sentence)
    print(raw_tokens)
    expanded_tokens = word_tokenize(replacer.replace(sentence))
    print(expanded_tokens)


main2()
# 执行结果
['Do', "n't", 'hesitate', 'to', 'ask', 'questions']
['Do', 'not', 'hesitate', 'to', 'ask', 'questions']
1.3.4 去除重复字符
# 使用反向引用的方法来消除重复的字符
# 定义一个重复字符替换类
class RepeatReplacer(object):
    """Collapse runs of a repeated character, e.g. 'lotttt' -> 'lot'.

    Without a word check, every doubled character is removed, so legitimate
    double letters are lost as well ('happy' -> 'hapy'). Pass
    ``is_known_word`` to stop as soon as the candidate is a valid word.
    """

    def __init__(self, is_known_word=None):
        """Set up the back-reference pattern.

        Args:
            is_known_word: optional callable taking a word and returning a
                truthy value when the word should be left as it is (for
                example a dictionary or lexicon lookup). ``None`` (default)
                disables the check.
        """
        # Group 2 followed by \2 matches one character repeated twice; the
        # replacement keeps everything except the second copy.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.is_known_word = is_known_word

    def replace(self, word):
        """Return ``word`` with repeated characters removed.

        One duplicate is dropped per pass; passes repeat until the word stops
        changing or ``is_known_word`` accepts the current candidate.
        """
        # Iterative rather than recursive so very long runs of a repeated
        # character cannot exhaust the recursion limit.
        while True:
            if self.is_known_word is not None and self.is_known_word(word):
                return word
            reduced = self.repeat_regexp.sub(self.repl, word)
            if reduced == word:
                return word
            word = reduced
1.3.5 去除重复字符的示例
import nltk
from replacers import RepeatReplacer
def main3():
    """Print the result of RepeatReplacer.replace for a few sample words."""
    replacer = RepeatReplacer()
    for sample in ('lotttt', 'ohhhhh', 'ooohhhhh', 'happy'):
        print(replacer.replace(sample))


main3()
# 执行结果(1.3.4 中的 RepeatReplacer 不做词典检查,会去掉所有连续重复的字符,
# 因此 'ooohhhhh' 得到 'oh','happy' 得到 'hapy')
lot
oh
oh
hapy
1.3.6 用单词的同义词替换
# 定义一个同义词替换类
class WordReplacer(object):
    """Swap a word for its mapped replacement when one is configured."""

    def __init__(self, word_map):
        """Store the word -> replacement mapping (kept by reference)."""
        self.word_map = word_map

    def replace(self, word):
        """Return the entry for ``word`` in ``word_map``, or ``word`` itself
        when the mapping has no entry for it."""
        lookup = self.word_map.get
        return lookup(word, word)
1.3.7 用单词的同义词替换的示例
# 如果 word_map 中给出了某个单词的同义词,则该单词会被替换为同义词;否则原样返回该单词。
import nltk
from replacers import WordReplacer
def main4():
    """Print the replacement for one mapped word and one unmapped word."""
    replacer = WordReplacer({'congrats': 'congratulations'})
    for word in ('congrats', 'maths'):
        print(replacer.replace(word))


main4()
# 执行结果
congratulations
maths