编辑距离
编辑距离是对两个字符串差异程度的量化度量,其值为将一个字符串变成另一个字符串所需的最少编辑操作次数。
编辑距离有几种不同的定义,区别在于允许对字符串进行哪些操作。
- 在莱文斯坦(Levenshtein)距离中,可以删除、插入、替换字符串中的任意一个字符
- Damerau-Levenshtein 距离是莱文斯坦距离的一种变体,额外允许相邻字符转置,例如【AB→BA】的距离是 1
- LCS(最长公共子序列)距离只允许删除、插入字符
- Jaro 距离只允许字符转置
- 汉明距离只允许替换字符(因此只适用于等长字符串)
遍历词库,全量计算编辑距离
from nltk.corpus import words  # English word list; the NLTK corpus has to be downloaded first
# Rebind `words` from the imported corpus object to a plain set of its entries;
# the rest of this script only does membership tests and iteration on it.
words = set(words.words())
print('词汇量:%d' % len(words))  # print the vocabulary size
def edit_distance(w1, w2):
    """Return the Levenshtein distance between ``w1`` and ``w2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.

    Only two rows of the dynamic-programming table are kept at any time, so
    the extra memory is proportional to the shorter string instead of to
    ``len(w1) * len(w2)``.

    Args:
        w1: First string.
        w2: Second string.

    Returns:
        The distance as a non-negative ``int``; 0 when the strings are equal.
    """
    # The distance is symmetric, so put the shorter string on the row axis
    # to keep each row as small as possible.
    if len(w1) < len(w2):
        w1, w2 = w2, w1

    # previous[j] is the distance between an empty prefix and w2[:j].
    previous = list(range(len(w2) + 1))
    for i, c1 in enumerate(w1, start=1):
        # Column 0: turning w1[:i] into the empty string takes i deletions.
        current = [i]
        for j, c2 in enumerate(w2, start=1):
            current.append(min(
                previous[j - 1] + (c1 != c2),  # match or substitute
                previous[j] + 1,               # delete c1
                current[j - 1] + 1,            # insert c2
            ))
        previous = current
    return previous[-1]
# Interactive loop: read a word, report whether it is in the vocabulary and,
# if not, list the vocabulary entries that are at most one edit away.
while True:
    w1 = input('输入:').strip()
    if w1 not in words:
        # Brute force: score the input against every word in the vocabulary.
        candidates = [w2 for w2 in words if edit_distance(w1, w2) <= 1]
        suggestion = ' '.join(candidates)
        print(suggestion)
    else:
        print('合法单词')

先生成指定编辑距离的词,再行匹配
from nltk.corpus import wordnet # 单词合法性判断,需要下载
def correct(w_set):
    """Return the valid words among ``w_set`` as one space-separated string.

    A candidate is kept when ``wordnet.synsets`` returns a non-empty result
    for it.

    Args:
        w_set: Iterable of candidate strings.

    Returns:
        The accepted candidates joined by single spaces, in iteration order;
        an empty string when none is accepted.
    """
    return ' '.join(w for w in w_set if wordnet.synsets(w))
# Alphabet used for inserted and substituted characters.
letters = 'abcdefghijklmnopqrstuvwxyz'


def edit_one(word):
    """Return the set of strings produced by one edit of ``word``.

    An edit is inserting one character from ``letters`` at any position,
    deleting one character, or replacing one character with a character
    from ``letters``. Replacing a character with itself is included, so a
    non-empty ``word`` is part of its own result.
    """
    # Cut the word at every position once; each kind of edit is then a
    # simple expression on the (head, tail) pair.
    splits = [(word[:k], word[k:]) for k in range(len(word) + 1)]
    inserted = {head + ch + tail for head, tail in splits for ch in letters}
    # Deleting or replacing needs a character at the cut, i.e. a non-empty tail.
    removed = {head + tail[1:] for head, tail in splits if tail}
    swapped = {head + ch + tail[1:]
               for head, tail in splits if tail
               for ch in letters}
    return inserted.union(removed, swapped)
def edit_two(word):
    """Return the set of strings reachable from ``word`` in at most two edits.

    The result is the union of ``edit_one(word)`` with ``edit_one(w)`` for
    every ``w`` in it, so it also contains the strings that are only one
    edit away, not just those at distance exactly two.
    """
    # Generate the first-level edits once and reuse them as both the seed
    # of the result and the sequence to expand.
    first = edit_one(word)
    # Accumulate into a copy: `first` is being iterated and must not grow.
    result = set(first)
    for w in first:
        result |= edit_one(w)
    return result
# Interactive loop: a word is accepted when WordNet has at least one synset
# for it; otherwise print the valid words among its two-edit neighbourhood.
while True:
    w1 = input('输入:').strip()
    if not wordnet.synsets(w1):
        suggestion = correct(edit_two(w1))
        print(suggestion)
    else:
        print('合法单词')

时间复杂度比较
from time import time
from nltk.corpus import words
# Rebind `words` from the imported corpus object to a plain set of its
# entries; both suggestion methods below test membership against this set.
words = set(words.words())
print('词汇量:%d' % len(words))  # print the vocabulary size
# Alphabet from which edit_one() draws inserted and substituted characters.
letters = 'abcdefghijklmnopqrstuvwxyz'
def timer(f, w):
    """Call ``f(w)``, print how long the call took, and return its result.

    The printed line is labelled with ``f``'s docstring. When ``f`` has no
    docstring its name is used instead, so the label never reads "None".

    Args:
        f: Callable taking a single argument.
        w: Argument passed to ``f``.

    Returns:
        Whatever ``f(w)`` returns.
    """
    # perf_counter is the clock intended for measuring short durations; the
    # wall clock from time() can jump when the system time is adjusted.
    from time import perf_counter

    label = f.__doc__ or getattr(f, '__name__', repr(f))
    start = perf_counter()
    result = f(w)
    elapsed = perf_counter() - start
    # The ANSI escapes colour the duration (SGR 33, yellow) and then reset.
    print('%s:\033[033m%.2f秒\033[0m' % (label, elapsed))
    return result
def edit_distance(w1, w2):
    """Return the Levenshtein distance between ``w1`` and ``w2``.

    Counts the minimum number of single-character insertions, deletions and
    substitutions that turn one string into the other.

    The dynamic-programming table is evaluated row by row and only the
    previous row is retained, so a call allocates memory proportional to
    the shorter string rather than a full ``len(w1) x len(w2)`` matrix.

    Args:
        w1: First string.
        w2: Second string.

    Returns:
        The distance as a non-negative ``int``; 0 when the strings are equal.
    """
    # Symmetric measure: iterate rows over the longer string so that each
    # row has the length of the shorter one.
    if len(w1) < len(w2):
        w1, w2 = w2, w1

    # Row 0: building w2[:j] from an empty prefix takes j insertions.
    previous = list(range(len(w2) + 1))
    for i, c1 in enumerate(w1, start=1):
        # Column 0: reducing w1[:i] to an empty string takes i deletions.
        current = [i]
        for j, c2 in enumerate(w2, start=1):
            substitute = previous[j - 1] + (c1 != c2)  # free when chars match
            delete = previous[j] + 1
            insert = current[j - 1] + 1
            current.append(min(substitute, delete, insert))
        previous = current
    return previous[-1]
def edit_one(word):
    """Return the set of strings produced by one edit of ``word``.

    An edit inserts one character from the module-level ``letters`` at any
    position, deletes one character, or replaces one character with a
    character from ``letters`` (replacing it with itself included).
    """
    variants = set()
    for pos in range(len(word) + 1):
        head, tail = word[:pos], word[pos:]
        for ch in letters:
            variants.add(head + ch + tail)          # insertion before `pos`
            if tail:
                variants.add(head + ch + tail[1:])  # substitution at `pos`
        if tail:
            variants.add(head + tail[1:])           # deletion at `pos`
    return variants
def edit_two(word):
    """Return the set of strings reachable from ``word`` in at most two edits.

    This is ``edit_one(word)`` together with ``edit_one(w)`` for each ``w``
    in it; strings that are a single edit away are therefore included as
    well as those at distance two.
    """
    # One call is enough: the first-level edits seed the result and are
    # also the strings expanded a second time.
    first = edit_one(word)
    # Grow a copy so the set being iterated keeps its size.
    result = set(first)
    for w in first:
        result |= edit_one(w)
    return result
def suggest_1(w1):
    """【方法1】遍历词库,返回编辑距离<=2的词"""
    # The docstring above is printed by timer() as this method's label, so it
    # is runtime text and stays verbatim.
    # Method 1: scan the whole vocabulary and keep every word whose edit
    # distance to `w1` is at most two.
    matches = set()
    for candidate in words:
        if edit_distance(w1, candidate) <= 2:
            matches.add(candidate)
    return matches
def suggest_2(word):
    """【方法2】生成编辑距离<=2的词后匹配词库"""
    # The docstring above is printed by timer() as this method's label, so it
    # is runtime text and stays verbatim.
    # Method 2: generate every string within two edits of `word`, then keep
    # the ones that are present in the vocabulary.
    generated = edit_two(word)
    return generated.intersection(words)
# Interactive loop: for a word missing from the vocabulary, run both
# suggestion methods through timer() and print what each one returns.
while True:
    w = input('输入:').strip()
    if w in words:
        print('合法单词')
        continue
    for method in (suggest_1, suggest_2):
        print(timer(method, w))

915

被折叠的 条评论
为什么被折叠?



