KMP前缀和BM后缀算法比较
KMP
def f(t):
    """Build the KMP ``next`` (failure) table for the pattern *t*.

    ``r[i]`` is the length of the longest proper prefix of ``t[:i]`` that is
    also a suffix of ``t[:i]``; ``r[0]`` is the sentinel ``-1``.

    Side effects:
        * Resets the module-level counter ``matchtimes`` to 0 and then
          increments it once per outer-loop step and once per fallback step.
        * Prints the finished table.

    Args:
        t: The non-empty pattern string.

    Returns:
        The ``next`` table as a list of ints with the same length as *t*.

    Raises:
        AssertionError: If *t* is the empty string.
    """
    global matchtimes
    assert t != '', 'pattern must not be empty'
    len_t = len(t)
    r = [-1] * len_t  # r[0] stays -1: nothing precedes the first character
    j = -1  # length of the currently matched border (-1 = "before start")
    i = 0
    matchtimes = 0
    while i < len_t - 1:  # each pass computes r[i + 1]
        matchtimes += 1
        # Fall back through shorter borders until one can be extended.
        while j >= 0 and t[i] != t[j]:
            matchtimes += 1
            j = r[j]
        j += 1
        i += 1
        # NOTE: an earlier `if t[i] == t[j]: r[i] = r[j]` store was always
        # overwritten by this assignment, so it was dead code and is removed.
        r[i] = j
    print(r)
    return r
def knuthMatch(t, p):
    """Find the first occurrence of pattern *p* in text *t* using KMP.

    The failure table is obtained from ``f(p)``, which also resets the
    module-level counter ``matchtimes`` and prints the table. This function
    then keeps incrementing ``matchtimes`` once per scan step, so after the
    call the counter holds table-building plus scanning steps.

    Prints the start index of the first match, or ``'No pattern'`` when *p*
    does not occur in *t*.

    Args:
        t: The text to search in.
        p: The non-empty pattern to search for.

    Returns:
        The start index of the first match, or ``-1`` when there is none.
    """
    global matchtimes
    next_table = f(p)  # named to avoid shadowing the builtin ``next``
    pattern_len = len(p)
    text_len = len(t)
    i, j = 0, 0  # i indexes the pattern, j indexes the text
    while i < pattern_len and j < text_len:
        matchtimes += 1
        if i == -1 or p[i] == t[j]:
            # Either the characters match, or the table told us to restart
            # the pattern at the next text character (i == -1).
            i += 1
            j += 1
        else:
            i = next_table[i]
    if i >= pattern_len:
        start = j - pattern_len
        print(start)
        return start
    print('No pattern')
    return -1
BM
def boyer_moore(text, pattern):
    """Find the first occurrence of *pattern* in *text* with a bad-character
    skip table (the simplified Boyer-Moore variant used in this file).

    The window is as wide as the pattern and is compared from its last
    character backwards. On a mismatch the window slides by the skip value of
    the text character under the window's last position, or by the full
    pattern length when that character has no entry in the table.

    Side effects:
        * Resets the module-level counter ``matchtimes`` to 0 and increments
          it once per table entry written, once per window position and once
          per character comparison.
        * Prints the skip table, and prints the match index when found.

    Args:
        text: The text to search in.
        pattern: The pattern to search for.

    Returns:
        The start index of the first match, or ``-1`` when there is none.
    """
    global matchtimes
    matchtimes = 0
    pattern_len = len(pattern)
    last = pattern_len - 1

    # Distance from each character's last occurrence to the pattern tail.
    # The final character is excluded so every stored shift is at least 1.
    skip = {}
    for idx in range(last):
        matchtimes += 1
        skip[pattern[idx]] = last - idx
    print(skip)

    i = last  # text index aligned with the last pattern character
    while i < len(text):
        matchtimes += 1
        match = True
        for j in range(pattern_len):  # compare right-to-left
            matchtimes += 1
            if text[i - j] != pattern[last - j]:
                match = False
                break
        if match:
            start = i - last
            print(start)
            return start
        # Characters absent from the table cannot overlap the pattern at all,
        # so the window jumps past them entirely.
        i += skip.get(text[i], pattern_len)
    return -1
比较
在每个while和for循环体内添加matchtimes计数,用来统计两种算法的比较次数。结果是BM的次数更少;在待匹配字符串的前面或后面再加长字符串,BM的表现也都更好。
[-1, 0, 0, 0, 1, 1, 2, 1, 2]是KMP的next数组。{'a': 2, 'b': 1, 'c': 6}是BM中各字符最后一次出现的位置(不含模式串末位)到模式串尾部的距离。
knuthMatch('aabcbabcaabcaababc','abcaababc')
[-1, 0, 0, 0, 1, 1, 2, 1, 2]
9
matchtimes
34
boyer_moore('aabcbabcaabcaababc','abcaababc')
{'a': 2, 'b': 1, 'c': 6}
9
matchtimes
27
<算法与数据结构>KMP
BM应用滑动窗口
窗口宽度为匹配字符串(模式串)的长度。操作时从窗口的末尾向前逐个比较,如果不匹配,窗口就向后滑动。滑动距离为窗口末尾所对的文本字符在模式串中到尾部的距离;如果该字符不在模式串中,则滑动整个模式串的长度。
后话
使用哈希的rabin_karp_matching匹配算法。
更新
1,添加滑动窗口。