字典数Trie和后缀数组suffix array是处理字符串操作的有力工具。
下面是ruby的实现:
class Trie
attr_accessor :value
def initialize
@children = Hash.new {|h,k| h[k] = Trie.new }
end
def []=(key,value)
insert(key,0,value)
key
end
def [](key)
get(key,0)
end
def each &block
_each &block
end
def keys
inject([]){|keys,(k,v)| keys << k; keys}
end
def values
inject([]){|values,(k,v)| values << v; values}
end
def each_key &block
keys.each &block
end
def each_value &block
values.each &block
end
def inspect(indent = 0)
buffer = ''
indent_str = ' ' * indent
buffer << indent_str + "value: #{value}\n" if value
return buffer unless @children.size > 0
@children.each do |k,c|
buffer << "#{indent_str}#{k} =>\n"
buffer << c.inspect(indent + 2)
end
buffer
end
protected
def _each(key='',&block)
block.call(key,value) if key != '' and value
@children.keys.sort.each do |k|
@children[k]._each(key + k, &block)
end
end
def insert(key,offset,value)
if offset == key.length - 1
@children[key[offset,1]].value = value
else
@children[key[offset,1]].insert(key,offset+1,value)
end
end
def get(key,offset)
if offset == key.length - 1
@children[key[offset,1]].value
else
return nil unless @children.hash_key?(key[offset,1])
@children[key[offset,1]].get(key,offset + 1)
end
end
end
t = Trie.new
t['cat'] = true
t['cab'] = true
t['cate'] = true
t['bob'] = true
p t
suffix array,我们使用简单的排序方法,算法复杂度取决于排序的复杂度。使用快排,平均复杂度可以达到O(N*logN),最坏复杂度为O(N*N)。没有使用更高效O(N*logN)的倍增法等构建。
class SuffixArray
def initialize(str)
@str = str
@suffix_array = Array.new
last_index = str.length - 1
(0..last_index).each do |i|
suffix = str[i..last_index]
position = i
@suffix_array << { :suffix => suffix, :position => position }
end
@suffix_array.sort! {|a,b| a[:suffix] <=> b[:suffix] }
end
def find_substr(substr)
high = @suffix_array.length - 1
low = 0
pos = -1
while( low <= high )
mid = (high + low ) / 2
suffix = @suffix_array[mid][:suffix]
pre_suffix = suffix[0...substr.length]
if(pre_suffix > substr)
high = mid - 1
elsif(pre_suffix < substr)
low = mid + 1
else
return @suffix_array[mid][:position]
end
end
return -1
end
end
sa = SuffixArray.new("The type of query that benefits most from caching")
puts sa.find_substr("query that")