import jieba
VERBOSE = False
class Tree:
    def __init__(self, entity_tree_list, suffix_num=2):
self.words = []
self.words_idx = {}
self.tnext = [{}]
self.tword = [[]]
        self.add_words(entity_tree_list, suffix_num)
self.stopwords = {"的", "是", "有", "多少", "哪些", "和", "什么", "谁", "这"}
def add_words(self, words, suffix_num=2):
'''
        Insert each word (and up to suffix_num of its suffixes) into the trie.
        :param words: list of entity strings to index
        :param suffix_num: number of suffixes to insert per word; 0 means the first
            character of a fuzzy query must match the entity's first character,
            1 means it may match within the entity's first two characters, and so on
        :return: None
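
        Example (a minimal doctest with a hypothetical entity name):
            >>> t = Tree(['平安银行'])
            >>> t.words_idx
            {'平安银行': 0}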
'''
ii = 0
for w in words:
ii += 1
            if VERBOSE and ii % 10000 == 0: print('Added %d words' % ii)
            wid = len(self.words)  # index of this word in self.words
            self.words.append(w)
            self.words_idx[w] = wid
            # Insert the word and up to suffix_num of its suffixes, so a query
            # missing a short prefix can still reach the word in the trie.
            for offset in range(min(suffix_num + 1, len(w))):
                z = 0
                for c in w[offset:]:
                    if c not in self.tnext[z]:
                        self.tnext[z][c] = len(self.tword)
                        self.tword.append([])
                        self.tnext.append({})
                    z = self.tnext[z][c]
                self.tword[z].append((wid, offset))
if VERBOSE: print('Added %d words, %d nodes' % (ii, len(self.tword)))
def approx(self, word, rep_pun=1, del_pun=0.3, add_pun=1, pun_limit=2, thres=0.4, order_pun=0.01, approx_flag=True):
'''
        Approximate word matching against the trie.
        :param word: query word to match
        :param rep_pun: penalty for substituting a character
        :param del_pun: penalty for a character of the entity missing from the query
        :param add_pun: penalty for an extra character in the query
        :param pun_limit: upper bound on the accumulated penalty
        :param thres: edit-distance similarity threshold
        :param order_pun: order penalty; errors near the start of the word cost more
        :param approx_flag: if False, assume the first character is correct; faster,
            but misses matches with a lost prefix, so it should be combined with
            suffix indexing (suffix_num > 0)
        :return: list of (matched word, edit-distance similarity score) pairs,
            sorted by descending score
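
        Example (a minimal sketch with a hypothetical entity; the query below
        substitutes one character, so the score is below 1):
            >>> t = Tree(['贵州茅台'])
            >>> t.approx('贵州矛台')  # doctest: +SKIP
            [('贵州茅台', 0.745)]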
'''
def push(q, arrv_dict, n, l, p):
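            # Enqueue state (trie node n, query position l, penalty p) only if it
            # beats the best-known penalty for that (node, position) pair.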
if arrv_dict.get((n, l), pun_limit + 1e-6) > p:
q.append((n, l, p))
arrv_dict[(n, l)] = p
if pun_limit >= len(word): pun_limit = len(word) - 1
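        # BFS over (trie node, query position, accumulated penalty) states,
        # seeded at the trie root with zero penalty.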
qh = 0
w = word
q = [(0, 0, 0)]
obj = {}
arrv_dict = {}
ll = len(w)
while qh < len(q):
z, i, b = q[qh]
qh += 1
if b > pun_limit: continue
            if i >= len(w):
                # The whole query is consumed: every word stored at this trie
                # node is a candidate; charge del_pun for the skipped prefix
                # without corrupting b for the transitions below.
                for tw, offset in self.tword[z]:
                    mw = self.words[tw]
                    bb = b + offset * del_pun
                    s = 1 - bb / max(len(w), len(mw))
                    if s > thres and s > obj.get(mw, 0):
                        obj[mw] = s
            c = w[i] if i < len(w) else None
            nxt = self.tnext[z].get(c, -1)
            if nxt >= 0: push(q, arrv_dict, nxt, i + 1, b)  # exact character match
            if approx_flag:
                for ch, nx in self.tnext[z].items():
                    # Deletion: the entity has a character the query lacks.
                    push(q, arrv_dict, nx, i, b + del_pun + order_pun * max(ll - i, 0))
                    # Substitution: the query character differs from the entity's.
                    if c != ch: push(q, arrv_dict, nx, i + 1, b + rep_pun + order_pun * max(ll - i, 0))
                # Insertion: the query has an extra character the entity lacks.
                push(q, arrv_dict, z, i + 1, b + add_pun + order_pun * max(ll - i, 0))
            # Only the very first character is pinned when approx_flag is False;
            # fuzzy transitions are enabled for every state after the root.
            approx_flag = True
ret = sorted(obj.items(), key=lambda x: -x[1])
if VERBOSE: print(word, qh, ret[:10])
return ret
    def fuzzy_search(self, Q, match_word_num=5, min_len=4, blacklist=frozenset(), hmm=True, **fuzzy_params):
'''
        Fuzzy entity search over a piece of text.
        :param Q: text to match, either a raw string or a pre-segmented word list
        :param match_word_num: maximum number of segmented words a mention may span
        :param min_len: minimum character length of a candidate mention
        :param blacklist: candidate substrings to skip
        :param hmm: set to False for finer-grained jieba segmentation; if so,
            consider raising match_word_num to at least 6
        :param fuzzy_params: extra keyword arguments forwarded to approx()
        :return: dict mapping each matched substring of Q to its (word, score) list
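
        Example (hypothetical entity and query; jieba's segmentation may vary):
            >>> t = Tree(['贵州茅台'])
            >>> t.fuzzy_search('贵州矛台的股价是多少')  # doctest: +SKIP
            {'贵州矛台': [('贵州茅台', 0.745)]}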
'''
        ss = jieba.lcut(Q, HMM=hmm) if isinstance(Q, str) else Q
ret = {}
overlaps = 0
for i in range(len(ss)):
if ss[i] not in self.stopwords:
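                # Try the longest span first and shrink; stop once the span end
                # falls inside a region that already produced a match.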
for j in range(min(i + match_word_num, len(ss)), i, -1):
if j <= overlaps: break
if ss[j - 1] in self.stopwords: continue
subs = "".join(ss[i:j])
if len(subs) < min_len or subs in blacklist or subs.startswith("基金"): continue
res = self.approx(subs, approx_flag=False, **fuzzy_params)
r = [(m, s) for m, s in res if m in self.words_idx]
if r:
ret[subs] = r
overlaps = j
return ret
    def exact_search(self, Q, longest_mention=20):
        '''
        Exact entity search: enumerate substrings of Q (never splitting an ASCII
        alphanumeric run) and return those present in the word index.
        '''
        splits = [0]
        lasteng = False
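        # Split at every character boundary, except inside a run of ASCII
        # letters/digits, which must stay together as one token.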
for ii in range(len(Q)):
if 'a' <= Q[ii] <= 'z' or '0' <= Q[ii] <= '9' or 'A' <= Q[ii] <= 'Z':
if lasteng: continue
lasteng = True
else:
lasteng = False
if ii > 0:
splits.append(ii)
splits.append(len(Q))
ret = set()
upper = longest_mention + 1
for ik, ii in enumerate(splits):
for jk in range(ik + 1, ik + upper):
if jk >= len(splits): break
jj = splits[jk]
subs = Q[ii:jj]
                # Cap mention length in characters; an ASCII run can make a span
                # of few split points arbitrarily long.
                if len(subs) > longest_mention or len(subs) < 1: continue
ret.add(subs)
return [r for r in ret if r in self.words_idx]
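
if __name__ == '__main__':
    # Minimal usage sketch. The entity names and queries below are hypothetical
    # examples, not data shipped with this module.
    tree = Tree(['贵州茅台', '平安银行', '宁德时代'])
    print(tree.exact_search('贵州茅台和平安银行的市值'))  # exact mentions found as substrings
    print(tree.fuzzy_search('贵州矛台的股价是多少'))  # tolerates one substituted character
    print(tree.approx('平安很行'))  # direct fuzzy lookup against the trie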