# lemmaDict = IndexBuilder(r'E:\mdx-test\牛津高阶英语学习词典第八版.mdx') # 108749, 貌似不错,返回 ['@@@LINK=fence\r\n']
# def lemmatize(word):
# lowercasedWord = word.lower()
# dictResult = mdict.mdx_lookup(lowercasedWord) # 对于牛津词典,直接查找首字母大写的 Resisting 是查不到的,所以要先转成小写;
# # print(dictResult)
# # print(len(dictResult))
# # print('dictResult------------------', word, mdict._mdx_file, repr(dictResult))
# if dictResult == [] or len(dictResult) == 2: # dictResult 为 [] 的情况:拼写错误的单词(如 intuitiv6e、confpiracy)或词组(如 teaching posts);len == 2 的情况:比如 wanted,既可以是一个独立单词(受通缉的),也可以是 want 的过去式,目前选择前者;integrated 之类的词也会返回重定向和释义(释义在后)这两项
# lemma = None # 因为大多数单词都是小写形式,所以转成小写形式返回
# elif '@@@LINK=' in dictResult[0]: # positioned、fenced
# # print('-----hunspell------>', hunSpell.stem(newWord.lower()))
# lemma = dictResult[0][8:-1].rstrip() # 有的词典的末尾是\n(牛津高阶英语学习词典第八版.mdx),有的是\r\n(CollinsCOBUILDOverhaul+V+2-00.mdx),所以统一strip
# # print(word, '-----ox------>', stem)
# else: # 返回释义的情况;如good
# lemma = None
# return lemma
早期的牛津lemmatize
最新推荐文章于 2022-04-07 22:17:05 发布