分词词典:
import os
# Forward maximum matching (MM) word segmentation.
def MM(text, dic_path=r'F:\大三下\自然语言处理\chapter-4\dic.utf8'):
    """Segment *text* using the forward maximum matching method.

    Scanning left to right, the longest dictionary word starting at the
    current position is cut off; when no multi-character word matches, the
    single character at that position is emitted as its own token (whether
    or not it is in the dictionary).

    Args:
        text: The string to segment.
        dic_path: Path to the dictionary file, one word per line. Blank
            lines are ignored.

    Returns:
        The list of tokens in text order. An empty *text* gives ``[]``.

    Raises:
        OSError: If the dictionary file cannot be opened.
    """
    # 'utf_8_sig' strips the BOM that Windows editors put at the start of
    # UTF-8 files; otherwise it would be glued onto the first word.
    with open(dic_path, 'r', encoding='utf_8_sig') as dic_file:
        stripped_lines = (line.strip() for line in dic_file)
        # A set removes duplicates and gives O(1) membership tests.
        dictionary = {word for word in stripped_lines if word}

    # Longest dictionary word; fall back to 1 so an empty dictionary simply
    # yields single-character tokens instead of raising.
    max_length = max(map(len, dictionary), default=1)

    cut_list = []
    start = 0
    text_length = len(text)
    while start < text_length:
        longest = min(max_length, text_length - start)
        # Try the widest window first and shrink it one character at a time.
        for size in range(longest, 1, -1):
            candidate = text[start:start + size]
            if candidate in dictionary:
                break
        else:
            # Nothing longer than one character matched: emit that character
            # so the scan always advances (prevents an endless loop).
            candidate = text[start]
        cut_list.append(candidate)
        start += len(candidate)
    return cut_list
# Demo: run forward maximum matching on a sample sentence.
# The returned token list is not stored or printed by this statement.
MM("北京市民办高中")
分词结果:
import os
# Reverse (backward) maximum matching (RMM) word segmentation.
def RMM(text, dic_path=r'F:\大三下\自然语言处理\chapter-4\dic.utf8'):
    """Segment *text* using the reverse maximum matching method.

    Scanning right to left, the longest dictionary word ending at the
    current position is cut off; when no multi-character word matches, the
    single character at that position is emitted as its own token (whether
    or not it is in the dictionary).

    Args:
        text: The string to segment.
        dic_path: Path to the dictionary file, one word per line. Blank
            lines are ignored.

    Returns:
        The list of tokens in text order (left to right). An empty *text*
        gives ``[]``.

    Raises:
        OSError: If the dictionary file cannot be opened.
    """
    # 'utf_8_sig' strips the BOM that Windows editors put at the start of
    # UTF-8 files; otherwise it would be glued onto the first word.
    with open(dic_path, 'r', encoding='utf_8_sig') as dic_file:
        stripped_lines = (line.strip() for line in dic_file)
        # A set removes duplicates and gives O(1) membership tests.
        dictionary = {word for word in stripped_lines if word}

    # Longest dictionary word; fall back to 1 so an empty dictionary simply
    # yields single-character tokens instead of raising.
    max_length = max(map(len, dictionary), default=1)

    cut_list = []
    end = len(text)
    while end > 0:
        # Try the widest window that fits, shrinking from the left edge.
        for size in range(min(max_length, end), 1, -1):
            candidate = text[end - size:end]
            if candidate in dictionary:
                break
        else:
            # Nothing longer than one character matched: take the single
            # character just before `end`.
            candidate = text[end - 1:end]
        cut_list.append(candidate)
        end -= len(candidate)

    # Tokens were collected right to left; restore reading order.
    cut_list.reverse()
    return cut_list
# Demo: run reverse maximum matching on a sample sentence.
# The returned token list is not stored or printed by this statement.
RMM('北京市的民办高中')
分词结果:
调用以上函数实现双向匹配:
def BMM(text):
    """Print the bidirectional maximum matching segmentation of *text*.

    Runs both the forward (MM) and reverse (RMM) segmenters on *text*,
    prints each result, then prints the chosen one under the 'BMM:' label.
    The segmentation with fewer tokens is chosen; on a tie the forward
    result is used.

    Args:
        text: The string to segment.

    Returns:
        None. The results are only printed.
    """
    # Segment once per direction; each call reloads the dictionary file.
    forward = MM(text)
    backward = RMM(text)
    print("MM: ", forward)
    print("RMM: ", backward)

    # A single comparison replaces the former `while` loop, whose condition
    # never changed and therefore could spin forever.
    chosen = forward if len(forward) <= len(backward) else backward
    print('BMM:', chosen)
# Demo: print the MM, RMM and bidirectional results for a sample sentence.
BMM("北京市民办高中")
分词结果: