使用pyhanlp包识别命名实体(机构名,地名)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pyhanlp import *
# Target part-of-speech (nature) tags to keep:
# nt: organization / group name
# ns: place name
# nsf: transliterated place name
part_of_speech_list = ['nt', 'ns', 'nsf']
def is_all_chinese(strs):
    """Return True if every character of ``strs`` is a Chinese character.

    A character counts as Chinese when its code point lies in the inclusive
    range U+4E00..U+9FA5.

    Args:
        strs: The string to inspect.

    Returns:
        bool: False as soon as one character falls outside the range, True
        otherwise. An empty string yields True because it contains no
        offending character.
    """
    # all() stops at the first non-Chinese character instead of scanning on.
    return all('\u4e00' <= ch <= '\u9fa5' for ch in strs)
def get_black_list():
    """Load the blacklisted words from the ``./blacklist`` file.

    The file is read as UTF-8 with one word per line. Surrounding whitespace
    is stripped, blank lines are skipped and duplicates are dropped while the
    order of first appearance is kept.

    Returns:
        list: The distinct, non-empty blacklisted words in file order.

    Raises:
        OSError: If ``./blacklist`` cannot be opened (e.g. it does not exist).
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, which
    # would garble or reject a Chinese word list.
    with open('./blacklist', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # dict.fromkeys de-duplicates in linear time and preserves order.
        return list(dict.fromkeys(word for word in stripped if word))
# Load the blacklist once at import time; entity_recognition checks candidate
# words against it.
black_list = get_black_list()
def entity_recognition(sentence):
    """Extract organization and place names from ``sentence`` with HanLP.

    The sentence is segmented with HanLP's ``NLPTokenizer``. A term is kept
    when it is at least two characters long, its nature tag is listed in
    ``part_of_speech_list``, it is not in ``black_list`` and it passes
    ``is_all_chinese``.

    Args:
        sentence: The text to analyse.

    Returns:
        str: The distinct entities in order of first appearance, joined with
        ``"|"``. An empty string is returned when nothing qualifies or when
        ``sentence`` is empty or None.
    """
    if not sentence:
        # Nothing to segment; skip the tokenizer call entirely.
        return ""

    NLPTokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    entity = []
    seen = set()  # O(1) duplicate check; ``entity`` keeps the output order
    for term in NLPTokenizer.segment(sentence):
        # NOTE(review): term.word may be a Java string proxy depending on how
        # the JVM bridge is configured -- confirm. str() is a no-op on a
        # Python str and mirrors what is already done for term.nature.
        word = str(term.word)
        nature = str(term.nature)
        if len(word) < 2 or nature not in part_of_speech_list:
            continue
        if word in black_list or word in seen:
            continue
        if not is_all_chinese(word):
            continue
        seen.add(word)
        entity.append(word)
    return "|".join(entity)
if __name__ == "__main__":
    # Demo: run the recognizer on a sample news sentence and print the
    # "|"-joined entities.
    text = '北约多国海军参加的“海上微风2021”演习已经开始,不过公开信息显示,美国派出的“罗斯”号导弹驱逐舰还窝在乌克兰港口,而英国士兵则在社交媒体上晒乌克兰方面供应的单调饮食,似乎颇有不满。'
    tags = entity_recognition(text)
    print(tags)
991

被折叠的 条评论
为什么被折叠?



