词性标注
import jieba.posseg as psg  # jieba's combined segmentation + POS-tagging module

# Demo: segment a sentence and print each word with its part-of-speech tag.
sentence = "去森林公园爬山。"
for word, flag in psg.cut(sentence):
    print(word, "/", flag)
命名实体识别
构建训练集和测试集:
import os
def corpus(corpus_path):
    """Build CRF++ training/test data from a POS-tagged People's Daily file.

    Each input line is one sentence of "word/pos" tokens; the leading token
    (a date/id such as "19980101-01-001-001/m") is skipped.  Every character
    is emitted as one line "char SEG NE", where SEG is the B/M/E/S word-
    segmentation label and NE is 'B'/'I' for characters of a person name
    (pos tag "nr") or 'O' otherwise.  Sentences are separated by blank
    lines.  Every 5th sentence goes to test.data (a 20% sample); the rest
    go to train.data, both created in the current working directory.

    :param corpus_path: path to the tagged People's Daily corpus file.
    """
    data = open(corpus_path, encoding='utf-8')          # People's Daily corpus
    train = open("train.data", 'w+', encoding='utf-8')  # training set
    test = open("test.data", 'w+', encoding='utf-8')    # test set
    try:
        for pos, line in enumerate(data):
            # Sample every 5th sentence (20%) into the test set.
            save = test if pos % 5 == 0 else train
            for word in line.split()[1:]:
                is_name = word.endswith("/nr")  # person name, e.g. 袁隆平/nr
                # Strip the "/pos" tag for ALL words so only surface
                # characters are written (previously non-name words kept
                # '/', 'v', etc. as training characters).
                chars = word.rsplit('/', 1)[0]
                if not chars:
                    continue
                ne_b, ne_i = ('B', 'I') if is_name else ('O', 'O')
                if len(chars) == 1:
                    # Single-character word, e.g. 信
                    save.write(chars + ' ' + 'S' + ' ' + ne_b + '\n')
                else:
                    # Multi-character word, e.g. 袁隆平 -> 袁 B B / 隆 M I / 平 E I
                    save.write(chars[0] + ' ' + 'B' + ' ' + ne_b + '\n')
                    for ch in chars[1:-1]:
                        save.write(ch + ' ' + 'M' + ' ' + ne_i + '\n')
                    save.write(chars[-1] + ' ' + 'E' + ' ' + ne_i + '\n')
            save.write('\n')  # blank line separates sentences
    finally:
        data.close()
        train.close()
        test.close()
if __name__ == '__main__':
    # Working directory that holds the People's Daily corpus file.
    os.chdir(r'F:\大三下\自然语言处理\测试')
    # Build train.data / test.data for CRF++ from the corpus.
    corpus('./people-daily.txt')
用 CRF++工具 训练和测试模型
在此目录下打开 cmd,执行 CRF++ 命令进行训练和测试,例如:crf_learn template train.data model(训练)与 crf_test -m model test.data > result(测试)
测试评估:
def Verification(result_path):
    """Score CRF++ person-name tagging output.

    Each non-blank line of *result_path* has four whitespace-separated
    columns: character, B/M/E/S segmentation label, gold name label and
    the model's predicted name label (B/I/O).  A word starts wherever the
    segmentation label is 'B' or 'S'; a word counts as a person name when
    its word-start name label is 'B'.

    Prints the totals, precision, recall and F-score, and returns the
    tuple ``(P, R, F)``.

    :param result_path: path to the crf_test output file.
    :return: (precision, recall, F1) as floats; 0.0 where undefined.
    """
    test_name_tag = 0     # gold person-name words
    predict_name_tag = 0  # predicted person-name words
    correct_name_tag = 0  # correctly predicted person-name words
    all_tag = 0           # total number of words
    with open(result_path, 'r', encoding='utf_8_sig') as test:
        for l in test:
            if l == '\n':  # blank line = sentence boundary
                continue
            _, seg, gold, pred = l.strip().split()
            if seg in ('B', 'S'):  # first character of a word
                all_tag += 1
                if pred == 'B':
                    predict_name_tag += 1
                    # Correct when the word-start prediction matches gold.
                    if pred == gold:
                        correct_name_tag += 1
                if gold == 'B':
                    test_name_tag += 1
    # Guard against empty files / models that predict no names at all.
    P = correct_name_tag / predict_name_tag if predict_name_tag else 0.0  # precision
    R = correct_name_tag / test_name_tag if test_name_tag else 0.0        # recall
    F = (2 * P * R) / (P + R) if (P + R) else 0.0
    print('全部词数有:' + str(all_tag) + '个')
    print('其中人名词有:' + str(test_name_tag) + '个')
    print('准确率为:{}, 召回率为:{}, F值为:{}'.format(P, R, F))
    return P, R, F
if __name__ == '__main__':
    # Evaluate the CRF++ prediction file produced by crf_test.
    Verification(r'F:\大三下\自然语言处理\测试\result')
实测:
import os
import jieba.posseg as psg
os.chdir(r'F:\大三下\自然语言处理\测试')

# Sentence to run person-name recognition on.
sent = "法外狂徒老张三"

# Step 1: segment + POS-tag the sentence and save it as "word/pos" tokens.
data = open("data.txt", 'w+', encoding='utf-8')
for word, t in psg.cut(sent):
    data.write(word + "/" + t + " ")
data.close()

# Step 2: convert the tokens into CRF++ input format — one character per
# line with its B/M/E/S segmentation label and a B/I/O person-name guess —
# mirroring the format of the training data built by corpus().
data2 = open('data.txt', encoding='utf-8')
save = open("sentence.data", 'w+', encoding='utf_8_sig')
line = data2.readline()
for word in line.split():
    is_name = word.endswith("/nr")  # jieba tagged it as a person name
    # Strip the "/pos" tag for ALL words so only surface characters are
    # written (previously non-name words kept '/', 'v', etc.).
    chars = word.rsplit('/', 1)[0]
    if not chars:
        continue
    # Use 'B' for single-char names too, matching the training label set.
    ne_b, ne_i = ('B', 'I') if is_name else ('O', 'O')
    if len(chars) == 1:
        save.write(chars + ' ' + 'S' + ' ' + ne_b + '\n')
    else:
        save.write(chars[0] + ' ' + 'B' + ' ' + ne_b + '\n')
        for ch in chars[1:-1]:
            save.write(ch + ' ' + 'M' + ' ' + ne_i + '\n')
        save.write(chars[-1] + ' ' + 'E' + ' ' + ne_i + '\n')
data2.close()  # was data.close() — data is already closed above
save.close()
# Run the trained CRF model over the sentence; predictions go to sent_result.
os.system("crf_test -m model sentence.data > sent_result")

# Show the characters the model labelled as part of a person name.
# crf_test appends the predicted label as the LAST column of each line, so
# read it via split()[-1] instead of a fixed character offset (index 4 was
# the input name-label guess column, not the model's prediction).
result = open("sent_result", 'r+', encoding='utf_8_sig')
for res_line in result:
    fields = res_line.split()
    if fields and fields[-1] in ('B', 'I'):
        print(fields[0])  # fields[0] is the character itself
result.close()