Building the training and test sets
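The script below converts the annotated People's Daily corpus into CRF++ training data: one character per line with three columns, the character itself, a word-segmentation tag (B/M/E/S), and a place-name tag (B/I/E/S for characters inside a /ns word, O otherwise). Every fifth sentence (20%) goes to the test set and the rest to the training set, with a blank line separating sentences.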
import os

def corpus(corpus_path):
    data = open(corpus_path, encoding='utf-8')           # People's Daily corpus
    train = open("train.data", 'w+', encoding='utf-8')   # training set
    test = open("test.data", 'w+', encoding='utf-8')     # test set
    # Split the corpus into training and test sets
    pos = 0
    while True:
        line = data.readline()
        if not line:
            break
        is_test = pos % 5 == 0              # sample 20% of sentences as the test set
        save = test if is_test else train   # write to the test or training set accordingly
        words = line.split()[1:]            # drop the leading date/ID token
        for word in words:
            text = word.rsplit('/', 1)[0]   # strip the POS tag: "北京市/ns" -> "北京市"
            if word.endswith("/ns"):        # place name
                if len(text) == 1:          # single-character place name
                    save.write(text + ' S S\n')
                else:                       # two or more characters, e.g. 北京市
                    save.write(text[0] + ' B B\n')    # 北 B B
                    for ch in text[1:-1]:
                        save.write(ch + ' M I\n')     # 京 M I
                    save.write(text[-1] + ' E E\n')   # 市 E E
            else:                           # not a place name
                if len(text) == 1:
                    save.write(text + ' S O\n')
                else:
                    save.write(text[0] + ' B O\n')
                    for ch in text[1:-1]:
                        save.write(ch + ' M O\n')
                    save.write(text[-1] + ' E O\n')
        save.write('\n')                    # blank line between sentences
        pos += 1
    data.close()
    train.close()
    test.close()
if __name__ == '__main__':
    os.chdir(r'F:\大三下\自然语言处理\chapter-5\测试2')
    corpus('./people-daily.txt')   # build the training and test files
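The walkthrough goes straight from data preparation to evaluating a result file, so the CRF++ training and tagging step in between is implied rather than shown. A minimal sketch of that step, assuming the crf_learn and crf_test binaries are on the PATH and using a hypothetical feature template (the author's actual template is not shown):

import os

# Hypothetical minimal CRF++ feature template, saved as "template":
#   U00:%x[-2,0]
#   U01:%x[-1,0]
#   U02:%x[0,0]
#   U03:%x[1,0]
#   U04:%x[2,0]
#   B
os.system("crf_learn template train.data model")    # train a model on the training set
os.system("crf_test -m model test.data > result")   # tag the test set; Verification() reads this file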
Evaluation
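crf_test appends its prediction as a new last column, so each non-blank line of the result file has four fields: character, segmentation tag, gold place-name tag, and predicted place-name tag (a hypothetical line: 京 M I I). The function below counts words at their B/S start tags and uses an approximate boundary-based rule: a predicted entity is counted as correct if no tag mismatch has occurred since the previous predicted entity started.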
def Verification(result_path):
    test = open(result_path, 'r', encoding='utf-8')
    test_name_tag = 0      # place-name entities in the gold annotation
    predict_name_tag = 0   # place-name entities predicted by the model
    correct_name_tag = 0   # correctly predicted place-name entities
    all_tag = 0            # total number of words
    pos = True             # no mismatch seen since the last predicted entity start
    for l in test:
        if l == '\n':
            continue
        _, a, g, r = l.strip().split()
        if a in ('B', 'S'):        # segmentation tag B/S marks the start of a word
            all_tag += 1
        if r != g:
            pos = False
        if r in ('B', 'S'):        # start of a predicted place name
            predict_name_tag += 1
            if pos:
                correct_name_tag += 1
            pos = True
        if g in ('B', 'S'):        # start of a gold place name
            test_name_tag += 1
    # Precision
    P = correct_name_tag / float(predict_name_tag)
    # Recall
    R = correct_name_tag / float(test_name_tag)
    print('Total words: ' + str(all_tag))
    print('Place-name words: ' + str(test_name_tag))
    print('Precision: {}, Recall: {}, F1: {}'.format(P, R, (2 * P * R) / (P + R)))
    test.close()
if __name__ == '__main__':
    Verification(r'F:\大三下\自然语言处理\chapter-5\测试2\result')
Live test
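Finally, run the trained model on an unseen sentence: segment and POS-tag it with jieba, convert it to the same three-column format as the training data (the third column, filled in from jieba's /ns tags, only occupies the answer slot that crf_test expects), and tag it with the trained model.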
import os
import jieba.posseg as psg

os.chdir(r'F:\大三下\自然语言处理\chapter-5\测试2')
# Sentence to run place-name recognition on
sent = "钓鱼岛是中国的"
data = open("data.txt", 'w+', encoding='utf-8')
save = open("sentence.data", 'w+', encoding='utf-8')
for word, t in psg.cut(sent):   # segment and POS-tag with jieba
    data.write(word + "/" + t + " ")
data.close()

data2 = open(r'F:\大三下\自然语言处理\chapter-5\测试2\data.txt', encoding='utf-8')
line = data2.readline()
words = line.split()
for word in words:
    text = word.rsplit('/', 1)[0]   # strip the POS tag
    if word.endswith("/ns"):        # place name
        if len(text) == 1:          # single character
            save.write(text + ' S S\n')
        else:                       # two or more characters, e.g. 北京市
            save.write(text[0] + ' B B\n')    # 北 B B
            for ch in text[1:-1]:
                save.write(ch + ' M I\n')     # 京 M I
            save.write(text[-1] + ' E E\n')   # 市 E E
    else:
        if len(text) == 1:
            save.write(text + ' S O\n')
        else:
            save.write(text[0] + ' B O\n')
            for ch in text[1:-1]:
                save.write(ch + ' M O\n')
            save.write(text[-1] + ' E O\n')
data2.close()
save.close()   # write out the tagging input
os.system("crf_test -m model sentence.data > sent_result") #进行地名识别并输出结果到文件中
# Inspect the recognized place names
result = open("sent_result", 'r', encoding='utf-8')
for line in result:
    cols = line.split()
    if not cols:
        continue
    tag = cols[-1]                   # the predicted tag is the last column of the crf_test output
    if tag in ('S', 'B', 'I', 'E'):  # character belongs to a place name
        print(cols[0], end='')
    if tag in ('S', 'E'):            # end of a place name
        print()
result.close()
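If the model tags the sentence correctly, this prints each recognized place name on its own line, here 钓鱼岛 and 中国; the actual output depends on the trained model.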