# NLTK中的nltk.metrics包用于提供各种评估或相似性度量
from __future__ import print_function
from nltk.metrics import *
def main1():
    """Print accuracy, precision, recall and F-measure for two toy label lists.

    The first list is passed to every metric as the first argument and the
    second list as the second argument. Accuracy is computed on the raw
    sequences; the remaining metrics are computed on the sets built from
    those sequences, so repeated labels collapse to one entry.
    """
    reference_labels = 'PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split()
    predicted_labels = 'PERSON OTHER OTHER OTHER OTHER OTHER'.split()
    for labels in (reference_labels, predicted_labels):
        print(labels)

    print("ACCURACY:")
    print(accuracy(reference_labels, predicted_labels))
    print("===" * 25)

    # Set-based metrics: deduplicate the labels before scoring.
    reference_set, predicted_set = set(reference_labels), set(predicted_labels)
    for label_set in (reference_set, predicted_set):
        print(label_set)

    set_metrics = (
        ("PRECISION:", precision),
        ("RECALL:", recall),
        ("F_MEASURE:", f_measure),
    )
    for caption, metric in set_metrics:
        print(caption)
        print(metric(reference_set, predicted_set))
main1()
# 执行结果
# ['PERSON', 'OTHER', 'PERSON', 'OTHER', 'OTHER', 'ORGANIZATION']
# ['PERSON', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER']
# ACCURACY:
# 0.6666666666666666
# ===========================================================================
# {'ORGANIZATION', 'PERSON', 'OTHER'}
# {'PERSON', 'OTHER'}
# PRECISION:
# 1.0
# RECALL:
# 0.6666666666666666
# F_MEASURE:
# 0.8
# 1.5.1 使用编辑距离算法执行相似性度量
import nltk
from nltk.metrics import *
def main2():
    """Print the edit distance for two hard-coded word pairs, one per line."""
    word_pairs = (
        ("relate", "relation"),         # 3 edit steps
        ("suggestion", "calculation"),  # 7 edit steps
    )
    for source_word, target_word in word_pairs:
        print(edit_distance(source_word, target_word))
main2()
# 执行结果
# 3
# 7
# 1.5.2 使用Jaccard系数执行相似性度量
# Jaccard系数衡量两个集合 X 和 Y 的相似程度:|X∩Y| / |X∪Y|
# jaccard_distance 返回的是 1 减去该系数,因此完全相同时为0.0,完全不同时为1.0
import nltk
from nltk.metrics import *
def main3():
    """Print the Jaccard distance for three pairs of integer sets.

    The pairs are, in order: identical sets, sets with no common element,
    and sets sharing one of their elements. Each case prints its caption
    followed by the computed distance.
    """
    cases = (
        ("完全相同:", {20, 30}, {20, 30}),
        ("完全不同", {20, 30}, {10, 40}),
        ("部分相同", {20, 30}, {20, 40}),
    )
    for caption, left, right in cases:
        print(caption)
        print(jaccard_distance(left, right))
main3()
# 执行结果
# 完全相同:
# 0.0
# 完全不同
# 1.0
# 部分相同
# 0.6666666666666666
# 1.5.3 使用Smith Waterman 距离算法执行相似度计算
# 类似于编辑距离算法
# 在nltk.metrics包中没有找到相关模块
# 1.5.4 其他字符串相似性度量
# 二进制距离
# 两个标签相同,返回值为0.0, 否则,它的返回值为1.0
import nltk
from nltk.metrics import *
def main4():
    """Print the binary distance for an equal and an unequal pair of sets.

    Each case prints its caption followed by the value returned by
    ``binary_distance`` for that pair.
    """
    cases = (
        ("相同", {30, 50}, {30, 50}),
        ("不同", {10, 20, 30, 40}, {30, 50, 70}),
    )
    for caption, first, second in cases:
        print(caption)
        print(binary_distance(first, second))
main4()
# 执行结果
# 相同
# 0.0
# 不同
# 1.0
# 存在多个标签时,Masi距离基于部分一致(partial agreement)来度量
# 有详细算法
import nltk
from nltk.metrics import *
def main5():
    """Print the MASI distance between a two-element set and a superset of it."""
    smaller = {30, 40}
    larger = {30, 40, 50, 60}  # contains every element of `smaller`
    distance = masi_distance(smaller, larger)
    print(distance)
main5()
# 执行结果
# 0.335
# 注:这是旧版NLTK的输出;较新版本修正了masi_distance的公式,同样的输入会输出0.665