7.4 语言结构中的递归 & 7.5 命名实体识别 & 7.6 关系提取
7.4 语言结构中的递归
1、用级联分块器构建嵌套结构
import nltk
# Four-stage cascaded chunk grammar: each stage may use the chunk labels
# produced by the stages listed before it (PP uses NP, VP uses NP/PP/CLAUSE,
# CLAUSE uses NP and VP).
grammar = r'''
NP: {<DT|JJ|NN.*>+}
PP: {<IN><NP>}
VP: {<VB.*><NP|PP|CLAUSE>+$}
CLAUSE: {<NP><VP>}
'''
cp = nltk.RegexpParser(grammar)

# First example: a single embedded clause.
sentence = [
    ('Marry', 'NN'), ('saw', 'VBD'), ('the', 'DT'), ('cat', 'NN'),
    ('sit', 'VB'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN'),
]
print(cp.parse(sentence))

# Second example: the same sentence embedded one level deeper
# ("John thinks ...").
sentence = [('John', 'NNP'), ('thinks', 'VBZ')] + sentence
print(cp.parse(sentence))

# Re-parse the deeper sentence with loop=2, which (per the NLTK docs) applies
# the whole stage cascade twice, so chunks built late in the first pass can
# feed the earlier stages on the second pass.
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))
2、树状图
# Build a small sentence tree bottom-up: two noun phrases, a verb phrase that
# contains the second noun phrase, and the sentence node on top.
tree1 = nltk.Tree('NP', ['Alice'])
print(tree1)
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
print(tree2)
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4)

# Trees index like lists: tree4[1] is the second child (the VP subtree).
print(tree4[1])
print(tree4[1].label())
print(tree4.leaves())
# Nested indexing reaches a single leaf: VP -> NP -> second word.
# The original left this as a bare expression, which shows nothing when the
# file is run as a script, so it is printed like the other inspections.
print(tree4[1][1][1])

# draw() displays the tree graphically (per the NLTK docs it opens a window).
tree4.draw()
tree3.draw()
3、树遍历
def traverse(t):
    """Print *t* in bracketed form using a depth-first walk.

    Anything that offers a ``label()`` method is treated as an internal node:
    it is printed as ``( LABEL child child ... )`` with each child handled
    recursively. Anything without that method is treated as a leaf and
    printed as-is. Every printed item is followed by a single space and no
    newline is emitted.
    """
    try:
        node_label = t.label()
    except AttributeError:
        # No label() method: this is a leaf token.
        print(t, end=' ')
        return

    print('(', node_label, end=' ')
    for subtree in t:
        traverse(subtree)
    print(')', end=' ')
# Walk the sentence tree built earlier and print it in bracketed form.
traverse(tree4)
7.5 命名实体识别
·仅仅凭借 地名辞典 来识别众多实体是不可能的。
·我们需要能够识别 多标识符序列的开头和结尾。
# Take one POS-tagged sentence (index 22) from the Treebank sample corpus.
sent = nltk.corpus.treebank.tagged_sents()[22]

# binary=True: per the NLTK docs, entities are only marked as NE, untyped.
print(nltk.ne_chunk(sent, binary=True))

# binary=False: entities carry a category label instead of the generic NE
# tag (per the NLTK docs).
print(nltk.ne_chunk(sent, binary=False))
7.6 关系提取
import re
# Filter passed to extract_rels as ``pattern``: the text must contain the
# standalone word "in", and the negative lookahead rejects cases where that
# "in" is followed by something ending in "ing".
IN = re.compile(r'.*\bin\b(?!\b.+ing)')

# Print every ORG/LOC relation that extract_rels yields for each parsed
# document in one IEER file.
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    org_loc_rels = nltk.sem.extract_rels(
        'ORG', 'LOC', doc, corpus='ieer', pattern=IN
    )
    for rel in org_loc_rels:
        print(nltk.sem.rtuple(rel))
from nltk.corpus import conll2002
# Verbose-mode pattern over word/TAG text: one of four verb forms tagged V,
# then anything, then "van" tagged Prep. Whitespace inside the literal is
# ignored because of re.VERBOSE.
vnv = """
(
is/V|
was/V|
werd/V|
wordt/V
)
.*
van/Prep
"""
VAN = re.compile(vnv, re.VERBOSE)

# Scan the Dutch CoNLL-2002 training sentences for PER/ORG pairs matching the
# pattern and print each hit as a clause with relation symbol VAN.
for doc in conll2002.chunked_sents('ned.train'):
    per_org_rels = nltk.sem.extract_rels(
        'PER', 'ORG', doc, corpus='conll2002', pattern=VAN
    )
    for r in per_org_rels:
        print(nltk.sem.clause(r, relsym='VAN'))
from nltk.corpus import conll2002
# Same verb ... "van" pattern as a verbose-mode regex over word/TAG text;
# re.VERBOSE makes the line breaks inside the literal insignificant.
vnv = """
(
is/V|
was/V|
werd/V|
wordt/V
)
.*
van/Prep
"""
VAN = re.compile(vnv, re.VERBOSE)

# This time print each PER/ORG hit as a relation tuple with lcon/rcon enabled
# (left and right context, per the NLTK docs).
for doc in conll2002.chunked_sents('ned.train'):
    matches = nltk.sem.extract_rels(
        'PER', 'ORG', doc, corpus='conll2002', pattern=VAN
    )
    for r in matches:
        print(nltk.sem.rtuple(r, lcon=True, rcon=True))