自然语言处理中的文法分析

       声明:代码的运行环境为Python3。Python3与Python2在一些细节上会有所不同,希望广大读者注意。本博客以代码为主,代码中会有详细的注释。相关文章将会发布在我的个人博客专栏《Python自然语言处理》,欢迎大家关注。


       文法在自然语言处理中可以提高处理的准确度,属于很关键的一环。

'''
文法分析
'''

import nltk

# Grammar: a small context-free grammar whose terminals are pre-segmented
# Chinese words and phrases.
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V AP | V NP
V -> '是'|'走在'|'进入'
AP -> '很抽象的'
NP -> '北京'|'哈尔滨'|'形式语言'|'中国'|'教育'|'集合'|'WTO'|'美丽的城市'|'祖国的首都'|'数学的基础'|'社会发展的前面'
""")

# Parse an already-tokenized sentence with a chart parser, then print and
# draw every parse tree the parser yields.
sent = [u'集合', u'是', u'数学的基础']
parser = nltk.ChartParser(grammar)
trees = parser.parse(sent)
for tree in trees:
    print(tree)
    tree.draw()

# Ubiquitous ambiguity: the subject noun phrase below can be derived in two
# ways (NP -> Det NP with NP -> AP N, or NP -> Det AP N directly), so the
# sentence has more than one parse tree.
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det NP | Det AP N | AP N  | N
VP -> V AP | V NP
N -> '校长'|'座谈会'
Det -> '三个'
AP -> '学校的'
V -> '参加'
""")

# Draw each parse tree found for the ambiguous sentence.
sent = [u'三个', u'学校的', u'校长', u'参加', u'座谈会']
parser = nltk.ChartParser(groucho_grammar)
trees = parser.parse(sent)
for tree in trees:
    tree.draw()

# Context-free grammar for a small fragment of English.
grammar1 = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V NP PP
PP -> P NP
V -> "saw" | "ate" | "walked"
NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
Det -> "a" | "an" | "the" | "my"
N -> "man" | "dog" | "cat" | "telescope" | "park"
P -> "in" | "on" | "by" | "with"
""")
# Parse a short sentence with a recursive-descent parser and print each tree.
sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)

# A longer sentence: the PP "in the park" can attach through either
# `VP -> V NP PP` or `NP -> Det N PP`; every tree that is yielded gets drawn.
sent = "the dog saw a man in the park".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    tree.draw()

# Recursive context-free grammar: `Nom -> Adj Nom` refers to itself directly,
# and `VP -> V S` lets a sentence appear inside another sentence.
grammar2 = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det Nom | PropN
Nom -> Adj Nom | N
VP -> V Adj | V NP | V S | V NP PP
PP -> P NP
PropN -> 'Buster' | 'Chatterer' | 'Joe'
Det -> 'the' | 'a'
N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
Adj -> 'angry' | 'frightened' | 'little' | 'tall'
V -> 'chased' | 'saw' | 'said' | 'thought' | 'was' | 'put'
P -> 'on'
""")
# Draw each parse of a sentence that stacks adjectives via the recursive Nom rule.
sent = "the angry bear chased the frightened little squirrel".split()
rd_parser = nltk.RecursiveDescentParser(grammar2)
for tree in rd_parser.parse(sent):
    tree.draw()

# Recursive-descent parser: parse a short sentence with grammar2 and print
# each resulting tree.
rd_parser = nltk.RecursiveDescentParser(grammar2)
sent = 'Joe saw a bear'.split()
for t in rd_parser.parse(sent):
    print(t)
# NP -> NP PP
# NOTE(review): the rule above is presumably a reminder that a left-recursive
# production like this one is a problem for recursive-descent parsing; it is
# not part of grammar2 — confirm the intent.

# Shift-reduce parser: parse the same sentence with grammar2; trace=2 asks
# the parser to report its steps while it runs.
sr_parse = nltk.ShiftReduceParser(grammar2, trace=2)
sent = 'Joe saw a bear'.split()
for t in sr_parse.parse(sent):
    print(t)

# Chart parsing: rebind groucho_grammar to an English grammar that is used
# below to build a well-formed substring table (WFST).
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")

# Tokenized example sentence (not referenced again in the visible part of
# this file; the WFST demo builds its own `tokens` list).
text = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']


def init_wfst(tokens, grammar):
    """Build an empty well-formed substring table and fill its lexical cells.

    The table is a square list of lists with ``len(tokens) + 1`` rows and
    columns, initialised to ``None``. For every token at position ``k`` the
    cell ``[k][k + 1]`` receives the left-hand side of the first production
    returned by ``grammar.productions(rhs=token)``.

    Args:
        tokens: Sequence of terminal symbols to look up in the grammar.
        grammar: Object exposing ``productions(rhs=...)`` whose results
            provide ``lhs()``.

    Returns:
        The newly created table.

    Raises:
        IndexError: If the grammar returns no production for some token.
    """
    size = len(tokens) + 1
    wfst = [[None] * size for _ in range(size)]
    for pos, word in enumerate(tokens):
        lexical_rules = grammar.productions(rhs=word)
        # Only the first matching production is recorded for each word.
        wfst[pos][pos + 1] = lexical_rules[0].lhs()
    return wfst


def complete_wfst(wfst, tokens, grammar, trace=False):
    """Fill the wider cells of a WFST bottom-up from its narrower cells.

    A lookup from right-hand side to left-hand side is built from all of the
    grammar's productions (a later production with the same right-hand side
    replaces an earlier one). For each span width from 2 up to the sentence
    length, every split point is tried; when the two sub-spans hold symbols
    whose pair is a known right-hand side, the spanning cell is set to the
    corresponding left-hand side. A later matching split point overwrites an
    earlier one for the same cell.

    Args:
        wfst: Table to complete; it is modified in place.
        tokens: The token sequence; only its length is used.
        grammar: Object exposing ``productions()`` whose items provide
            ``rhs()`` and ``lhs()``.
        trace: When true, print one line per cell assignment.

    Returns:
        The same ``wfst`` object that was passed in.
    """
    rule_for = {rule.rhs(): rule.lhs() for rule in grammar.productions()}
    n = len(tokens)
    for width in range(2, n + 1):
        for left in range(n - width + 1):
            right = left + width
            for split in range(left + 1, right):
                first, second = wfst[left][split], wfst[split][right]
                pair = (first, second)
                # Both halves must be filled and combine under some rule.
                if not first or not second or pair not in rule_for:
                    continue
                parent = rule_for[pair]
                wfst[left][right] = parent
                if trace:
                    print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]" % (
                        left, first, split, second, right, left, parent, right))
    return wfst


def display(wfst, tokens):
    """Print a WFST as a table with one text line per start position.

    The header line lists the end positions ``1 .. len(wfst) - 1``; each
    following line shows a start position and then the content of every cell
    in that row, with ``.`` standing in for empty (falsy) cells.

    The row label and the cells are printed with ``end=" "`` so that a whole
    row stays on one line. Under Python 3 a trailing comma inside ``print()``
    does not suppress the newline the way the Python 2 print statement did.
    The row label is padded to four characters so that it lines up with the
    five-character ``WFST `` header prefix.

    Args:
        wfst: Square table (list of lists) as produced by ``init_wfst``.
        tokens: Unused; kept so existing callers keep working.
    """
    size = len(wfst)
    print('\nWFST ' + ' '.join("%-4d" % i for i in range(1, size)))
    for i in range(size - 1):
        print("%d   " % i, end=" ")
        for j in range(1, size):
            print("%-4s" % (wfst[i][j] or '.'), end=" ")
        print()


tokens = "I shot an elephant in my pajamas".split()
# Fill in only the lexical cells and show the initial table.
wfst0 = init_wfst(tokens, groucho_grammar)
display(wfst0, tokens)

# complete_wfst fills the table it is given in place and returns that same
# object, so wfst1 and wfst0 refer to one table from here on.
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar)
display(wfst1, tokens)

# Run the completion again over the already-filled table, this time printing
# a line for every cell assignment.
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar, trace=True)

# Dependency grammar: each rule lists a word on the left and, on the right,
# the words it may be linked to.
groucho_dep_grammar = nltk.DependencyGrammar.fromstring("""
'shot' -> 'I' | 'elephant' | 'in'
'elephant' -> 'an' | 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
""")
print(groucho_dep_grammar)

# Parse the sentence with a projective dependency parser and print every
# tree it yields.
pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)
sent = 'I shot an elephant in my pajamas'.split()
trees = pdp.parse(sent)
for tree in trees:
    print(tree)

# Grammar development: look at real parsed data from the treebank corpus.
from nltk.corpus import treebank

# Print the first parsed sentence of the file wsj_0001.mrg.
t = treebank.parsed_sents('wsj_0001.mrg')[0]
print(t)


def filter(tree):
    """Tell whether ``tree`` is a VP node with an S node among its children.

    Only direct children that are ``nltk.Tree`` instances are inspected.

    Note: this function shares its name with the builtin ``filter`` and
    therefore hides it for the rest of the module.

    Args:
        tree: A tree node providing ``label()`` and iteration over children.

    Returns:
        ``True`` when the node is labelled ``'VP'`` and at least one child
        subtree is labelled ``'S'``; otherwise ``False``.
    """
    if tree.label() != 'VP':
        return False
    for child in tree:
        if isinstance(child, nltk.Tree) and child.label() == 'S':
            return True
    return False


from nltk.corpus import treebank

# Collect and print every subtree in the treebank sample that satisfies the
# module's own `filter` function (which hides the builtin of the same name).
print([subtree for tree in treebank.parsed_sents()
 for subtree in tree.subtrees(filter)])

from collections import defaultdict

# Group the prepositional-phrase attachment training entries by their
# "noun1-prep-noun2" key; for each key, record which verbs occurred with
# each attachment label.
entries = nltk.corpus.ppattach.attachments('training')
table = defaultdict(lambda: defaultdict(set))
for entry in entries:
    key = entry.noun1 + '-' + entry.prep + '-' + entry.noun2
    table[key][entry.attachment].add(entry.verb)

# Report only the keys that were seen with more than one attachment label,
# listing the verbs recorded under 'N' and under 'V'.
for key in sorted(table):
    if len(table[key]) > 1:
        print(key, 'N:', sorted(table[key]['N']), 'V:', sorted(table[key]['V']))

# Draw the parsed sentence at index 3450 of the Sinica treebank.
nltk.corpus.sinica_treebank.parsed_sents()[3450].draw()

# A grammar in which the single word 'fish' is both a noun phrase and a
# verb, so a sentence made only of repeated 'fish' can be bracketed in
# several ways.
grammar = nltk.CFG.fromstring("""
S -> NP V NP
NP -> NP Sbar
Sbar -> NP V
NP -> 'fish'
V -> 'fish'
""")

# Print every parse for sentences of 5, 7 and 9 repetitions of 'fish',
# building a fresh chart parser for each length.
for repetitions in (5, 7, 9):
    tokens = ["fish"] * repetitions
    cp = nltk.ChartParser(grammar)
    for tree in cp.parse(tokens):
        print(tree)


# Weighted grammar
def give(t):
    """Tell whether ``t`` is a VP headed by give/gave with two complements.

    The node qualifies when it is labelled ``'VP'``, has more than two
    children, its second child is labelled ``'NP'``, its third child is
    labelled ``'PP-DTV'`` or ``'NP'``, and the leaves of its first child
    include ``'give'`` or ``'gave'``.

    Args:
        t: A tree node providing ``label()``, ``len()`` and indexing, whose
            children provide ``label()`` and ``leaves()``.

    Returns:
        ``True`` if all of the conditions hold, otherwise ``False``.
    """
    if t.label() != 'VP' or len(t) <= 2:
        return False
    if t[1].label() != 'NP':
        return False
    if t[2].label() not in ('PP-DTV', 'NP'):
        return False
    verb_words = t[0].leaves()
    return 'give' in verb_words or 'gave' in verb_words


def sent(t):
    """Join the leaves of ``t`` into one space-separated string.

    Leaves whose first character is ``*``, ``-`` or ``0`` are left out
    (presumably these are the treebank's trace / empty-element markers —
    confirm against the corpus).

    Args:
        t: An object whose ``leaves()`` returns an iterable of non-empty
            strings.

    Returns:
        The remaining leaves joined with single spaces.
    """
    skipped_initials = '*-0'
    kept = []
    for token in t.leaves():
        if token[0] in skipped_initials:
            continue
        kept.append(token)
    return ' '.join(kept)


def print_node(t, width):
    """Print a one-line summary of the first three children of ``t``.

    The line has the form ``<words of child 0> <label 1>: <words of child 1>
    / <label 2>: <words of child 2>``, with each child's words rendered by
    ``sent``. If the line is longer than ``width`` characters it is cut to
    ``width`` characters and ``...`` is appended, so the printed text can be
    up to three characters longer than ``width``.

    Args:
        t: A tree node with at least three children; children 1 and 2 must
            provide ``label()``.
        width: Maximum number of characters kept before the ellipsis.
    """
    head, first_arg, second_arg = t[0], t[1], t[2]
    fields = (sent(head), first_arg.label(), sent(first_arg),
              second_arg.label(), sent(second_arg))
    output = "%s %s: %s / %s: %s" % fields
    print(output if len(output) <= width else output[:width] + "...")


# Scan every parsed sentence in the treebank sample and print a summary
# line (at most 72 characters before the ellipsis) for each subtree that
# the `give` predicate accepts.
for tree in nltk.corpus.treebank.parsed_sents():
    for t in tree.subtrees(give):
        print_node(t, 72)


# Probabilistic context-free grammar (PCFG)
# Each production carries a probability in square brackets; the
# probabilities of all productions sharing a left-hand side add up to 1.0
# (e.g. VP: 0.4 + 0.3 + 0.3, NP: 0.8 + 0.2).
grammar = nltk.PCFG.fromstring("""
    S    -> NP VP              [1.0]
    VP   -> TV NP              [0.4] 
    VP   -> IV                 [0.3]
    VP   -> DatV NP NP         [0.3]
    TV   -> 'saw'              [1.0]
    IV   -> 'ate'              [1.0]
    DatV -> 'gave'             [1.0]
    NP   -> 'telescopes'       [0.8]
    NP   -> 'Jack'             [0.2]
    """)
print(grammar)

# Parse a tokenized sentence with the Viterbi parser and print each tree it
# yields.
viterbi_parser = nltk.ViterbiParser(grammar)
for tree in viterbi_parser.parse(['Jack', 'saw', 'telescopes']):
    print(tree)

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

象在舞

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值