【头歌实践教学-NLP】第六章：句法分析_头歌基于pcfg的cyk算法实现-CSDN博客

本文链接：https://blog.csdn.net/qq_59392060/article/details/147615340

第六章：句法分析

基于PCFG的CYK算法实现
- 第1关：基于PCFG实现CYK算法
句法分析的常用方法与实战
- 第1关：Pyhanlp 的使用
- 第2关：句法分析的常用方法与实战
句法分析概述
- 第1关：句法分析概述

基于PCFG的CYK算法实现

第1关：基于PCFG实现CYK算法

import math
from grammar import Pcfg


class CkyParser(object):
    """
    A probabilistic CKY parser.

    The parser fills a chart bottom-up over all token spans and keeps, for
    every non-terminal in every span, the highest log-probability derivation
    together with a back-pointer describing how it was built.
    """

    def __init__(self, grammar):
        """
        Initialize a new parser instance from a grammar.

        grammar: an object exposing ``rhs_to_rules``, a mapping from a
            right-hand-side sequence to a list of rules. Each rule is indexed
            as ``rule[0]`` (left-hand-side non-terminal) and ``rule[2]``
            (rule probability); ``rule[1]`` is not read here (presumably the
            right-hand side -- confirm against the grammar module).
        """
        self.grammar = grammar

    def parse_with_backpointers(self, tokens):
        """
        Parse the input tokens and return a parse table and a probability table.

        tokens: sequence of words to parse.

        Returns a pair ``(pi, probs)``; both are dicts keyed by every span
        ``(i, j)`` with ``0 <= i < j <= len(tokens)``:

        * ``pi[(i, j)][A]`` is the word itself for a length-1 span, otherwise
          ``((B, i, k), (C, k, j))`` for the best split ``A -> B C``.
        * ``probs[(i, j)][A]`` is the natural-log probability of that best
          derivation.

        ``math.log`` raises ``ValueError`` if a rule that gets applied has a
        non-positive probability.
        """
        n = len(tokens)

        # Partition the grammar by right-hand-side length once. The original
        # dict order is kept so tie-breaking between equally likely
        # derivations is unchanged.
        lexical_rules = []
        binary_rules = []
        for rhs, rules in self.grammar.rhs_to_rules.items():
            if len(rhs) == 1:
                lexical_rules.append((rhs, rules))
            elif len(rhs) == 2:
                binary_rules.append((rhs, rules))

        pi = {}     # back-pointer table: how each non-terminal was derived
        probs = {}  # best log-probability of each non-terminal per span
        for i in range(n + 1):
            for j in range(i + 1, n + 1):
                pi[(i, j)] = {}
                probs[(i, j)] = {}

        # Length-1 spans: apply rules whose right-hand side is the word.
        for i, word in enumerate(tokens):
            cell_pi = pi[(i, i + 1)]
            cell_probs = probs[(i, i + 1)]
            for rhs, rules in lexical_rules:
                if word == rhs[0]:
                    for rule in rules:
                        lhs = rule[0]
                        # The back-pointer of a leaf is the word itself.
                        cell_pi[lhs] = word
                        cell_probs[lhs] = math.log(rule[2])

        # Longer spans: combine two adjacent sub-spans with binary rules.
        for length in range(2, n + 1):
            for i in range(n - length + 1):
                j = i + length
                cell_pi = pi[(i, j)]
                cell_probs = probs[(i, j)]
                for k in range(i + 1, j):  # split point, i < k < j
                    left_pi = pi[(i, k)]
                    right_pi = pi[(k, j)]
                    if not left_pi or not right_pi:
                        # No rule can match when either half is empty.
                        continue
                    left_probs = probs[(i, k)]
                    right_probs = probs[(k, j)]
                    for rhs, rules in binary_rules:
                        B, C = rhs
                        if B not in left_pi or C not in right_pi:
                            continue
                        for rule in rules:
                            A = rule[0]
                            # log P(A -> B C) + log P(B over i..k) + log P(C over k..j)
                            current_prob = (
                                math.log(rule[2]) + left_probs[B] + right_probs[C]
                            )
                            # Record the first derivation of A, or replace the
                            # stored one only when this one is strictly better.
                            if A not in cell_pi or current_prob > cell_probs[A]:
                                cell_pi[A] = ((B, i, k), (C, k, j))
                                cell_probs[A] = current_prob
        return pi, probs


def get_tree(chart, i, j, nt):
    """
    Return the parse-tree rooted in non-terminal nt and covering span i,j.

    chart: back-pointer table; ``chart[(i, j)][nt]`` is either a string (the
        word of a leaf) or a pair ``((B, i, k), (C, k, j))`` naming the two
        children and their spans.
    i, j: start and end positions of the span.
    nt: non-terminal at the root of the requested subtree.

    Returns ``(nt, word)`` for a leaf, otherwise
    ``(nt, left_subtree, right_subtree)``.

    Raises KeyError if the span or non-terminal is not in the chart.
    """
    entry = chart[(i, j)][nt]
    # isinstance (not an exact type comparison) so str subclasses are also
    # treated as leaves instead of being indexed like a back-pointer pair.
    if isinstance(entry, str):
        return (nt, entry)
    left_child, right_child = entry[0], entry[1]
    return (
        nt,
        get_tree(chart, left_child[1], left_child[2], left_child[0]),
        get_tree(chart, right_child[1], right_child[2], right_child[0]),
    )


# if __name__ == "__main__":
#     with open('/data/workspace/myshixun/src/atis3.pcfg', 'r') as grammar_file:
#         grammar = Pcfg(grammar_file)
#         parser = CkyParser(grammar)
#         toks = ['flights', 'from', 'miami', 'to', 'cleveland', '.']
#         table, probs = parser.parse_with_backpointers(toks)
#         tree = get_tree(table, 0, len(toks), grammar.startsymbol)  # dynamic programming
#         print(tree)
#         # ('TOP', ('NP', ('NP', 'flights'), ('NPBAR', ('PP', ('FROM', 'from'), ('NP', 'miami')), ('PP', ('TO', 'to'), ('NP', 'cleveland')))), ('PUN', '.'))

句法分析的常用方法与实战

第1关：Pyhanlp 的使用

from pyhanlp import HanLP
text = input()
# Ask HanLP for the top two keywords of the input text.
keywords = HanLP.extractKeyword(text, 2)
# Print them as "[kw1, kw2]". Each item is passed through str() so the join
# also works when the returned elements are not native Python str objects
# (e.g. Java string proxies).
# NOTE(review): confirm the element type for the installed pyhanlp/JPype.
print(f"[{', '.join(str(keyword) for keyword in keywords)}]")

第2关：句法分析的常用方法与实战

from pyhanlp import HanLP

text = input()
# ********** Begin *********#
# Run HanLP's dependency parser on the input text.
parsed = HanLP.parseDependency(text)
for word in parsed.iterator():
    # One line per word: dependent --(relation)--> head
    dependent = word.LEMMA
    relation = word.DEPREL
    head = word.HEAD.LEMMA
    print("%s --(%s)--> %s" % (dependent, relation, head))
# ********** End **********#