pyltp

```python
# -*- coding: utf-8 -*-
import os

from numpy import array, log, ones

from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer


LTP_DATA_DIR = r'E:\model\ltp_data'  # path to the LTP model directory
OUTSIDE_DIC_DIR = r'E:\model\lexicon.txt'  # path to the external segmentation lexicon
OUTSIDE_ENTITY_ND_DIR = r'E:\model\ltp_data\entity_nd.txt'  # external entity list for the 'Nd' placeholder
OUTSIDE_ENTITY_NJ_DIR = r'E:\model\ltp_data\entity_nj.txt'  # external entity list for the 'Nj' placeholder

# Load the models
segmentor = Segmentor()  # segmentation model
segmentor.load_with_lexicon(os.path.join(LTP_DATA_DIR, 'cws.model'), OUTSIDE_DIC_DIR)

postagger = Postagger()  # POS tagging model
postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))

recognizer = NamedEntityRecognizer()  # NER model (loaded here but not used below)
recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))


# Sentence splitting
def sentence_splitter(sentences):
    sentence_list = SentenceSplitter.split(sentences)
    return sentence_list


# Word segmentation
def segment(sentence):
    words = segmentor.segment(sentence)
    words_list = list(words)
    return words_list


# POS tagging
def pos_tag(words):
    pos_tags = postagger.postag(words)  # run the POS tagger on the word list
    return pos_tags


# Load the external entity lists (one entity per line)
def load_outside_entity():
    list_nd = []
    list_nj = []
    try:
        with open(OUTSIDE_ENTITY_ND_DIR, 'r', encoding='UTF-8') as nd_in_file:
            list_nd = [line.rstrip('\n') for line in nd_in_file]
        with open(OUTSIDE_ENTITY_NJ_DIR, 'r', encoding='UTF-8') as nj_in_file:
            list_nj = [line.rstrip('\n') for line in nj_in_file]
    except Exception as e:
        print(e)

    return list_nd, list_nj


# Question abstraction: replace known entities in the question with their
# type placeholders ('Nd' / 'Nj') so it can be matched against the templates
def question_extraction(sentence, list_nd, list_nj):
    try:
        words = segment(sentence)
        print('Segmentation result:', words)
        for word_index, word in enumerate(words):
            if word in list_nd:
                words[word_index] = 'Nd'
            elif word in list_nj:
                words[word_index] = 'Nj'
    except Exception as e:
        print(e)
        return

    return words


# Load the question templates: each line is "abstracted words;question class"
def load_data(in_file_name):
    posting_list = []  # word list of each template
    class_vec = []     # question class of each template
    dic = {}           # question class -> 'p<line index>'
    dic_index = {}     # line index -> question class
    try:
        with open(in_file_name, 'r', encoding='UTF-8') as in_file:
            for line_index, text_line in enumerate(in_file):
                posting_list.append(text_line.split(';')[0].split(' '))
                que = text_line.split(';')[1].replace('\n', '')
                class_vec.append(que)
                dic.setdefault(que, 'p' + str(line_index))
                dic_index.setdefault(line_index, que)
    except Exception as e:
        print(e)
        return

    return posting_list, class_vec, dic, dic_index
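
# Illustration only -- the real input.txt is not shown in this post. Given how
# load_data parses each line ("space-separated abstracted words;question class"),
# a hypothetical template file could look like:
#   Nd 上映 票房;电影票房
#   Nd 导演;电影导演
#   Nj Nd 饰演 角色;演员角色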


# Build a vocabulary: the list of unique words across all templates
def create_vocab_list(data_set):
    vocab_set = set()
    for document in data_set:
        vocab_set = vocab_set | set(document)
    return list(vocab_set)


# Convert a word list into a bag-of-words count vector over the vocabulary
def set_words2vec(vocab_list, input_set):
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] += 1
    return return_vec
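
# For instance (hypothetical vocabulary and question words):
#   set_words2vec(['Nd', '票房', '导演'], ['Nd', '票房', '怎么样'])  ->  [1, 1, 0]
# '怎么样' is ignored because it is not in the vocabulary.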


# Naive Bayes training: estimate per-class log word probabilities from the count vectors
def train(train_matrix, train_category, dic):
    try:
        num_train_docs = len(train_matrix)
        num_words = len(train_matrix[0])
        p_model = 1 / float(num_train_docs)  # uniform class prior; since the question classes are exclusive and equally likely, it is not actually needed

        dic_matrix = {}
        dic_num = {}
        dic_vec = {}
        for i in range(num_train_docs):
            dic_matrix['p' + str(i) + '_num'] = ones(num_words)  # start word counts at 1 (Laplace smoothing) so no probability is zero
            dic_num['p' + str(i) + '_denom'] = num_train_docs    # denominator (total count) for each class

        for i in range(num_train_docs):
            dic_matrix[dic.get(train_category[i]) + '_num'] += train_matrix[i]
            dic_num[dic.get(train_category[i]) + '_denom'] += sum(train_matrix[i])

        # take logs to avoid underflow when many small probabilities are multiplied
        for i in range(num_train_docs):
            dic_vec[i] = log(dic_matrix['p' + str(i) + '_num'] / dic_num['p' + str(i) + '_denom'])

    except Exception as e:
        print(e)
        return

    return dic_vec, p_model
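
# What train() returns, for a hypothetical template file with 3 lines:
#   dic_vec maps each class index i to a length-num_words array of log word
#   probabilities for template i, and p_model = 1/3 is the uniform class prior.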


# Naive Bayes classifier: pick the question class whose template best matches the vector
def classify_nb(vec2classify, dic_vec, dic_index, p_model):
    try:
        dic_var = {}
        for key in dic_vec:
            dic_var.setdefault(key, sum(vec2classify*dic_vec.get(key)))

        max_zip_dic = max(zip(dic_var.values(), dic_var.keys()))
    except Exception as e:
        print(e)
        return

    return dic_index.get(max_zip_dic[1])
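
# Each class score is the dot product of the question's count vector with that
# class's log-probability vector, i.e. (up to constants) the log-likelihood of
# the question under that template; the prior p_model is passed in but, being
# uniform, is not used in the comparison.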


"""
'在中国上映的但丁密码的票房怎么样?'
'电影但丁密码的导演是谁?'
'李根在但丁密码中饰演的角色是谁?'
"""
if __name__ == "__main__":
    try:
        list_nd, list_nj = load_outside_entity()
        sentence = '在中国上映的环太平洋:雷霆再起怎么样?'
        print('Original sentence:', sentence)
        word_list = question_extraction(sentence, list_nd, list_nj)
        print('Question abstraction result:', word_list)

        in_file_name = "input.txt"
        list_posts, list_classes, dic_vocab, dic_index = load_data(in_file_name)
        my_vocab_list = create_vocab_list(list_posts)

        train_mat = []
        for post_in_doc in list_posts:
            train_mat.append(set_words2vec(my_vocab_list, post_in_doc))
        dic_vec, p_model = train(array(train_mat), array(list_classes), dic_vocab)

        this_doc = array(set_words2vec(my_vocab_list, word_list))
        print('Question classification result:', classify_nb(this_doc, dic_vec, dic_index, p_model))
    except Exception as e:
        print(e)
```

### Installing and Using PyLTP on macOS

#### Preparation

To install PyLTP smoothly on a Mac, first make sure a Python development environment is ready. If one is not set up yet, consider creating a virtual environment with Anaconda to simplify dependency management[^4].

#### Getting the PyLTP source

Clone the official repository with Git to get the latest source code:

```bash
git clone https://github.com/HIT-SCIR/pyltp.git
```

Enter the cloned `pyltp` folder to prepare for the next steps:

```bash
cd pyltp
```

Because some versions ship with a missing submodule, the LTP library has to be pulled separately and placed in the right location:

```bash
rm -rf ltp  # remove a possibly existing, stale empty directory
mkdir ltp && cd ltp
git clone https://github.com/HIT-SCIR/ltp.git .
cd ..
```

#### Adjusting setup.py for your macOS version

For specific macOS releases (such as 10.14), the build settings must match the operating system. Edit the relevant setting in `setup.py` so it is compatible with the macOS release you are running:

```python
os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.14'
```

This step helps avoid compilation failures caused by system differences[^5].

#### Running the installation

With the preparation done, PyLTP can be installed. Because of possible permission issues, running the setup script as a superuser is recommended:

```bash
sudo python setup.py install
```

This step triggers a C++ build, during which clang++ errors may appear. If the Xcode Command Line Tools were just installed for the first time, or Xcode was updated and the license agreement was never re-accepted, errors of this kind can occur. In that case, confirm that the Xcode toolchain is installed correctly, accept the EULA, and retry the installation[^2].

#### Installing with pip (optional)

Besides building from source, PyLTP can also be deployed quickly with pip:

```bash
pip3 install pyltp
```

Note, however, that this route can also run into compiler incompatibilities; when that happens, fall back to the manual source-based installation to work around the problem.
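
Once installed by either route, a quick smoke test confirms that the package imports and a model loads. This is a minimal sketch that reuses the old-style `load()` API shown in the script above; the model path is a placeholder that must point at your own `ltp_data` directory:

```python
# Minimal post-install sanity check (the model path is a placeholder)
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load('/path/to/ltp_data/cws.model')  # load the segmentation model
print(list(segmentor.segment('在中国上映的但丁密码的票房怎么样?')))
segmentor.release()  # free the underlying model
```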