pyltp

```python
# -*- coding: utf-8 -*-
import os

from numpy import array, log, ones

from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer


LTP_DATA_DIR = r'E:\model\ltp_data'  # path to the LTP model directory
OUTSIDE_DIC_DIR = r'E:\model\lexicon.txt'  # path to the external segmentation lexicon
OUTSIDE_ENTITY_ND_DIR = r'E:\model\ltp_data\entity_nd.txt'  # external entity list for the 'Nd' placeholder
OUTSIDE_ENTITY_NJ_DIR = r'E:\model\ltp_data\entity_nj.txt'  # external entity list for the 'Nj' placeholder

# Load the models
segmentor = Segmentor()  # segmentation model
segmentor.load_with_lexicon(os.path.join(LTP_DATA_DIR, 'cws.model'), OUTSIDE_DIC_DIR)

postagger = Postagger()  # POS tagging model
postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))

recognizer = NamedEntityRecognizer()  # NER model (loaded here but not used below)
recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))


# Sentence splitting
def sentence_splitter(sentences):
    sentence_list = SentenceSplitter.split(sentences)
    return sentence_list


# Word segmentation
def segment(sentence):
    words = segmentor.segment(sentence)
    words_list = list(words)
    return words_list


# POS tagging
def pos_tag(words):
    pos_tags = postagger.postag(words)  # run the POS tagger on the word list
    return pos_tags


# Load the external entity lists (one entity per line)
def load_outside_entity():
    list_nd = []
    list_nj = []
    try:
        with open(OUTSIDE_ENTITY_ND_DIR, 'r', encoding='UTF-8') as nd_in_file:
            list_nd = [line.rstrip('\n') for line in nd_in_file]
        with open(OUTSIDE_ENTITY_NJ_DIR, 'r', encoding='UTF-8') as nj_in_file:
            list_nj = [line.rstrip('\n') for line in nj_in_file]
    except Exception as e:
        print(e)

    return list_nd, list_nj


# Question abstraction: replace known entities in the question with their
# type placeholders ('Nd' / 'Nj') so it can be matched against the templates
def question_extraction(sentence, list_nd, list_nj):
    try:
        words = segment(sentence)
        print('Segmentation result:', words)
        for word_index, word in enumerate(words):
            if word in list_nd:
                words[word_index] = 'Nd'
            elif word in list_nj:
                words[word_index] = 'Nj'
    except Exception as e:
        print(e)
        return

    return words


# Load the question templates: each line is "abstracted words;question class"
def load_data(in_file_name):
    posting_list = []  # word list of each template
    class_vec = []     # question class of each template
    dic = {}           # question class -> 'p<line index>'
    dic_index = {}     # line index -> question class
    try:
        with open(in_file_name, 'r', encoding='UTF-8') as in_file:
            for line_index, text_line in enumerate(in_file):
                posting_list.append(text_line.split(';')[0].split(' '))
                que = text_line.split(';')[1].replace('\n', '')
                class_vec.append(que)
                dic.setdefault(que, 'p' + str(line_index))
                dic_index.setdefault(line_index, que)
    except Exception as e:
        print(e)
        return

    return posting_list, class_vec, dic, dic_index
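
# Illustration only -- the real input.txt is not shown in this post. Given how
# load_data parses each line ("space-separated abstracted words;question class"),
# a hypothetical template file could look like:
#   Nd 上映 票房;电影票房
#   Nd 导演;电影导演
#   Nj Nd 饰演 角色;演员角色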


# Build a vocabulary: the list of unique words across all templates
def create_vocab_list(data_set):
    vocab_set = set()
    for document in data_set:
        vocab_set = vocab_set | set(document)
    return list(vocab_set)


# Convert a word list into a bag-of-words count vector over the vocabulary
def set_words2vec(vocab_list, input_set):
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] += 1
    return return_vec
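
# For instance (hypothetical vocabulary and question words):
#   set_words2vec(['Nd', '票房', '导演'], ['Nd', '票房', '怎么样'])  ->  [1, 1, 0]
# '怎么样' is ignored because it is not in the vocabulary.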


# Naive Bayes training: estimate per-class log word probabilities from the count vectors
def train(train_matrix, train_category, dic):
    try:
        num_train_docs = len(train_matrix)
        num_words = len(train_matrix[0])
        p_model = 1 / float(num_train_docs)  # uniform class prior; since the question classes are exclusive and equally likely, it is not actually needed

        dic_matrix = {}
        dic_num = {}
        dic_vec = {}
        for i in range(num_train_docs):
            dic_matrix['p' + str(i) + '_num'] = ones(num_words)  # start word counts at 1 (Laplace smoothing) so no probability is zero
            dic_num['p' + str(i) + '_denom'] = num_train_docs    # denominator (total count) for each class

        for i in range(num_train_docs):
            dic_matrix[dic.get(train_category[i]) + '_num'] += train_matrix[i]
            dic_num[dic.get(train_category[i]) + '_denom'] += sum(train_matrix[i])

        # take logs to avoid underflow when many small probabilities are multiplied
        for i in range(num_train_docs):
            dic_vec[i] = log(dic_matrix['p' + str(i) + '_num'] / dic_num['p' + str(i) + '_denom'])

    except Exception as e:
        print(e)
        return

    return dic_vec, p_model
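
# What train() returns, for a hypothetical template file with 3 lines:
#   dic_vec maps each class index i to a length-num_words array of log word
#   probabilities for template i, and p_model = 1/3 is the uniform class prior.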


# Naive Bayes classifier: pick the question class whose template best matches the vector
def classify_nb(vec2classify, dic_vec, dic_index, p_model):
    try:
        dic_var = {}
        for key in dic_vec:
            dic_var.setdefault(key, sum(vec2classify*dic_vec.get(key)))

        max_zip_dic = max(zip(dic_var.values(), dic_var.keys()))
    except Exception as e:
        print(e)
        return

    return dic_index.get(max_zip_dic[1])
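
# Each class score is the dot product of the question's count vector with that
# class's log-probability vector, i.e. (up to constants) the log-likelihood of
# the question under that template; the prior p_model is passed in but, being
# uniform, is not used in the comparison.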


"""
'在中国上映的但丁密码的票房怎么样?'
'电影但丁密码的导演是谁?'
'李根在但丁密码中饰演的角色是谁?'
"""
if __name__ == "__main__":
    try:
        list_nd, list_nj = load_outside_entity()
        sentence = '在中国上映的环太平洋:雷霆再起怎么样?'
        print('Original sentence:', sentence)
        word_list = question_extraction(sentence, list_nd, list_nj)
        print('Question abstraction result:', word_list)

        in_file_name = "input.txt"
        list_posts, list_classes, dic_vocab, dic_index = load_data(in_file_name)
        my_vocab_list = create_vocab_list(list_posts)

        train_mat = []
        for post_in_doc in list_posts:
            train_mat.append(set_words2vec(my_vocab_list, post_in_doc))
        dic_vec, p_model = train(array(train_mat), array(list_classes), dic_vocab)

        this_doc = array(set_words2vec(my_vocab_list, word_list))
        print('Question classification result:', classify_nb(this_doc, dic_vec, dic_index, p_model))
    except Exception as e:
        print(e)
```

### Installing and Using PyLTP on macOS

#### Preparation

To install PyLTP smoothly on a Mac, first make sure a Python development environment is ready. If one is not set up yet, consider creating a virtual environment with Anaconda to simplify dependency management[^4].

#### Getting the PyLTP source

Clone the official repository with Git to get the latest source code:

```bash
git clone https://github.com/HIT-SCIR/pyltp.git
```

Enter the cloned `pyltp` folder to prepare for the next steps:

```bash
cd pyltp
```

Because some versions ship with a missing submodule, the LTP library has to be pulled separately and placed in the right location:

```bash
rm -rf ltp  # remove a possibly existing, stale empty directory
mkdir ltp && cd ltp
git clone https://github.com/HIT-SCIR/ltp.git .
cd ..
```

#### Adjusting setup.py for your macOS version

For specific macOS releases (such as 10.14), the build settings must match the operating system. Edit the relevant setting in `setup.py` so it is compatible with the macOS release you are running:

```python
os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.14'
```

This step helps avoid compilation failures caused by system differences[^5].

#### Running the installation

With the preparation done, PyLTP can be installed. Because of possible permission issues, running the setup script as a superuser is recommended:

```bash
sudo python setup.py install
```

This step triggers a C++ build, during which clang++ errors may appear. If the Xcode Command Line Tools were just installed for the first time, or Xcode was updated and the license agreement was never re-accepted, errors of this kind can occur. In that case, confirm that the Xcode toolchain is installed correctly, accept the EULA, and retry the installation[^2].

#### Installing with pip (optional)

Besides building from source, PyLTP can also be deployed quickly with pip:

```bash
pip3 install pyltp
```

Note, however, that this route can also run into compiler incompatibilities; when that happens, fall back to the manual source-based installation to work around the problem.
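
Once installed by either route, a quick smoke test confirms that the package imports and a model loads. This is a minimal sketch that reuses the old-style `load()` API shown in the script above; the model path is a placeholder that must point at your own `ltp_data` directory:

```python
# Minimal post-install sanity check (the model path is a placeholder)
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load('/path/to/ltp_data/cws.model')  # load the segmentation model
print(list(segmentor.segment('在中国上映的但丁密码的票房怎么样?')))
segmentor.release()  # free the underlying model
```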