Noun Phrase Chunk Mining

This post is about noun phrase chunk mining, a text-mining technique for pulling key information out of free text. The approach here: segment and POS-tag each sentence with HanLP, then use NLTK's RegexpParser to merge the tagged tokens into noun-phrase (NP), verb-phrase (VP), and preposition (PP) chunks, writing everything else out as O.
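Before the full code, a toy illustration of the core idea: NLTK's RegexpParser takes a POS-tagged sentence plus a regular-expression grammar over the tags, and groups matching spans into labelled chunks. The simplified grammar and hand-tagged sentence below are for illustration only; the real grammar used in the script is far more detailed.

import nltk

toy_grammar = r"NP: {<n|an|nz>+}"                       # one or more noun-like tags form an NP chunk
toy_sentence = [('项目', 'n'), ('经验', 'n'), ('很', 'd'), ('丰富', 'a')]

tree = nltk.RegexpParser(toy_grammar).parse(toy_sentence)
for node in tree:
    if isinstance(node, nltk.Tree) and node.label() == 'NP':
        print(''.join(word for word, pos in node.leaves()) + '/NP')   # -> 项目经验/NP

The main script comes first, followed by tokenizer.py.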
#encoding=utf8
import nltk                              # RegexpParser for chunking
from tokenizer import cut_hanlp          # HanLP segmentation + POS tagging (tokenizer.py below)
huanhang=set(['。','?','!','?'])
keep_pos="q,qg,qt,qv,s,t,tg,g,gb,gbc,gc,gg,gm,gp,mg,Mg,n,an,ude1,nr,ns,nt,nz,nb,nba,nbc,nbp,nf,ng,nh,nhd,o,nz,nx,ntu,nts,nto,nth,ntch,ntcf,ntcb,ntc,nt,nsf,ns,nrj,nrf,nr2,nr1,nr,nnt,nnd,nn,nmc,nm,nl,nit,nis,nic,ni,nhm,nhd"
keep_pos_nouns=set(keep_pos.split(","))            # noun-like HanLP POS tags
keep_pos_v="v,vd,vg,vf,vl,vshi,vyou,vx,vi,vn"
keep_pos_v=set(keep_pos_v.split(","))              # verb-like HanLP POS tags
keep_pos_p=set(['p','pbei','pba'])                 # prepositions (incl. 被/把), emitted as /PP
merge_pos=keep_pos_p|keep_pos_v                    # tags already covered by the VP/PP handling
keep_flag=set([':',',','?','。','!',';','、','-','.','!',',',':',';','?','(',')','(',')','<','>','《','》'])
drop_pos_set=set(['xu','xx','y','yg','wh','wky','wkz','wp','ws','wyy','wyz','wb','u','ud','ude1','ude2','ude3','udeng','udh'])   # noise POS tags to drop when filtering

def getNodes(parent,model_tagged_file):               # walk the chunk tree with a plain for loop
    text=''
    for node in parent:
        if isinstance(node,nltk.Tree):                # an NP or VP subtree: merge its leaves into one chunk
            if node.label() == 'NP':
                text+=''.join(node_child[0].strip() for node_child in node.leaves())+"/NP"+3*" "
            if node.label() == 'VP':
                text+=''.join(node_child[0].strip() for node_child in node.leaves())+"/VP"+3*" "
        else:                                         # a plain (word, pos) leaf: tag it /PP or /O directly
            if node[1] in keep_pos_p:
                text+=node[0].strip()+"/PP"+3*" "
            elif node[0] in huanhang:
                text+=node[0].strip()+"/O"+3*" "
            elif node[1] not in merge_pos:            # elif keeps each token from being written twice
                text+=node[0].strip()+"/O"+3*" "
    model_tagged_file.write(text+"\n")

def grammer(sentence,model_tagged_file):              # e.g. a tagged fragment: {内/f 训/v 师/ng 单/b 柜/ng}
    """
    input sentence looks like: [('工作', 'vn'), ('描述', 'v'), (':', 'w'), ('我', 'rr'), ('曾', 'd'), ('在', 'p')]
    """
                                # NP rules: "< >" is one token, "*" = zero or more, "+" = one or more,
                                # "<ude1>?" lets the particle 的 appear zero or one time
    grammar1 = r"""NP:
        {<m|mg|Mg|mq|q|qg|qt|qv|s|>*<a|an|ag>*<s|g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|o|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<f>?<ude1>?<g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|o|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
        {<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<cc>+<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
        {<m|mg|Mg|mq|q|qg|qt|qv|s|>*<q|qg|qt|qv>*<f|b>*<vi|v|vn|vg|vd>+<ude1>+<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
        {<g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<vi>?}
        VP:{<v|vd|vg|vf|vl|vshi|vyou|vx|vi|vn>+}   
        """                                     # 动词短语块
    cp = nltk.RegexpParser(grammar1)
    try:
        result = cp.parse(sentence)             # NLTK shallow (chunk) parsing: the output tree's subtrees are the NP/VP chunks defined above
    except:
        pass                                    # skip sentences the parser cannot handle
    else:
        getNodes(result,model_tagged_file)      # walk the tree with getNodes (a plain for loop; a stack-based traversal works too)


def data_read():
    fout=open('nvp.txt','w',encoding='utf8')
    for line in open('text.txt','r',encoding='utf8'):
        line=line.strip()
        if not line:
            continue                      # skip blank lines (cut_hanlp returns None for them)
        grammer(cut_hanlp(line),fout)     # HanLP segmentation + POS tagging first, then merge the tags into phrase chunks
    fout.close()

if __name__=='__main__':
    data_read()
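As a quick sanity check of the grammar and getNodes, grammer() can also be fed an already POS-tagged sentence directly, with sys.stdout standing in for the output file. The hand-tagged sentence below is purely illustrative (importing the script still starts the HanLP JVM via tokenizer.py).

# Illustrative sketch: bypass cut_hanlp and chunk a hand-tagged sentence.
import sys
tagged = [('我', 'rr'), ('曾', 'd'), ('在', 'p'), ('外企', 'ntc'), ('负责', 'v'),
          ('软件', 'n'), ('项目', 'n'), ('管理', 'vn'), ('工作', 'vn')]
grammer(tagged, sys.stdout)
# Should print one three-space-separated line, roughly:
# 我/O   曾/O   在/PP   外企/NP   负责/VP   软件项目/NP   管理工作/VP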
tokenizer.py:
#encoding=utf8
import os
from jpype import *

# The class path must point to your local HanLP jar and to the directory holding hanlp.properties and the data folder.
hanlp_root="/home/lhq/桌面/NLP_basis/hanlp"
djclass_path="-Djava.class.path="+hanlp_root+os.sep+"hanlp-1.7.3.jar:"+hanlp_root
startJVM(getDefaultJVMPath(), djclass_path,
         "-Xms1g",
         "-Xmx1g")

# Duplicated from the main script (importing it back would create a circular import); used by seg_sentences.
drop_pos_set=set(['xu','xx','y','yg','wh','wky','wkz','wp','ws','wyy','wyz','wb','u','ud','ude1','ude2','ude3','udeng','udh'])

HanLP = JClass('com.hankcs.hanlp.HanLP')                                    # HanLP facade (used by to_string_hanlp)
Tokenizer = JClass('com.hankcs.hanlp.tokenizer.StandardTokenizer')          # standard segmenter with POS tags
def to_string(sentence,return_generator=False):
    # StandardTokenizer segmentation: each term prints as "word/pos", which is split into a (word, pos) pair.
    if return_generator:
        return (word_pos_item.toString().split('/') for word_pos_item in Tokenizer.segment(sentence))
    else:
        return [(word_pos_item.toString().split('/')[0],word_pos_item.toString().split('/')[1]) for word_pos_item in Tokenizer.segment(sentence)]

def to_string_hanlp(sentence,return_generator=False):
    # Same as to_string, but via the HanLP facade segmenter.
    if return_generator:
        return (word_pos_item.toString().split('/') for word_pos_item in HanLP.segment(sentence))
    else:
        return [(word_pos_item.toString().split('/')[0],word_pos_item.toString().split('/')[1]) for word_pos_item in HanLP.segment(sentence)]

def seg_sentences(sentence,with_filter=True,return_generator=False):  
    segs=to_string(sentence,return_generator=return_generator)
    if with_filter:
        g = [word_pos_pair[0] for word_pos_pair in segs if len(word_pos_pair)==2 and word_pos_pair[0]!=' ' and word_pos_pair[1] not in drop_pos_set]
    else:
        g = [word_pos_pair[0] for word_pos_pair in segs if len(word_pos_pair)==2 and word_pos_pair[0]!=' ']
    return iter(g) if return_generator else g
def cut_hanlp(raw_sentence,return_list=True):
    # Returns (word, pos) pairs for a non-empty sentence; blank input yields None.
    if len(raw_sentence.strip())>0:
        return to_string(raw_sentence) if return_list else iter(to_string(raw_sentence))
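
A minimal usage sketch for tokenizer.py, assuming the HanLP jar, data files, and the class path configured above exist on your machine (the exact segmentation depends on your HanLP version and dictionaries):

from tokenizer import cut_hanlp, seg_sentences

print(cut_hanlp('我曾在外企负责软件项目管理工作'))
# -> a list of (word, pos) tuples, e.g. [('我', 'rr'), ('曾', 'd'), ('在', 'p'), ...]
print(seg_sentences('我曾在外企负责软件项目管理工作'))
# -> the words only, with tokens whose tag is in drop_pos_set filtered out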


 
