Python lexical chains

GitHub address

import nltk
nltk.download('wordnet')
nltk.download('wordnet_ic')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

wupThreshold = 0.6  # threshold for Wu-Palmer similarity
jcnThreshold = 0.09  # threshold for Jiang-Conrath similarity
pathThreshold = 0.2  # threshold for path similarity
brown_ic = wordnet_ic.ic('ic-brown.dat')  # information-content counts from the Brown corpus (needed by jcn)
lexical_chains = []  # empty list to hold all the chains
dictionary = {}  # empty dictionary to hold the count of each word encountered


class Chain():
    def __init__(self, words, senses, count=0):
        self.words = set(words)
        self.senses = set(senses)
        self.count = count
        dictionary[words[0]] = 1  # initialize the counter for the seed word

    def addWord(self, word):
        if word in self.words:  # word already in this chain: bump its count
            dictionary[word] += 1
        else:
            dictionary[word] = 1
        self.words.add(word)

    def addSense(self, sense):
        self.senses.add(sense)

    def getWords(self):
        return self.words

    def getSenses(self):
        return self.senses

    def incCount(self):
        self.count += 1
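
# A Chain groups words whose WordNet senses are similar. For example (a
# hypothetical seed): Chain(['machine'], wn.synsets('machine', pos=wn.NOUN))
# starts a new chain holding the word 'machine' and all of its noun senses.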


def add_word(word):
    maximum = 0
    maxJCN = 0
    flag = 0
    maxChain = None
    maxSense = None
    synsets = wn.synsets(word, pos=wn.NOUN)  # only noun senses, since only nouns are chained
    for chain in lexical_chains:  # for all chains that are present
        for synset in synsets:  # for all noun synsets of the current word
            for sense in chain.senses:  # for all senses already in the current chain
                similarity = sense.wup_similarity(synset)  # Wu-Palmer similarity
                if similarity is not None and similarity >= maximum:
                    if similarity >= wupThreshold:
                        JCN = sense.jcn_similarity(synset, brown_ic)  # Jiang-Conrath similarity
                        if JCN >= jcnThreshold:
                            path = sense.path_similarity(synset)  # path similarity
                            if path is not None and path >= pathThreshold:
                                if JCN >= maxJCN:
                                    maximum = similarity
                                    maxJCN = JCN
                                    maxChain = chain
                                    maxSense = synset  # remember the best-matching sense
                                    flag = 1
    if flag == 1:
        maxChain.addWord(word)
        maxChain.addSense(maxSense)
        return

    # no existing chain is similar enough: start a new chain for this word
    lexical_chains.append(Chain([word], synsets))

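# A quick illustration of the three measures (a sketch; 'dog' and 'cat' are
# arbitrary example words, and the values shown are approximate):
#   dog = wn.synsets('dog', pos=wn.NOUN)[0]
#   cat = wn.synsets('cat', pos=wn.NOUN)[0]
#   dog.wup_similarity(cat)            # ~0.86 -> passes wupThreshold (0.6)
#   dog.jcn_similarity(cat, brown_ic)  # ~0.45 -> passes jcnThreshold (0.09)
#   dog.path_similarity(cat)           # 0.2   -> meets pathThreshold (0.2)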

fileName = input("Enter the file path and name without the extension; e.g. for 'nlp.txt', type 'nlp'\n\n")
fileName += ".txt"
print("\n\n")
# fileName = "nlp.txt"  # input file name
File = open(fileName)  # open the file
lines = File.read()  # read the whole file

# is_noun = lambda pos: pos in ('NN', 'NNP', 'NNS', 'NNPS')
nouns = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(lines)) if pos in ('NN', 'NNP', 'NNS', 'NNPS')]  # extract all nouns

print(nouns)
# Print all the nouns.
# NN    Noun, singular or mass
# NNS   Noun, plural
# NNP   Proper noun, singular
# NNPS  Proper noun, plural
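
# nltk.pos_tag takes a token list and returns (token, tag) pairs; as a rough
# illustration, pos_tag(word_tokenize("the dog chased two cats")) yields
# pairs such as ('dog', 'NN') and ('cats', 'NNS').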

for word in nouns:
    add_word(word)

# print every chain, with each word's occurrence count in parentheses
for chain in lexical_chains:
    print(", ".join(word + "(" + str(dictionary[word]) + ")" for word in chain.getWords()))

from collections import Counter
result = dict(Counter(nouns))  # raw frequency of every noun
print(result)
print([key for key, value in result.items() if value > 1])
print({key: value for key, value in result.items() if value > 1})
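
# For instance (a toy illustration): if nouns were ['dog', 'cat', 'dog'],
# Counter would give {'dog': 2, 'cat': 1}, the list comprehension would print
# ['dog'], and the dict comprehension would print {'dog': 2}.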




# Workflow:
# 1. Read the input file name.
# 2. Find all nouns and print the noun list (duplicates included).
# 3. Build the lexical chains for the source text.
# 4. Aggregate the noun list and print the words with frequency greater than 1.