import gensim
from gensim import corpora, models
from gensim.corpora import Dictionary
from pyltp import Segmentor
corpus = [
    '情况正发生着微妙的变化',
    '给最需要钱的人',
    '给最优秀的人',
    '给导师最需要的人',
]
doc_list = []


def segment():
    # load the LTP word segmentation model and tokenize each document
    segmentor = Segmentor()
    segmentor.load('/usr/local/ltp_data/cws.model')
    for doc in corpus:
        words = list(segmentor.segment(doc))
        doc_list.append(words)
    segmentor.release()
    for words in doc_list:
        print(words)
    '''
    ['情况', '正', '发生', '着', '微妙', '的', '变化']
    ['给', '最', '需要', '钱', '的', '人']
    ['给', '最', '优秀', '的', '人']
    ['给', '导师', '最', '需要', '的', '人']
    '''
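# Note (assumption, not from the original post): the LTP model path above is machine specific.
# Any tokenizer that turns each sentence into a list of words can feed the same pipeline;
# for example, jieba could be swapped in like this:
# import jieba
# doc_list = [jieba.lcut(doc) for doc in corpus]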
def test_Dictionary():
    dictionary = Dictionary(doc_list)  # build the dictionary from the tokenized documents
    dictionary.save('test.dict')  # save the dictionary to disk
    # dictionary = Dictionary.load('test.dict')
    print(dictionary)  # print a summary of the dictionary
    '''
    Dictionary(14 unique tokens: ['发生', '正', '情况', '着', '人']...)
    '''
    print(dictionary.token2id)  # token -> id mapping
    '''
    {'发生': 0, '正': 4, '情况': 3, '着': 6, '人': 7, '钱': 10, '微妙': 2, '优秀': 12, '给': 9, '导师': 13, '的': 5, '需要': 11, '变化': 1, '最': 8}
    '''
    print(dictionary.get(12))  # look a token up by its id
    '''
    优秀
    '''
    new_split_doc = ['我', '是', '一个', '优秀', '优秀', '的', '人']  # a new, already tokenized document
    the_vector_of_new_split_doc = dictionary.doc2bow(new_split_doc)  # bag-of-words vector; tokens missing from the dictionary are ignored
    print(the_vector_of_new_split_doc)
    '''
    [(5, 1), (7, 1), (12, 2)]
    '''
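    # Optional sketch (assumption, not in the original post): doc2bow can also report the
    # tokens it dropped because they are missing from the dictionary, via return_missing=True.
    bow, missing = dictionary.doc2bow(new_split_doc, return_missing=True)
    print(missing)  # expected along the lines of {'我': 1, '是': 1, '一个': 1}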
    corpus_vector = [dictionary.doc2bow(doc) for doc in doc_list]  # vectorize the documents the dictionary was built from
    for doc_vec in corpus_vector:
        print(doc_vec)  # each document as a list of (token id, token count) pairs
    '''
    [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
    [(5, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]
    [(5, 1), (7, 1), (8, 1), (9, 1), (12, 1)]
    [(5, 1), (7, 1), (8, 1), (9, 1), (11, 1), (13, 1)]
    '''
    corpora.MmCorpus.serialize('corpus_vector.mm', corpus_vector)  # serialize the document vectors
    # corpora.MmCorpus.serialize('corpuse.mm', corpus)  # (would serialize the corpus)
    # corpus = corpora.MmCorpus('corpuse.mm')  # and load it back
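# Optional sketch (assumption, not in the original post): for larger corpora the dictionary is
# usually pruned before vectorizing; filter_extremes() drops very rare and very frequent tokens.
# This helper is not wired into __main__.
def prune_dictionary():
    dictionary = Dictionary.load('test.dict')
    # keep tokens that occur in at least 2 documents and in no more than 80% of them
    dictionary.filter_extremes(no_below=2, no_above=0.8)
    print(dictionary.token2id)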
def tf_idf():
    corpus = corpora.MmCorpus('corpus_vector.mm')  # load the serialized document vectors
    dictionary = Dictionary.load('test.dict')  # load the dictionary
    tfidf_model = models.TfidfModel(corpus, id2word=dictionary)  # train the TF-IDF model
    print(tfidf_model)
    '''
    TfidfModel(num_docs=4, num_nnz=24)
    '''
    print(tfidf_model.dfs)  # document frequency of each token id
    '''
    {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 4, 6: 1, 7: 3, 8: 3, 9: 3, 10: 1, 11: 2, 12: 1, 13: 1}
    '''
    doc_set_tfidf = tfidf_model[corpus]  # apply the model to the whole corpus
    print(doc_set_tfidf)  # a lazily evaluated TransformedCorpus
    '''
    <gensim.interfaces.TransformedCorpus object at 0x7f50600d21d0>
    '''
    for doc_tfidf in doc_set_tfidf:
        print(doc_tfidf)  # (token id, TF-IDF weight) pairs per document
    '''
    [(0, 0.4082482904638631), (1, 0.4082482904638631), (2, 0.4082482904638631), (3, 0.4082482904638631), (4, 0.4082482904638631), (6, 0.4082482904638631)]
    [(7, 0.17670342298442518), (8, 0.17670342298442518), (9, 0.17670342298442518), (10, 0.8515058195534599), (11, 0.42575290977672997)]
    [(7, 0.1952870421339958), (8, 0.1952870421339958), (9, 0.1952870421339958), (12, 0.9410573380637679)]
    [(7, 0.17670342298442518), (8, 0.17670342298442518), (9, 0.17670342298442518), (11, 0.42575290977672997), (13, 0.8515058195534599)]
    '''
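    # Sketch of where these numbers come from (assumption about gensim's defaults, not in the
    # original post): TfidfModel weights a term as tf * log2(num_docs / df) and then
    # L2-normalizes each document. Reproducing the third document ['给', '最', '优秀', '的', '人']:
    # '的' occurs in all 4 documents, so its idf is log2(4/4) = 0 and it disappears.
    import math
    raw = {'人': math.log2(4 / 3), '最': math.log2(4 / 3), '给': math.log2(4 / 3), '优秀': math.log2(4 / 1)}
    norm = math.sqrt(sum(w * w for w in raw.values()))
    print({t: round(w / norm, 4) for t, w in raw.items()})  # ≈ {'人': 0.1953, '最': 0.1953, '给': 0.1953, '优秀': 0.9411}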
    topK = 5
    print()
    tfidf_model.save("data.tfidf")  # save the TF-IDF model
    new_tfidf_model = models.TfidfModel.load('data.tfidf')  # load it back
    doc_set_tfidf2 = new_tfidf_model[corpus]
    for doc_tfidf in doc_set_tfidf2:  # use the reloaded model to extract the topK keywords per document
        keys = []
        doc_tfidf = sorted(doc_tfidf, reverse=True, key=lambda x: (x[1], x[0]))  # sort by weight, descending
        for i in range(topK if topK < len(doc_tfidf) else len(doc_tfidf)):
            keys.append(dictionary.get(doc_tfidf[i][0]))
        keys = ' '.join(keys)
        print(keys)
    '''
    着 正 情况 微妙 变化
    钱 需要 给 最 人
    优秀 给 最 人
    导师 需要 给 最
    '''
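# Optional sketch (assumption, not in the original post): the TF-IDF vectors can also feed a
# gensim similarity index to compare documents; num_features must equal the dictionary size.
# This helper is not wired into __main__ and expects the files written by test_Dictionary() and tf_idf().
def similarity():
    from gensim import similarities
    corpus = corpora.MmCorpus('corpus_vector.mm')
    dictionary = Dictionary.load('test.dict')
    tfidf_model = models.TfidfModel.load('data.tfidf')
    index = similarities.MatrixSimilarity(tfidf_model[corpus], num_features=len(dictionary))
    query = tfidf_model[dictionary.doc2bow(['给', '最', '需要', '钱', '的', '人'])]
    print(list(index[query]))  # cosine similarity of the query against each of the 4 documents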
if __name__ == '__main__':
    segment()
    print()
    test_Dictionary()
    print()
    tf_idf()