Computing text similarity with jieba, wordcloud, and gensim

'''
1. Read the documents
2. Segment each of the documents to be compared
3. Arrange the documents into the format needed for later computation
4. Compute word frequencies
5. [Optional] Filter out low-frequency words
6. Build a dictionary from the corpus
7. Load the document to compare against
8. Turn that document into a sparse vector with doc2bow
9. Process the sparse vectors further to obtain a new corpus
10. Run the new corpus through TfidfModel to get tf-idf weights
11. Get the number of features from token2id
12. Build a sparse-matrix similarity index
13. Read off the final similarity scores
'''
from gensim import corpora, models, similarities
import jieba
# The imports below are only needed for the word-cloud part mentioned in the
# title; they are not used in the similarity computation itself
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

# 1. Read the two reference documents (HTML pages saved locally; they could
#    also be fetched with urllib.request if they live on a server)
d1 = open("ljm.html", "r", encoding="utf-8").read()
d2 = open("gcd.html", "r", encoding="utf-8").read()

# 2-3. Segment with jieba and join the tokens with spaces
data11 = " ".join(jieba.cut(d1))
data21 = " ".join(jieba.cut(d2))

documents = [data11, data21]
texts = [[word for word in document.split()] for document in documents]
print(texts[0])
print("======================")
print(texts[1])

# 4-5. Count token frequencies and keep only tokens appearing more than 25 times
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 25] for text in texts]

# 6. Build the dictionary and save it to disk
dictionary = corpora.Dictionary(texts)
dictionary.save('12345.txt')

# 7-8. Load and segment the document to compare, then convert it to a
#      bag-of-words vector with the same dictionary
d3 = open("dmbj.html", "r", encoding="utf-8").read()
data31 = " ".join(jieba.cut(d3))
new_doc = data31
new_vec = dictionary.doc2bow(new_doc.split())

# 9. Convert the reference texts to bag-of-words vectors and serialize the corpus
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('6562.txt', corpus)

# 10-12. Train a tf-idf model and build a sparse-matrix similarity index
tfidf = models.TfidfModel(corpus)
featureNum = len(dictionary.token2id.keys())
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=featureNum)

# 13. Similarity of the new document against each reference document
sims = index[tfidf[new_vec]]
print(sims)
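sims is a one-dimensional array with one cosine similarity score per reference document, in the same order as corpus (here ljm.html first, gcd.html second). A minimal sketch of how the scores could be labelled and ranked; the names list is an assumption added for illustration, not part of the original script:

names = ["ljm.html", "gcd.html"]  # hypothetical labels, one per reference document (assumption)
ranked = sorted(zip(names, sims), key=lambda pair: pair[1], reverse=True)
for name, score in ranked:
    print(f"{name}: {score:.4f}")  # higher score = more similar to dmbj.html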
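The title and the imports mention wordcloud, but the snippet above never calls it. A minimal sketch of how the space-joined segmentation result (data11) could be turned into a word cloud; the font path and mask image here are assumptions and must point at files that actually exist locally (a Chinese font is required for Chinese text to render):

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

mask = np.array(Image.open("mask.png"))    # assumed mask image path
wc = WordCloud(font_path="simhei.ttf",     # assumed Chinese font file
               mask=mask,
               background_color="white")
wc.generate(data11)                        # data11: space-joined jieba output from above

plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()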