# encoding: utf-8
import re
import os
import jieba
import jieba.posseg as pseg
import numpy as np


class TFIDFCounter:
    # Constructor
    def __init__(self):
        # Instance attributes
        self.tfs = {}     # term frequency of each word per document, e.g. {"docid1": {'苹果': tf, '乔布斯': tf, ...}, "docid2": {'手机': tf, '乔布斯': tf, ...}}
        self.tfidfs = {}  # tf-idf of each word per document, e.g. {"docid1": {'苹果': tfidf, '乔布斯': tfidf, ...}, "docid2": {'手机': tfidf, '乔布斯': tfidf, ...}}
        self.idfs = {}    # idf of each word in the corpus vocabulary, e.g. {'苹果': idf, '乔布斯': idf, '手机': idf, ...}
        self.termset = set()  # corpus vocabulary; a set avoids duplicates and must persist across documents
        self.stopwordsPath = r'C:\Users\adcar\PycharmProjects\pythonProject2\NLP\datas\stopwords.txt'
    # Load one document from disk
    def load_data(self, filepath):
        with open(filepath, 'r', encoding='gbk') as fin:
            content = fin.read()
        return re.sub(r'[\n]', '', content)  # strip line breaks so the document is a single string
    # Add one document to the corpus
    def add(self, docid, content):
        # Tokenize with part-of-speech tags
        # document = jieba.lcut(content)  # plain token list without POS filtering
        psresult = pseg.cut(content)
        # Filter by part of speech: drop punctuation/symbols ('x'), numerals ('m') and single-character tokens
        document = [x.word for x in psresult if x.flag not in ['x', 'm'] and len(x.word) > 1]
        print(document)
        # Remove stopwords
        stopwords = self.get_stopwords()
        document = [word for word in document if word not in stopwords]
        # Add the remaining tokens to the corpus vocabulary
        self.termset.update(document)
        # Count the term frequency of each word in this document
        self.getTf(docid, document)
    # Load the stopword list
    def get_stopwords(self):
        stopwordset = set()
        with open(self.stopwordsPath, 'r', encoding='gbk') as fin:
            for line in fin:
                stopwordset.add(line.strip())
        return stopwordset
    # Run the corpus-level statistics once all documents have been added
    def computer(self):
        # 1. tf is already computed per document in add()
        # 2. compute idf over the whole corpus
        self.getIdf()
        # 3. compute tf-idf per document
        self.getTfIdf()
    # Compute tf
    def getTf(self, docid, document):
        # Term frequency of each word in one document
        # Formula: occurrences of the word in the document / total number of tokens in the document
        total = len(document)
        doc_tfs = {}  # {'苹果': tf, '乔布斯': tf, ...}
        if total != 0:
            for word in document:
                count = document.count(word)
                word_tf = count / total
                doc_tfs.update({word: word_tf})
        self.tfs.update({docid: doc_tfs})
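    # Worked example (illustrative values only): for the token list
    # ['苹果', '手机', '苹果'], tf('苹果') = 2 / 3 ≈ 0.667 and tf('手机') = 1 / 3 ≈ 0.333.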
    # Compute idf
    def getIdf(self):
        # self.tfs: {"docid1": {'苹果': tf, '乔布斯': tf, ...}, "docid2": {'手机': tf, '乔布斯': tf, ...}}
        # Formula: log(total number of documents / (number of documents containing the word + 1))
        # Result: idf of each word in the corpus vocabulary {'苹果': idf, '乔布斯': idf, '手机': idf, ...}
        total = len(self.tfs)  # total number of documents in the corpus
        for word in self.termset:
            count = 1  # start at 1 so the denominator becomes (documents containing the word + 1)
            for document in self.tfs.values():
                if word in document.keys():
                    count += 1
            word_idf = np.log(total / count)
            self.idfs.update({word: word_idf})
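    # Worked example (illustrative values only): with 10 documents in the corpus
    # and '苹果' appearing in 4 of them, idf('苹果') = log(10 / (4 + 1)) = log(2) ≈ 0.693.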
    # Compute tf-idf
    def getTfIdf(self):
        # Formula: tf * idf
        # Result: tf-idf of each word per document {"docid1": {'苹果': tfidf, ...}, "docid2": {'手机': tfidf, ...}}
        for docid, document in self.tfs.items():
            doc_tfidfs = {}
            for word in document.keys():
                if word in self.termset:
                    word_tf = document.get(word)
                    word_idf = self.idfs.get(word)
                    word_tfidf = word_tf * word_idf
                    doc_tfidfs.update({word: word_tfidf})
            self.tfidfs.update({docid: doc_tfidfs})
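    # Worked example (illustrative values only): continuing the numbers above,
    # tfidf('苹果') = tf * idf ≈ 0.667 * 0.693 ≈ 0.462.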
    # Extract the top-N keywords of a given document
    def getKeywordsTopN(self, docid, topN):
        doc_tfidfs = self.tfidfs.get(docid, {})
        # Sort the document's words by tf-idf in descending order and keep the first topN
        return sorted(doc_tfidfs.items(), key=lambda item: item[1], reverse=True)[:topN]


if __name__ == '__main__':
    dirName = r'C:\Users\adcar\PycharmProjects\pythonProject2\NLP\datas\datas'
    counter = TFIDFCounter()
    # 1. Add every document in the corpus directory
    for fileName in os.listdir(dirName):
        filepath = os.path.join(dirName, fileName)
        content = counter.load_data(filepath)
        docid = re.sub(r'\.txt', '', fileName)
        counter.add(docid, content)
    # 2. Compute idf and tf-idf once all documents have been added
    counter.getIdf()
    counter.getTfIdf()
    print(counter.tfidfs.keys())
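    # Illustrative follow-up (the document id below is hypothetical and depends on
    # the .txt file names in dirName): print the 10 highest-scoring keywords of one document.
    # print(counter.getKeywordsTopN('doc1', 10))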