I recently implemented TF-IDF myself and found that the implementation details differ a bit from the textbook TF-IDF formula, so I'm sharing my implementation process here.
- Import libraries and data
import pandas as pd
import glob
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import math
from tqdm import tqdm

txtfiles = glob.glob('task5/*.txt')
list_data = []
for txt_name in txtfiles:
    print(txt_name)
    with open(txt_name) as fin:
        tokens = word_tokenize(fin.read())  # tokenize the whole document into a word list
    print(tokens[:5])
    list_data.append(tokens)
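A small practical note: if the NLTK data packages haven't been downloaded yet, word_tokenize and stopwords.words('english') below will raise a LookupError. A one-time download fixes that:

import nltk
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stop-word lists used in the next step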
Each .txt file represents one document, and the files contain plain English text.
- Remove punctuation and stop words
import string

# Remove punctuation
list_remove_punc = []
for text_arr in tqdm(list_data):
    text = ' '.join(text_arr)
    table = str.maketrans(dict.fromkeys(string.punctuation))  # or {key: None for key in string.punctuation}
    new_s = text.translate(table)
    list_remove_punc.append(new_s.split())
list_data = list_remove_punc

# Remove stop words; build the set once instead of rebuilding it for every word
stop_words = set(stopwords.words('english'))
list_filtered_data = []
for doc in tqdm(list_data):
    filtered_sentence = [word for word in doc if word not in stop_words]
    list_filtered_data.append(filtered_sentence)
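One detail worth flagging: NLTK's English stop-word list is all lowercase, so capitalized tokens such as 'The' or 'And' slip through the filter above. A minimal extra step (my own addition, not part of the original pipeline) is to lower-case every token right after the punctuation step, before the stop-word filter:

# Hypothetical extra normalization step: lower-case all tokens first
list_data = [[word.lower() for word in doc] for doc in list_data]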
- Compute IDF
idf(t) = log(N / (df(t) + 1))

Here t is a term, df(t) is the document frequency, i.e. the number of documents that contain t (not the total number of occurrences), and N is the total number of documents. The +1 in the denominator is a smoothing term that prevents division by zero; the code below uses a base-10 logarithm.
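A quick worked example (numbers made up for illustration): with N = 10 documents and a term that appears in 4 of them, idf = log10(10 / (4 + 1)) = log10(2) ≈ 0.301. Note that with this smoothing, a term appearing in every document gets idf = log10(10 / 11) < 0, a small quirk of the +1 variant.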
def word_count(word_list):
    # Count occurrences of each word within a single document
    word_dict = {}
    for word in word_list:
        if word in word_dict:
            word_dict[word] += 1
        else:
            word_dict[word] = 1
    return word_dict

def get_vocab(list_filtered_data):
    # The vocabulary is the union of the words of all documents
    vocab = []
    for wordlist in list_filtered_data:
        vocab += wordlist
    return set(vocab)

def computeIDF(vocab, allDocuments):
    idfDict = dict.fromkeys(vocab, 0)
    doc_sets = [set(doc) for doc in allDocuments]  # set membership tests are O(1)
    print(len(idfDict))
    for term in idfDict:
        numDocumentsWithThisTerm = sum(1 for doc in doc_sets if term in doc)
        idfDict[term] = math.log10(float(len(allDocuments)) / (numDocumentsWithThisTerm + 1))
        # An alternative, unsmoothed variant:
        # if numDocumentsWithThisTerm > 0:
        #     idfDict[term] = 1.0 + math.log(float(len(allDocuments)) / numDocumentsWithThisTerm)
        # else:
        #     idfDict[term] = 1.0
    return idfDict

vocab = get_vocab(list_filtered_data)
idf = computeIDF(vocab, list_filtered_data)
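To sanity-check computeIDF, here is a tiny toy corpus (my own made-up data, not from the task5 files):

toy_docs = [['cat', 'sat'], ['cat', 'ran'], ['dog', 'ran']]
toy_idf = computeIDF(get_vocab(toy_docs), toy_docs)
print(toy_idf['cat'])  # in 2 of 3 docs: log10(3 / (2 + 1)) = 0.0
print(toy_idf['dog'])  # in 1 of 3 docs: log10(3 / (1 + 1)) ≈ 0.176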
- Compute TF

TF here is the raw term count normalized by document length: tf(t, d) = count(t, d) / len(d).
def computeTF(word_list):
    # Term frequency: per-document word count divided by document length
    tfDict = {}
    corpusCount = len(word_list)
    wordDict = word_count(word_list)
    for word, count in wordDict.items():
        tfDict[word] = count / float(corpusCount)
    return tfDict

tfs = []
for words in list_filtered_data:
    tf = computeTF(words)
    tfs.append(tf)
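For instance, on a toy word list:

print(computeTF(['cat', 'sat', 'cat']))
# {'cat': 0.6666666666666666, 'sat': 0.3333333333333333}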
- Compute TF-IDF
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val * idfs[word]
    return tfidf

tfidfs = []
for tf in tfs:
    tfidf = computeTFIDF(tf, idf)
    tfidfs.append(tfidf)

list_words = []
list_values = []
for tfidf in tfidfs:
    d_order = sorted(tfidf.items(), key=lambda x: x[1], reverse=True)  # sort by the second element of each (word, score) tuple, i.e. the score
    print(d_order[:10])
    for k, v in d_order[:10]:
        list_words.append(k)
        list_values.append(v)
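Continuing the toy corpus from the IDF sanity check above, this is how the two pieces combine:

toy_tf = computeTF(toy_docs[0])        # {'cat': 0.5, 'sat': 0.5}
print(computeTFIDF(toy_tf, toy_idf))   # {'cat': 0.0, 'sat': 0.5 * log10(3/2) ≈ 0.088}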
- Write the top-10 TF-IDF terms of each document to a file
list_names = []
for txtname in txtfiles:
    print(txtname)
    for i in range(10):
        # strip the directory and the '.txt' extension to get the book title;
        # note: this must use the loop variable txtname, not the stale txt_name from the loading loop
        list_names.append(txtname.split('/')[-1][:-4])

list_res = []
for i in range(len(list_words)):
    list_res.append([list_names[i], list_words[i], list_values[i]])
column_name = ['Book title', 'Word', 'TF-IDF']
csv_name = 'Books_TFIDF_test_scratch.csv'
xml_df = pd.DataFrame(list_res, columns=column_name)
xml_df.to_csv(csv_name, index=None)
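As a cross-check, scikit-learn's TfidfVectorizer computes the same idea but with different conventions (by default idf = ln((1 + N) / (1 + df)) + 1 and each row is L2-normalized), so its numbers won't match this from-scratch version exactly, though the top-ranked words per document should look similar. A minimal sketch, assuming scikit-learn is installed:

from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join the filtered tokens into one string per document
docs = [' '.join(doc) for doc in list_filtered_data]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(docs)  # sparse matrix of shape (n_docs, n_vocab)
print(matrix.shape)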
This implementation drew on quite a few implementations found online. If you spot any bugs in the code, feel free to discuss them with me. I rarely implement a machine learning algorithm from scratch, but having done it once, I found I learned a lot; it seems real learning has to go deep rather than stay on the surface.
References
[1] TF IDF | TFIDF Python Example. https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
[2] Creating TF-IDF Model from Scratch. https://www.kaggle.com/yassinehamdaoui1/creating-tf-idf-model-from-scratch