import math

import numpy as np
import pandas as pd
# Two sample "documents" used to demonstrate TF-IDF by hand.
str01 = "the hello my union left spark flink"
str02 = "hive hadoop spark my keep my hbase the is datatabase table partition"
str_list01 = str01.split(" ")
str_list02 = str02.split(" ")
# Vocabulary: every distinct word appearing in either document.
wordset = set(str_list01).union(set(str_list02))
# Per-document word counts over the shared vocabulary (0 for absent words).
wordDict01 = dict.fromkeys(wordset, 0)
wordDict02 = dict.fromkeys(wordset, 0)
# Loop variable renamed from `str`: never shadow the builtin type name.
for word in str_list01:
    wordDict01[word] += 1
for word in str_list02:
    wordDict02[word] += 1
print(wordDict01)
print(wordDict02)
# One row per document, one column per vocabulary word.
print(pd.DataFrame([wordDict01, wordDict02]))
def count_TF(worddict, bow):
    """Return the term frequency of each word in *worddict*.

    TF(word) = raw count of the word / total number of tokens in *bow*.
    Words absent from the document (count 0) map to 0.0.
    """
    total_tokens = len(bow)
    return {word: count / total_tokens for word, count in worddict.items()}
# Show the term-frequency vector of each document.
for counts, tokens in ((wordDict01, str_list01), (wordDict02, str_list02)):
    print(count_TF(counts, tokens))
def count_IDF(worddict_list):
    """Return the smoothed inverse document frequency for each word.

    idf(word) = log10((N + 1) / (df + 1)), where N is the number of
    documents and df is the number of documents containing the word.
    The +1 smoothing avoids division by zero for unseen words.

    worddict_list: list of per-document {word: count} dicts that all
    share the same vocabulary (keys are taken from the first dict).
    """
    # `math` is now imported at module level instead of inside the
    # function; local name fixed from the misspelled `idfidct`.
    idf_dict = dict.fromkeys(worddict_list[0], 0)
    doc_count = len(worddict_list)
    # First pass: document frequency (how many docs contain each word).
    for worddict in worddict_list:
        for word, count in worddict.items():
            if count > 0:
                idf_dict[word] += 1
    # Second pass: convert document frequencies to smoothed IDF values.
    for word, df in idf_dict.items():
        idf_dict[word] = math.log10((doc_count + 1) / (df + 1))
    return idf_dict
print(count_IDF([wordDict01,wordDict02]))
def count_TFIDF(tf, idfs):
    """Combine TF and IDF: tfidf(word) = tf(word) * idf(word).

    tf:   {word: term frequency} for one document.
    idfs: {word: inverse document frequency} over the corpus; must
          contain every key present in *tf*.
    """
    return {word: freq * idfs[word] for word, freq in tf.items()}
# TF-IDF per document: element-wise TF times the shared IDF vector.
# The IDF is computed once and reused for both documents.
idfs = count_IDF([wordDict01, wordDict02])
print(count_TFIDF(count_TF(wordDict01, str_list01), idfs))
# Bug fix: document 2's TF must be computed over its own token list
# (str_list02) — the original passed str_list01, so document 2's term
# frequencies were normalized by the wrong document length.
print(count_TFIDF(count_TF(wordDict02, str_list02), idfs))