from pyspark import SparkConf, SparkContext
import math
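# Computes TF-IDF vectors for a small in-memory corpus with Spark RDDs:
# document frequencies and IDF weights are computed in parallel, then each
# document is mapped to an L2-normalized TF-IDF vector over the full vocabulary.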
def word_contains(words_list):
    # Return the distinct words of one document; used for document-frequency counting.
    return list(set(words_list))
def computeIDF(word_df_tuple, num_document):
    # Smoothed inverse document frequency: idf = log2((N + 1) / (df + 1)).
    word, df = word_df_tuple
    word_idf = math.log(float(num_document + 1) / float(df + 1), 2)
    return (word, word_idf)
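# Worked example for the corpus below (N = 3 documents): "china" appears in two
# documents, so idf = log2((3 + 1) / (2 + 1)) ≈ 0.415, while "hello" appears in
# all three, giving idf = log2(4 / 4) = 0, i.e. no discriminative weight.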
def computeTF(words_list, all_words_list):
    # Term frequency of each vocabulary word within one document.
    words_num = len(words_list)
    words_dic = {}
    for word in words_list:
        words_dic[word] = words_dic.get(word, 0) + 1
    tf_vector = []
    for word in all_words_list:
        if word in words_dic:
            tf_vector.append(float(words_dic[word]) / words_num)
        else:
            tf_vector.append(0)
    return tf_vector
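# Worked example: in ["hello", "world", "china", "good", "spark", "good"] the
# word "good" occurs 2 times out of 6, so its tf entry is 2/6 ≈ 0.333; words
# from the vocabulary that are absent from the document get a 0 entry.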
def computeTFIDF(tf_vector, words_idf_dic, all_words_list):
    # Elementwise product of the document's tf vector with the global idf weights.
    tfidf_vector = []
    for i, word in enumerate(all_words_list):
        tfidf_vector.append(tf_vector[i] * words_idf_dic[word])
    return tfidf_vector
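# Both the tf vector and the idf lookup are keyed to the order of all_words_list,
# so the i-th entry refers to the same word in every document's vector.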
def normalize(tfidf_vector):
    # L2-normalize the vector; guard against an all-zero vector to avoid
    # division by zero.
    squared_sum = 0.0
    for item in tfidf_vector:
        squared_sum += math.pow(item, 2)
    sqrt_sum = math.sqrt(squared_sum)
    if sqrt_sum == 0:
        return tfidf_vector
    return [item / sqrt_sum for item in tfidf_vector]
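# L2 normalization rescales each vector to unit length (v_i / sqrt(sum_j v_j^2)),
# so cosine similarity between documents reduces to a plain dot product.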
if __name__ == "__main__":
    conf = SparkConf().setAppName("tfidf")
    sc = SparkContext(conf=conf)
    documents_list = [["hello", "world", "china", "good", "spark", "good"],
                      ["hello", "china", "china", "great", "love", "china"],
                      ["love", "spark", "spark", "good", "hello", "spark"]]
    tokenized_document_rdd = sc.parallelize(documents_list).cache()
print "*************************** compute idf************************************"
num_document=tokenized_document_rdd.count()
words_df_rdd=tokenized_document_rdd.flatMap(lambda words_list:word_contains(words_list)) \
.map(lambda word:(word,1)) \
.reduceByKey(lambda a,b:a+b)
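    # Note: word_contains() deduplicated each document's words above, so the
    # reduceByKey sum counts the number of documents containing a word (its df),
    # not raw term occurrences.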
    words_idf_rdd = words_df_rdd.map(lambda word_df_tuple:
                                     computeIDF(word_df_tuple, num_document))
print "*********************************** compute tf *******************************"
all_words_list= tokenized_document_rdd.flatMap(lambda words_list:words_list) \
.distinct() \
.collect()
all_words_broadcast=sc.broadcast(all_words_list)
document_tf_rdd= tokenized_document_rdd.map(lambda words_list:
computeTF(words_list, all_words_broadcast.value))
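    # Broadcasting ships one read-only copy of the vocabulary to each executor
    # instead of re-serializing it into every task closure; the IDF dictionary
    # below is broadcast for the same reason.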
print "******************************* compute tfidf*********************************"
words_idf_list= words_idf_rdd.collect()
words_idf_dic={}
for item in words_idf_list:
words_idf_dic[item[0]]=item[1]
words_idf_broadcast=sc.broadcast(words_idf_dic)
document_tfidf_rdd= document_tf_rdd.map(lambda words_tf_list:computeTFIDF(words_tf_list,
words_idf_broadcast.value,all_words_broadcast.value))
normalized_document_tfidf_rdd= document_tfidf_rdd.map(lambda tfidf_vector:
nomoralize(tfidf_vector))
print "************************** print tfidf vectors*********************************"
tfidf_vectors= normalized_document_tfidf_rdd.collect()
for item in tfidf_vectors:
print item
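# A minimal way to run this script (assuming it is saved as tfidf.py and Spark's
# bin directory is on the PATH):
#   spark-submit tfidf.py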