# One inner list of keywords per topic; the position in the outer list is the
# topic index used by the per-topic counts computed further down.
keyword_list = [
    ["union", "workers", "strike", "pay", "rally", "free", "immigration"],
    ["farmer", "plants", "fruits", "workers"],
    ["outside", "field", "party", "clothes", "fashions"],
]
def label_maker_topic(tokens, topic_words_broadcasted):
    """Count, for each topic, how many of ``tokens`` are keywords of that topic.

    Args:
        tokens: Iterable of tokens to score. ``None`` (for example a null
            column value when this function runs inside a UDF) is treated as
            "no tokens" instead of raising ``TypeError``. A bare ``str`` is
            iterated character by character, so pass a list of words.
        topic_words_broadcasted: Object whose ``.value`` attribute is a
            sequence of keyword collections, one per topic (for example a
            Spark broadcast variable).

    Returns:
        A list of ints with one entry per topic, in the same order as
        ``topic_words_broadcasted.value``. Each entry is the number of tokens
        found in that topic's keywords; a repeated token is counted each time
        it occurs.
    """
    # Read the broadcast payload once rather than on every inner iteration.
    topics = topic_words_broadcasted.value

    if tokens is None:
        # Nothing to match: every topic gets a zero count.
        return [0] * len(topics)

    return [
        sum(1 for tkn in tokens if tkn in words)
        for words in topics
    ]
def make_topic_word_better(topic_words_broadcasted):
    """Build a UDF that scores a token column against the broadcast topics.

    The broadcast handle is captured by a closure, so the resulting UDF takes
    only the column value and forwards it, together with the handle, to
    ``label_maker_topic``.

    Args:
        topic_words_broadcasted: Broadcast handle passed through unchanged to
            ``label_maker_topic`` on every call.

    Returns:
        The object produced by ``F.udf`` for the closure. No return type is
        given to ``F.udf``, so the library default applies (documented by
        PySpark as ``StringType`` -- confirm if an array column is expected).
    """
    # The inner name ``f`` is kept on purpose: the UDF is built from this
    # function object, so its name is part of what callers may observe.
    def f(column_value):
        per_topic_counts = label_maker_topic(column_value, topic_words_broadcasted)
        return per_topic_counts

    topic_udf = F.udf(f)
    return topic_udf
# Demo input: every row carries a *list* of tokens. With a bare string in the
# column the UDF would iterate single characters, which can never equal a
# whole-word keyword, so every topic count would be zero.
df = spark.createDataFrame([[["union"]], [["party"]]]).toDF("tokens")

# Broadcast the keyword lists and pass the broadcast handle to the UDF factory.
b = spark.sparkContext.broadcast(keyword_list)

# Add the per-topic counts as a "topics" column and print the result.
df.withColumn("topics", make_topic_word_better(b)(F.col("tokens"))).show()