import math

import numpy as np
import pandas as pd
# Two sample "documents" used to demonstrate TF-IDF by hand.
str01 = "the hello my union left spark flink"
str02 = "hive hadoop spark my keep my hbase the is datatabase table partition"
str_list01 = str01.split(" ")
str_list02 = str02.split(" ")
# Vocabulary: every distinct word appearing in either document.
wordset = set(str_list01).union(set(str_list02))
# Per-document word counts over the shared vocabulary (0 for absent words).
wordDict01 = dict.fromkeys(wordset, 0)
wordDict02 = dict.fromkeys(wordset, 0)
# Loop variable renamed from `str`: never shadow the builtin type name.
for word in str_list01:
    wordDict01[word] += 1
for word in str_list02:
    wordDict02[word] += 1
print(wordDict01)
print(wordDict02)
# One row per document, one column per vocabulary word.
print(pd.DataFrame([wordDict01, wordDict02]))
def count_TF(worddict, bow):
    """Return the term frequency of each word in *worddict*.

    TF(word) = raw count of the word / total number of tokens in *bow*.
    Words absent from the document (count 0) map to 0.0.
    """
    total_tokens = len(bow)
    return {word: count / total_tokens for word, count in worddict.items()}
# Show the term-frequency vector of each document.
for counts, tokens in ((wordDict01, str_list01), (wordDict02, str_list02)):
    print(count_TF(counts, tokens))
def count_IDF(worddict_list):
    """Return the smoothed inverse document frequency for each word.

    idf(word) = log10((N + 1) / (df + 1)), where N is the number of
    documents and df is the number of documents containing the word.
    The +1 smoothing avoids division by zero for unseen words.

    worddict_list: list of per-document {word: count} dicts that all
    share the same vocabulary (keys are taken from the first dict).
    """
    # `math` is now imported at module level instead of inside the
    # function; local name fixed from the misspelled `idfidct`.
    idf_dict = dict.fromkeys(worddict_list[0], 0)
    doc_count = len(worddict_list)
    # First pass: document frequency (how many docs contain each word).
    for worddict in worddict_list:
        for word, count in worddict.items():
            if count > 0:
                idf_dict[word] += 1
    # Second pass: convert document frequencies to smoothed IDF values.
    for word, df in idf_dict.items():
        idf_dict[word] = math.log10((doc_count + 1) / (df + 1))
    return idf_dict
print(count_IDF([wordDict01,wordDict02]))
def count_TFIDF(tf, idfs):
    """Combine TF and IDF: tfidf(word) = tf(word) * idf(word).

    tf:   {word: term frequency} for one document.
    idfs: {word: inverse document frequency} over the corpus; must
          contain every key present in *tf*.
    """
    return {word: freq * idfs[word] for word, freq in tf.items()}
# TF-IDF per document: element-wise TF times the shared IDF vector.
# The IDF is computed once and reused for both documents.
idfs = count_IDF([wordDict01, wordDict02])
print(count_TFIDF(count_TF(wordDict01, str_list01), idfs))
# Bug fix: document 2's TF must be computed over its own token list
# (str_list02) — the original passed str_list01, so document 2's term
# frequencies were normalized by the wrong document length.
print(count_TFIDF(count_TF(wordDict02, str_list02), idfs))