自然语言处理_tf-idf

import pandas as pd
import math

1.数据预处理

# Two toy documents used throughout the walkthrough.
docA = "The cat sat on my face"
docB = "The dog sat on my bed"

# Tokenise on single spaces; case is preserved, so "The" stays capitalised.
wordsA, wordsB = docA.split(" "), docB.split(" ")

# Vocabulary: every distinct token that appears in either document.
wordsSet = set(wordsA) | set(wordsB)
print(wordsSet)
{'on', 'my', 'face', 'sat', 'dog', 'The', 'cat', 'bed'}

2.计算词的频数

# Start every vocabulary word at zero so both documents share the same keys.
wordCountA = {term: 0 for term in wordsSet}
wordCountB = {term: 0 for term in wordsSet}

# Tally the raw occurrences of each token per document.
for counts, tokens in ((wordCountA, wordsA), (wordCountB, wordsB)):
    for token in tokens:
        counts[token] += 1

# One row per document, one column per vocabulary word.
pd.DataFrame([wordCountA, wordCountB])
   on  my  face  sat  dog  The  cat  bed
0   1   1     1    1    0    1    1    0
1   1   1     0    1    1    1    0    1

3.计算词的频率

def computeTF(wordCount, docWords):
    """Return the term frequency of each vocabulary word in one document.

    Args:
        wordCount: Mapping of word -> number of occurrences in the document.
        docWords: The document's token list; its length is the denominator.

    Returns:
        A dict mapping every key of ``wordCount`` to
        ``count / len(docWords)``. For an empty document every frequency
        is 0.0 (rather than raising ``ZeroDivisionError``).
    """
    docCount = float(len(docWords))
    if not docCount:
        # No tokens at all: every term trivially has frequency zero.
        return dict.fromkeys(wordCount, 0.0)
    return {word: count / docCount for word, count in wordCount.items()}

# Term frequencies for each document, keyed by the shared vocabulary.
tfA = computeTF(wordCountA, wordsA)
tfB = computeTF(wordCountB, wordsB)
# Show document A's frequencies as a sanity check.
print("tfA ", tfA)
tfA  {'on': 0.16666666666666666, 'my': 0.16666666666666666, 'face': 0.16666666666666666, 'sat': 0.16666666666666666, 'dog': 0.0, 'The': 0.16666666666666666, 'cat': 0.16666666666666666, 'bed': 0.0}

4.计算逆文档频率

def computeIDF(docList):
    """Return the smoothed inverse document frequency of every word.

    Args:
        docList: List of per-document mappings of word -> occurrence count.
            A word counts as present in a document when its count is > 0.

    Returns:
        A dict mapping each word found in any document to
        ``log10((N + 1) / (df + 1))``, where ``N`` is the number of
        documents and ``df`` the number of documents containing the word.
        Keys are in first-seen order. An empty ``docList`` yields ``{}``.
    """
    doc_len = len(docList)

    # Collect the vocabulary from *every* document, not just the first one,
    # so documents with differing key sets do not raise KeyError and an
    # empty list does not raise IndexError.
    docFreq = {}
    for doc in docList:
        for word, count in doc.items():
            seen = docFreq.setdefault(word, 0)
            if count > 0:
                docFreq[word] = seen + 1

    # The +1 on both sides keeps the ratio finite when a word occurs in no
    # document (df == 0).
    return {
        word: math.log10((doc_len + 1) / float(df + 1))
        for word, df in docFreq.items()
    }

# IDF is computed once over the whole corpus (both documents).
idf = computeIDF([wordCountA, wordCountB])
print(idf)
{'on': 0.0, 'my': 0.0, 'face': 0.17609125905568124, 'sat': 0.0, 'dog': 0.17609125905568124, 'The': 0.0, 'cat': 0.17609125905568124, 'bed': 0.17609125905568124}

5.计算 TF-IDF

def computeTFIDF(tf, idf):
    """Weight each term frequency by its inverse document frequency.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idf: Mapping of word -> IDF weight; must contain every key of ``tf``
            (a missing word raises ``KeyError``).

    Returns:
        A dict mapping every word in ``tf`` to ``tf[word] * idf[word]``.
    """
    return {word: freq * idf[word] for word, freq in tf.items()}

# TF-IDF scores per document, using the corpus-wide IDF weights.
tfidfA = computeTFIDF(tfA, idf)
tfidfB = computeTFIDF(tfB, idf)
# One row per document, one column per vocabulary word.
pd.DataFrame([tfidfA, tfidfB])
    on   my      face  sat       dog  The       cat       bed
0  0.0  0.0  0.029349  0.0  0.000000  0.0  0.029349  0.000000
1  0.0  0.0  0.000000  0.0  0.029349  0.0  0.000000  0.029349
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值