TF-IDF算法详解-优快云博客

本文链接：https://blog.youkuaiyun.com/dawei1980/article/details/6730770

本文详细介绍了TF-IDF算法中的TF（Term Frequency）部分计算原理及实现过程，通过具体示例解释了如何计算一个词在文档中的频率，并展示了用于计算TF值的Java类实现。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

TF(Term Frequency)计算公式: $TF_{i,j} = \dfrac{Freq_{i,j}}{\max Freq_{j}}$

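以上公式中 $Freq_{i,j}$ 是词 i 在文件 $d_j$ 中的出现次数，$\max Freq_{j}$ 是文件 $d_j$ 中出现次数最多的那个词的出现次数（即文件内单个词的最大出现次数，而不是所有字词出现次数之和）。例如文件 $d_j$ 的分词结果为 [x, x, y, x, y, z]，则 $\max Freq_{j} = 3$，TF(x) = 3/3 = 1，TF(y) = 2/3，TF(z) = 1/3。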

/**
 * Computes max-normalised term frequencies (TF) for a list of tokenised documents.
 *
 * <p>For each document j and each distinct term i in it:
 * {@code TF(i,j) = Freq(i,j) / maxFreq(j)}, where {@code Freq(i,j)} is the number of
 * times term i occurs in document j and {@code maxFreq(j)} is the highest occurrence
 * count of any single term in document j. The most frequent term of a non-empty
 * document therefore always has TF 1.0.
 */
class TFs
{
// Result of the most recent PrintTFs() call: one term -> TF map per document.
ArrayList<HashMap<String,Double>> TFsList = new ArrayList<HashMap<String, Double>>();
// Input documents; each inner list holds the tokens of one document.
ArrayList<ArrayList<String>> TFsMainFileList = new ArrayList<ArrayList<String>>();

/**
 * Creates a calculator over the given documents.
 *
 * @param tf the documents, each given as its list of tokens; the reference is stored
 *           as-is (no copy is made)
 */
public TFs(ArrayList<ArrayList<String>> tf)
{
   TFsMainFileList = tf;
}

/**
 * Computes the TF map of every document.
 *
 * <p>Despite its name this method prints nothing; it only computes and returns.
 *
 * @return the {@code TFsList} field, holding one map per input document (same order
 *         as the input); each map associates every distinct term of that document
 *         with its TF value. An empty document yields an empty map.
 */
public ArrayList<HashMap<String,Double>> PrintTFs()
{
  // Start from a clean slate so that repeated calls do not append duplicate results.
  TFsList.clear();

  for (ArrayList<String> document : TFsMainFileList)
  {
   // Pass 1: count occurrences of each term and track the largest count seen.
   HashMap<String,Double> termCounts = new HashMap<String,Double>();
   double maxCount = 0.0;

   for (String term : document)
   {
    Double previous = termCounts.get(term);
    double updated = (previous == null) ? 1.0 : previous.doubleValue() + 1.0;
    termCounts.put(term, updated);

    // Checked on every occurrence (including the first) so that a document whose
    // terms are all unique gets maxCount == 1 rather than 0, which would make the
    // division below produce Infinity.
    if (updated > maxCount)
    {
     maxCount = updated;
    }
   }

   // Pass 2: normalise each distinct term's count by the maximum count.
   // maxCount is >= 1 whenever termCounts is non-empty, so no division by zero occurs.
   HashMap<String,Double> termTFs = new HashMap<String,Double>();
   for (String term : termCounts.keySet())
   {
    termTFs.put(term, termCounts.get(term).doubleValue() / maxCount);
   }

   TFsList.add(termTFs);
  }
  return TFsList;
}
}

TF-IDF 的计算二