TF(Term Frequency)计算公式: $TF_{i,j} = \dfrac{Freq_{i,j}}{\max Freq_{j}}$
以上公式中 $Freq_{i,j}$ 是词 i 在文件 $d_j$ 中的出现次数,$\max Freq_{j}$ 是文件 $d_j$ 中出现次数最多的词的出现次数(即单个词出现次数的最大值,而不是所有字词出现次数之和).
/**
 * Computes max-normalized term frequencies (TF) for a collection of documents.
 *
 * <p>For term i in document j: {@code TF(i,j) = freq(i,j) / maxFreq(j)}, where
 * {@code freq(i,j)} is the number of times term i occurs in document j and
 * {@code maxFreq(j)} is the largest occurrence count of any single term in
 * document j. The most frequent term of a document therefore has TF 1.0.
 */
class TFs
{
    /** Result of the most recent {@link #PrintTFs()} call: one term-to-TF map per document. */
    ArrayList<HashMap<String,Double>> TFsList = new ArrayList<HashMap<String, Double>>();

    /** Input documents; each inner list holds the terms of one document in order. */
    ArrayList<ArrayList<String>> TFsMainFileList = new ArrayList<ArrayList<String>>();

    /**
     * @param tf the documents to analyse, each given as a list of terms;
     *           {@code null} is treated as "no documents"
     */
    public TFs(ArrayList<ArrayList<String>> tf)
    {
        if (tf != null)
        {
            TFsMainFileList = tf;
        }
    }

    /**
     * Computes the TF map of every document.
     *
     * <p>Each call builds a fresh result list (also stored in {@code TFsList}),
     * so calling this method repeatedly does not accumulate duplicate entries.
     *
     * @return one map per input document, in input order, from each distinct
     *         term to its max-normalized frequency; an empty document yields
     *         an empty map
     */
    public ArrayList<HashMap<String,Double>> PrintTFs()
    {
        ArrayList<HashMap<String,Double>> result =
                new ArrayList<HashMap<String,Double>>(TFsMainFileList.size());
        for (ArrayList<String> document : TFsMainFileList)
        {
            result.add(computeDocumentTf(document));
        }
        TFsList = result;
        return result;
    }

    /** Counts the terms of one document and divides each count by the largest count. */
    private static HashMap<String,Double> computeDocumentTf(ArrayList<String> document)
    {
        HashMap<String,Integer> counts = new HashMap<String,Integer>();
        int maxFreq = 0;
        for (String term : document)
        {
            Integer previous = counts.get(term);
            int count = (previous == null) ? 1 : previous.intValue() + 1;
            counts.put(term, Integer.valueOf(count));
            // Track the maximum on every occurrence (including the first), so a
            // document whose terms are all unique gets maxFreq == 1 rather than 0.
            if (count > maxFreq)
            {
                maxFreq = count;
            }
        }

        HashMap<String,Double> tf = new HashMap<String,Double>();
        // maxFreq >= 1 whenever counts is non-empty, so the division is always defined.
        for (String term : counts.keySet())
        {
            double frequency = counts.get(term).intValue();
            tf.put(term, Double.valueOf(frequency / maxFreq));
        }
        return tf;
    }
}