package com.jsptpd.wordpart;
import java.util.Arrays;
import java.util.List;
/**
 * Minimal TF-IDF (term frequency - inverse document frequency) implementation.
 *
 * <p>Documents are represented as pre-tokenized lists of words. All term
 * comparisons are case-insensitive.
 */
public class App
{
    /**
     * Counts how many times a term occurs in a single document (raw term frequency).
     *
     * @param doc  the tokenized document to scan
     * @param item the term to count; {@code null} matches nothing
     * @return the number of tokens in {@code doc} equal to {@code item}, ignoring case
     */
    public double tf(List<String> doc, String item) {
        if (item == null) {
            return 0;
        }
        double termFrequency = 0;
        for (String str : doc) {
            // Compare from the known non-null side so a null token cannot throw.
            if (item.equalsIgnoreCase(str)) {
                termFrequency++;
            }
        }
        return termFrequency;
    }

    /**
     * Counts how many documents contain a term at least once (document frequency).
     *
     * <p>If {@code item} is {@code null} or empty, a diagnostic message is printed to
     * standard output and 0 is returned.
     *
     * @param docs the corpus, one tokenized document per element
     * @param item the term to look for
     * @return the number of documents containing {@code item}, ignoring case
     */
    public int df(List<List<String>> docs, String item) {
        // isEmpty() instead of a reference comparison with "": an empty string
        // created at runtime is not the same object as the "" literal.
        if (item == null || item.isEmpty()) {
            System.out.println("item 不能为null或者空串");
            return 0;
        }
        int n = 0;
        for (List<String> doc : docs) {
            for (String word : doc) {
                if (item.equalsIgnoreCase(word)) {
                    n++;
                    break; // each document is counted at most once
                }
            }
        }
        return n;
    }

    /**
     * Computes the inverse document frequency of a term as
     * {@code ln(N / df + 1)}, where {@code N} is the number of documents and
     * {@code df} is the value returned by {@link #df(List, String)}.
     *
     * @param docs the corpus, one tokenized document per element
     * @param item the term to weigh
     * @return the IDF weight, or 0.0 when no document contains the term
     */
    public double idf(List<List<String>> docs, String item) {
        int documentFrequency = df(docs, item);
        if (documentFrequency == 0) {
            // Dividing by zero would give Infinity (or NaN for an empty corpus),
            // which then turns tfIdf into NaN. A term seen nowhere gets weight 0.
            return 0.0;
        }
        return Math.log(docs.size() / (double) documentFrequency + 1);
    }

    /**
     * Computes the TF-IDF weight of a term for one document within a corpus:
     * {@code tf(doc, item) * idf(docs, item)}.
     *
     * @param doc  the tokenized document the term frequency is taken from
     * @param docs the corpus used for the inverse document frequency
     * @param item the term to weigh
     * @return the TF-IDF weight of {@code item}
     */
    public double tfIdf(List<String> doc, List<List<String>> docs, String item) {
        return tf(doc, item) * idf(docs, item);
    }

    /**
     * Small demonstration: builds a four-document corpus and prints a term
     * frequency, a document frequency and a TF-IDF weight.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        List<String> doc1 = Arrays.asList("人工", "智能", "成为", "互联网", "大会", "焦点");
        List<String> doc2 = Arrays.asList("谷歌", "推出", "开源", "人工", "智能", "系统", "工具");
        List<String> doc3 = Arrays.asList("互联网", "的", "未来", "在", "人工", "智能");
        List<String> doc4 = Arrays.asList("谷歌", "开源", "机器", "学习", "工具");
        List<List<String>> documents = Arrays.asList(doc1, doc2, doc3, doc4);
        App app1 = new App();
        System.out.println(app1.tf(doc2, "谷歌"));
        System.out.println(app1.df(documents, "谷歌"));
        System.out.println(app1.tfIdf(doc4, documents, "学习"));
    }
}
// Source article: "TF-IDF Algorithm: Principles and Implementation"
// (page note: latest recommended article published 2025-05-18 20:33:51)