帮同学做的一个小project

最新推荐文章于 2025-04-23 22:37:07 发布

最新推荐文章于 2025-04-23 22:37:07 发布 · 109 阅读

文章标签：

#J#

java 专栏收录该内容

8 篇文章

订阅专栏

本文介绍了一种基于文本相似度计算的方法，并提供了具体的Java实现。该方法通过去除文本中的特殊字符并比较不同行间的相似度，实现了对文本中重复或相近内容的有效识别。文章还探讨了如何设置阈值来过滤相似度较低的比较结果。

前几天帮同学作了个很小的project,
[quote]Example1. Assume the content of the file is:
abc,def
abc def! ghi jkl?
xyz abc ppp
xyz xyz def
It contains four objects. The first object consists of two tokens, abc and def. The last object consists of three tokens: xyz, xyz, def.
The similarity between the first object and the second object is 2. If the similarity threshold is 2, the
output of the join should be
0 1
Note that the following output is wrong.
1 0
If the threshold is 1, the output should be
0 1
0 2
1 2
0 3
1 3
2 3[/quote]
大体意思就是给定一个文件，里面有上面所说的那种类型的输入，然后你要把.,?!\t"; 这些字符去掉，然后比较每一行的相似度，程序会有一个参数，这个参数就是最小相似度，小于这个相似度的值都不会被打印。

[code]import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.io.*;

/**
 * Similarity self-join over the lines of a text file.
 *
 * <p>Each line is one object. A line is split into tokens on runs of the
 * characters {@code . , ? ! space tab}. For every pair of lines
 * {@code (j, i)} with {@code j < i}, the similarity is the number of tokens
 * of line {@code j} that also occur in line {@code i}; pairs whose similarity
 * reaches the threshold are printed to standard output as {@code "j i"}.
 *
 * <p>Two strategies are offered: {@link #nSSJoin} compares token arrays
 * directly, {@link #SSJoin} looks tokens up in an inverted index.
 */
public class Data {

    /** A run of one or more delimiter characters: '.', ',', '?', space, tab, '!'. */
    private static final String TOKEN_DELIMITERS = "[\\.,? \\t!]++";

    /**
     * Inverted index filled by {@link #index}: token -> ids of the lines that
     * contain it, in ascending order and without repeats.
     */
    private final Map<String, List<Integer>> m = new HashMap<String, List<Integer>>();

    /**
     * Counts the elements of {@code s1} that are equal to at least one element
     * of {@code s2}.
     *
     * <p>A token repeated in {@code s1} is counted once per occurrence, so the
     * result is not symmetric in its arguments.
     *
     * @param s1 tokens of the first object
     * @param s2 tokens of the second object
     * @return number of positions in {@code s1} whose token occurs in {@code s2}
     */
    public int sim(String[] s1, String[] s2) {
        int matches = 0;
        for (int i = 0; i < s1.length; i++) {
            for (int j = 0; j < s2.length; j++) {
                if (s1[i].equals(s2[j])) {
                    matches++;
                    break; // count each position of s1 at most once
                }
            }
        }
        return matches;
    }

    /**
     * Nested-loop join: prints {@code "j i"} for every pair of line ids
     * {@code j < i} whose similarity is at least {@code threshold}.
     *
     * @param l         the lines to join with themselves
     * @param threshold minimum similarity for a pair to be printed
     */
    public void nSSJoin(List<String> l, int threshold) {
        // Tokenise each line once instead of once per compared pair.
        List<String[]> tokens = tokenizeAll(l);
        for (int j = 0; j < tokens.size(); j++) {
            for (int i = j + 1; i < tokens.size(); i++) {
                if (sim(tokens.get(j), tokens.get(i)) >= threshold) {
                    System.out.println(j + " " + i);
                }
            }
        }
    }

    /**
     * Index-based join: rebuilds the inverted index for {@code l}, then prints
     * {@code "j i"} for every pair of line ids {@code j < i} whose similarity
     * is at least {@code threshold}.
     *
     * @param l         the lines to join with themselves
     * @param threshold minimum similarity for a pair to be printed
     */
    public void SSJoin(List<String> l, int threshold) {
        index(l);
        List<String[]> tokens = tokenizeAll(l);
        for (int j = 0; j < tokens.size(); j++) {
            for (int i = j + 1; i < tokens.size(); i++) {
                if (indexedSim(tokens.get(j), i) >= threshold) {
                    System.out.println(j + " " + i);
                }
            }
        }
    }

    /**
     * Counts the tokens in {@code tokens} that the inverted index lists as
     * occurring in line {@code otherLine}.
     */
    private int indexedSim(String[] tokens, int otherLine) {
        int matches = 0;
        for (String token : tokens) {
            List<Integer> lineIds = m.get(token);
            // contains(Object): the int is boxed, so this is a value lookup.
            if (lineIds != null && lineIds.contains(otherLine)) {
                matches++;
            }
        }
        return matches;
    }

    /**
     * Rebuilds the inverted index from scratch for the given lines.
     *
     * <p>Entries from any previous call are discarded first; otherwise line
     * ids of an earlier input would be reported as matches for a later one.
     *
     * @param l the lines to index; a line's position in the list is its id
     */
    public void index(List<String> l) {
        m.clear();
        for (int line = 0; line < l.size(); line++) {
            for (String token : tokenize(l.get(line))) {
                List<Integer> lineIds = m.get(token);
                if (lineIds == null) {
                    lineIds = new ArrayList<Integer>();
                    m.put(token, lineIds);
                }
                // Lines are visited in ascending order, so a repeat of this
                // token within the same line can only be the last entry.
                int last = lineIds.size() - 1;
                if (last < 0 || lineIds.get(last) != line) {
                    lineIds.add(line);
                }
            }
        }
    }

    /** Tokenises every line of {@code lines}, preserving order. */
    private static List<String[]> tokenizeAll(List<String> lines) {
        List<String[]> result = new ArrayList<String[]>(lines.size());
        for (String line : lines) {
            result.add(tokenize(line));
        }
        return result;
    }

    /**
     * Splits a line into its non-empty tokens.
     *
     * <p>{@code String.split} returns an empty first element when the line is
     * empty or starts with a delimiter. That element is dropped so that blank
     * lines do not count as sharing a token.
     */
    private static String[] tokenize(String line) {
        String[] raw = line.split(TOKEN_DELIMITERS);
        if (raw.length > 0 && raw[0].isEmpty()) {
            String[] trimmed = new String[raw.length - 1];
            System.arraycopy(raw, 1, trimmed, 0, trimmed.length);
            return trimmed;
        }
        return raw;
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} join mode ({@code nSSJoin} or {@code SSJoin}),
     *             {@code args[1]} input file path,
     *             {@code args[2]} integer similarity threshold
     * @throws IOException if the input file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 3) {
            System.err.println("Usage: java Data <nSSJoin|SSJoin> <file> <threshold>");
            return;
        }
        String mode = args[0];
        String file = args[1];
        int threshold;
        try {
            threshold = Integer.parseInt(args[2]);
        } catch (NumberFormatException e) {
            System.err.println("Threshold must be an integer: " + args[2]);
            return;
        }

        List<String> lines = new ArrayList<String>();
        BufferedReader br = new BufferedReader(new FileReader(file));
        try {
            String inputLine;
            while ((inputLine = br.readLine()) != null) {
                lines.add(inputLine);
            }
        } finally {
            br.close(); // release the file handle even if reading fails
        }

        Data d = new Data();
        if (mode.equals("nSSJoin")) {
            d.nSSJoin(lines, threshold);
        } else if (mode.equals("SSJoin")) {
            d.SSJoin(lines, threshold);
        } else {
            System.err.println("Mode is error");
        }
    }

}
[/code]