需求背景:
http://3g.k.sohu.com/t/n384943815,如这篇搜狐新闻,内容存在明显重复,那为了保障线上文章质量,我们需要对文章段落进行判重。于是我们需要一个判重的逻辑和标准
算法逻辑:
获取文章段落,根据换行符\r、\n、<br/>、</br>进行分割,再循环段落,两两进行比较,计算段落间的重复率(两段落的最长公共子串长度 / 两段落中较长者的字符长度),如果某对段落的重复率大于某个阈值(0.95),则判定此文章存在段落重复(此处可优化为判断整体段落重复率:重复段落数/总段落数)
算法实现:(直接上代码,不解释)
一、获取两字符串重复子串长度和重复子串
算法详解参考 https://www.cnblogs.com/Springtie/p/4068964.html
/**
 * Result of a longest-common-substring computation: the matched substring
 * and its length in characters.
 */
class CommonResult {
    // The longest common substring found between the two inputs.
    private String matchedText;
    // Character count of matchedText.
    private int matchedLength;

    public CommonResult() {
        // Fields start at their Java defaults (null / 0) until set.
    }

    public int getCommonCount() {
        return matchedLength;
    }

    public void setCommonCount(int commonCount) {
        matchedLength = commonCount;
    }

    public String getCommonStr() {
        return matchedText;
    }

    public void setCommonStr(String commonStr) {
        matchedText = commonStr;
    }
}
public CommonResult maxUtilStr(String str1, String str2) {
CommonResult commonResult = new CommonResult();
//把字符串转成字符数组
char[] arr1 = str1.toCharArray();
char[] arr2 = str2.toCharArray();
// 把两个字符串分别以行和列组成一个二维矩阵
int[][] temp = new int[arr1.length][arr2.length];
// 存储最长公共子串长度
int length = 0;
//start表明最长公共子串的起始点,end表明最长公共子串的终止点
int end = 0;
int start = 0;
初始化二维矩阵中的第一行
for (int i = 0; i < arr2.length; i++) {
temp[0][i] = (arr1[0] == arr2[i]) ? 1 : 0;
}
//初始化二维矩阵中的第一列
for (int j = 0; j < arr1.length; j++) {
temp[j][0] = (arr2[0] == arr1[j]) ? 1 : 0;
}
//嵌套for循环:比较二维矩阵中每个点对应行列字符中否相等,相等的话值设置为1,否则设置为0
for (int i = 1; i < arr1.length; i++) {
for (int j = 1; j < arr2.length; j++) {
if (arr1[i] == arr2[j]) {
temp[i][j] = temp[i - 1][j - 1] + 1;
if (temp[i][j] > length) {
length = temp[i][j];
end = j;
}
} else {
temp[i][j] = 0;
}
}
}
//求出最长公共子串的起始点
start=end-length+1;
StringBuilder sb=new StringBuilder();
//通过查找出值为1的最长对角线就能找到最长公共子串
for (int j = start; j < end+1; j++) {
sb.append(arr2[j]);
}
commonResult.setCommonCount(length);
commonResult.setCommonStr(sb.toString());
return commonResult;
}
二、判断段落是否存在重复
/**
 * Checks an article body for internal duplication and returns a filter verdict.
 *
 * Passes, in order:
 * 1. Paragraph pass — split the body on \r, \n, <br/> and </br>; reject when a
 *    non-blank paragraph repeats verbatim, or when two long paragraphs
 *    (>30 chars after stripping punctuation/symbols/ASCII alphanumerics) share
 *    a common substring covering more than 95% of the longer one.
 * 2. Halves pass — for bodies with more than 100 meaningful characters, reject
 *    when the two halves share a common substring covering more than 30% of a
 *    half's length.
 * 3. Reject articles with no body text and no pictures at all.
 *
 * Bug fixes over the original:
 * - the unconditional {@code return new FilterResult(Level.BAD_CONTENT, ...)}
 *   (its guarding if was commented out, the return was not) rejected EVERY
 *   article containing a line break and made all later checks dead code —
 *   restored the intended "identical paragraph repeats" condition;
 * - splitting used only the first separator matched by the pattern, so content
 *   mixing \r, \n and <br/> was split incompletely — now splits on the full
 *   alternation at once;
 * - removed the pointless try/catch around replaceAll with a constant, valid
 *   pattern (it cannot throw).
 *
 * @param news the article to check; reason code 256 is set on rejection
 * @return BAD_CONTENT verdict when duplication (or empty content) is detected,
 *         otherwise FINE
 */
public FilterResult apply(MrdNewsObject news) {
    String co = removeAltAttr(news.getCo(), news.getOid());
    // Keep only meaningful text: drop punctuation (\pP), symbols (\pS) and
    // ASCII letters/digits so ratios are computed on CJK characters.
    String compareCo = co.replaceAll("\\pP|\\pS|[a-zA-Z0-9]", "");
    int co_len = compareCo.length();
    if (co_len > 0) {
        // Split on every supported separator at once (<br/> and </br> both
        // match </?br/?>).
        String[] colea = co.split("\r|\n|</?br/?>");
        if (colea.length > 0) {
            // Exact-duplicate pass: reject when any non-blank paragraph
            // repeats verbatim. Blank fragments produced by consecutive
            // separators are ignored so they cannot count as duplicates.
            Set<String> distinct = new HashSet<String>();
            int nonBlank = 0;
            for (int i = 0; i < colea.length; i++) {
                String p = colea[i].trim();
                if (p.length() > 0) {
                    nonBlank++;
                    distinct.add(p);
                }
            }
            LOG.info("part set size = " + distinct.size() + " colea size = " + colea.length + " oid =" + news.getOid());
            if (distinct.size() < nonBlank) {
                news.setReason(256);
                return new FilterResult(Level.BAD_CONTENT, "News content duplication filter");
            }
        }
        if (colea.length >= 2) {
            // Near-duplicate pass: compare every pair of paragraphs.
            for (int i = 0; i < colea.length; i++) {
                for (int j = i + 1; j < colea.length; j++) {
                    String part1 = colea[i].replaceAll("\\pP|\\pS|[a-zA-Z0-9]", "");
                    String part2 = colea[j].replaceAll("\\pP|\\pS|[a-zA-Z0-9]", "");
                    // Too short to judge duplication reliably.
                    if (part1.length() < 6 || part2.length() < 6) {
                        continue;
                    }
                    CommonResult commonResult = maxUtilStr(part1, part2);
                    int count = commonResult.getCommonCount();
                    if (count <= 0) {
                        continue;
                    }
                    // Overlap ratio relative to the LONGER paragraph, so a
                    // short quote inside a long paragraph does not trigger.
                    double common_ratio = 1.0 * count / Math.max(part1.length(), part2.length());
                    if (common_ratio > 0.95 && part1.length() > 30 && part2.length() > 30) {
                        LOG.info("Common_ratio check and nid: = "
                                + news.getNid() + " oid: = " + news.getOid()
                                + " ds: = " + news.getDs()
                                + " common_ratio: = " + common_ratio + " i = "
                                + i + " index = " + i + " NewPart1 = " + part1 + " NewPart2 = " + part2);
                        news.setReason(256);
                        return new FilterResult(Level.BAD_CONTENT,
                                "News content duplication filter,commonStr:" + commonResult.getCommonStr());
                    }
                }
            }
        }
    }
    // Halves pass: a long shared substring between the two halves of the body
    // indicates the article repeats itself wholesale.
    if (co_len > 100) {
        String part1 = co.substring(0, co.length() / 2);
        String part2 = co.substring(co.length() / 2);
        CommonResult commonResult = maxUtilStr(part1, part2);
        int common_len = commonResult.getCommonCount();
        if (common_len > 0) {
            double ratio = 1.0 * common_len / part1.length();
            LOG.info("Content Duplicaton check, nid=" + news.getNid() + " oid=" + news.getOid()
                    + " from=" + news.getFrom() + " ratio=" + ratio + " commonStr=" + commonResult.getCommonStr());
            if (ratio > 0.3) {
                news.setReason(256);
                return new FilterResult(Level.BAD_CONTENT,
                        "Content contains duplication, ratio=" + ratio + ",commonStr=" + commonResult.getCommonStr());
            }
        }
    }
    // Entirely empty article: no body text and no pictures.
    if (news.getCo().length() == 0 && news.getPic().length == 0) {
        return new FilterResult(Level.BAD_CONTENT, "Content length duplication, length=" + news.getCo().length());
    }
    return new FilterResult(Level.FINE, "");
}