需求背景:
http://3g.k.sohu.com/t/n384943815,如这篇搜狐新闻,内容存在明显重复,那为了保障线上文章质量,我们需要对文章段落进行判重。于是我们需要一个判重的逻辑和标准
算法逻辑:
获取文章段落,根据换行符\r、\n、<br/>、</br>进行分割,再循环段落,两两进行比较,计算段落间的重复率(两段落的最长公共子串长度 / 两段落中较长者的字符长度),如果某对段落的重复率大于某个阈值(0.95),则判定此文章存在段落重复(此处可优化为判断整体段落重复率:重复段落数/总段落数)
算法实现:(直接上代码,不解释)
一、获取两字符串重复子串长度和重复子串
算法详解参考 https://www.cnblogs.com/Springtie/p/4068964.html
/**
 * Result of a longest-common-substring computation: the matched substring
 * and its length in characters.
 */
class CommonResult {
    // The longest common substring found between the two inputs.
    private String matchedText;
    // Character count of matchedText.
    private int matchedLength;

    public CommonResult() {
        // Fields start at their Java defaults (null / 0) until set.
    }

    public int getCommonCount() {
        return matchedLength;
    }

    public void setCommonCount(int commonCount) {
        matchedLength = commonCount;
    }

    public String getCommonStr() {
        return matchedText;
    }

    public void setCommonStr(String commonStr) {
        matchedText = commonStr;
    }
}
public CommonResult maxUtilStr(String str1, String str2) {
CommonResult commonResult = new CommonResult();
//把字符串转成字符数组
char[] arr1 = str1.toCharArray();
char[] arr2 = str2.toCharArray();
// 把两个字符串分别以行和列组成一个二维矩阵
int[][] temp = new int[arr1.length][arr2.length];
// 存储最长公共子串长度
int length = 0;
//start表明最长公共子串的起始点,end表明最长公共子串的终止点
int end = 0;
int start = 0;
初始化二维矩阵中的第一行
for (int i = 0; i < arr2.length; i++) {
temp[0][i] = (arr1[0] == arr2[i]) ? 1 : 0;
}
//初始化二维矩阵中的第一列
for (int j = 0; j < arr1.length; j++) {
temp[j][0] = (arr2[0] == arr1[j]) ? 1 : 0;
}
//嵌套for循环:比较二维矩阵中每个点对应行列字符中否相等,相等的话值设置为1,否则设置为0
for (int i = 1; i < arr1.length; i++) {
for (int j = 1; j < arr2.length; j++) {
if (arr1[i] == arr2[j]) {
temp[i][j] = temp[i - 1][j - 1] + 1;
if (temp[i][j] > length) {
length = temp[i][j];
end = j;
}
} else {
temp[i][j] = 0;
}
}
}
//求出最长公共子串的起始点
start=end-length+1;
StringBuilder sb=new StringBuilder();
//通过查找出值为1的最长对角线就能找到最长公共子串
for (int j = start; j < end+1; j++) {
sb.append(arr2[j]);
}
commonResult.setCommonCount(length);
commonResult.setCommonStr(sb.toString());
return commonResult;
}
二、判断段落是否存在重复
/**
 * Checks an article body for internal duplication and returns a filter verdict.
 *
 * Passes, in order:
 * 1. Paragraph pass — split the body on \r, \n, <br/> and </br>; reject when a
 *    non-blank paragraph repeats verbatim, or when two long paragraphs
 *    (>30 chars after stripping punctuation/symbols/ASCII alphanumerics) share
 *    a common substring covering more than 95% of the longer one.
 * 2. Halves pass — for bodies with more than 100 meaningful characters, reject
 *    when the two halves share a common substring covering more than 30% of a
 *    half's length.
 * 3. Reject articles with no body text and no pictures at all.
 *
 * Bug fixes over the original:
 * - the unconditional {@code return new FilterResult(Level.BAD_CONTENT, ...)}
 *   (its guarding if was commented out, the return was not) rejected EVERY
 *   article containing a line break and made all later checks dead code —
 *   restored the intended "identical paragraph repeats" condition;
 * - splitting used only the first separator matched by the pattern, so content
 *   mixing \r, \n and <br/> was split incompletely — now splits on the full
 *   alternation at once;
 * - removed the pointless try/catch around replaceAll with a constant, valid
 *   pattern (it cannot throw).
 *
 * @param news the article to check; reason code 256 is set on rejection
 * @return BAD_CONTENT verdict when duplication (or empty content) is detected,
 *         otherwise FINE
 */
public FilterResult apply(MrdNewsObject news) {
    String co = removeAltAttr(news.getCo(), news.getOid());
    // Keep only meaningful text: drop punctuation (\pP), symbols (\pS) and
    // ASCII letters/digits so ratios are computed on CJK characters.
    String compareCo = co.replaceAll("\\pP|\\pS|[a-zA-Z0-9]", "");
    int co_len = compareCo.length();
    if (co_len > 0) {
        // Split on every supported separator at once (<br/> and </br> both
        // match </?br/?>).
        String[] colea = co.split("\r|\n|</?br/?>");
        if (colea.length > 0) {
            // Exact-duplicate pass: reject when any non-blank paragraph
            // repeats verbatim. Blank fragments produced by consecutive
            // separators are ignored so they cannot count as duplicates.
            Set<String> distinct = new HashSet<String>();
            int nonBlank = 0;
            for (int i = 0; i < colea.length; i++) {
                String p = colea[i].trim();
                if (p.length() > 0) {
                    nonBlank++;
                    distinct.add(p);
                }
            }
            LOG.info("part set size = " + distinct.size() + " colea size = " + colea.length + " oid =" + news.getOid());
            if (distinct.size() < nonBlank) {
                news.setReason(256);
                return new FilterResult(Level.BAD_CONTENT, "News content duplication filter");
            }
        }
        if (colea.length >= 2) {
            // Near-duplicate pass: compare every pair of paragraphs.
            for (int i = 0; i < colea.length; i++) {
                for (int j = i + 1; j < colea.length; j++) {
                    String part1 = colea[i].replaceAll("\\pP|\\pS|[a-zA-Z0-9]", "");
                    String part2 = colea[j].replaceAll("\\pP|\\pS|[a-zA-Z0-9]", "");
                    // Too short to judge duplication reliably.
                    if (part1.length() < 6 || part2.length() < 6) {
                        continue;
                    }
                    CommonResult commonResult = maxUtilStr(part1, part2);
                    int count = commonResult.getCommonCount();
                    if (count <= 0) {
                        continue;
                    }
                    // Overlap ratio relative to the LONGER paragraph, so a
                    // short quote inside a long paragraph does not trigger.
                    double common_ratio = 1.0 * count / Math.max(part1.length(), part2.length());
                    if (common_ratio > 0.95 && part1.length() > 30 && part2.length() > 30) {
                        LOG.info("Common_ratio check and nid: = "
                                + news.getNid() + " oid: = " + news.getOid()
                                + " ds: = " + news.getDs()
                                + " common_ratio: = " + common_ratio + " i = "
                                + i + " index = " + i + " NewPart1 = " + part1 + " NewPart2 = " + part2);
                        news.setReason(256);
                        return new FilterResult(Level.BAD_CONTENT,
                                "News content duplication filter,commonStr:" + commonResult.getCommonStr());
                    }
                }
            }
        }
    }
    // Halves pass: a long shared substring between the two halves of the body
    // indicates the article repeats itself wholesale.
    if (co_len > 100) {
        String part1 = co.substring(0, co.length() / 2);
        String part2 = co.substring(co.length() / 2);
        CommonResult commonResult = maxUtilStr(part1, part2);
        int common_len = commonResult.getCommonCount();
        if (common_len > 0) {
            double ratio = 1.0 * common_len / part1.length();
            LOG.info("Content Duplicaton check, nid=" + news.getNid() + " oid=" + news.getOid()
                    + " from=" + news.getFrom() + " ratio=" + ratio + " commonStr=" + commonResult.getCommonStr());
            if (ratio > 0.3) {
                news.setReason(256);
                return new FilterResult(Level.BAD_CONTENT,
                        "Content contains duplication, ratio=" + ratio + ",commonStr=" + commonResult.getCommonStr());
            }
        }
    }
    // Entirely empty article: no body text and no pictures.
    if (news.getCo().length() == 0 && news.getPic().length == 0) {
        return new FilterResult(Level.BAD_CONTENT, "Content length duplication, length=" + news.getCo().length());
    }
    return new FilterResult(Level.FINE, "");
}