java 字符串相似度算法

package com.miaodi.api.util;

public class SimilarityRatioUtils {

	/**
	 * 莱茵斯坦距离
	 * 
	 * @param str
	 * @param target
	 * @return
	 */
	public static int levenshiteinDistance(String str, String target) {
		int s = str.length();
		int t = target.length();
		if (s == 0) {
			return t;
		}
		if (t == 0) {
			return s;
		}
		int d[][];
		int si;
		int tj;
		char strChar;
		char tagetChar;
		int temp; // 记录相同字符,在某个矩阵位置值的增量,不是0就是1
		d = new int[s + 1][t + 1];
		for (si = 0; si <= s; si++) {// 初始化第一列
			d[si][0] = si;
		}
		for (si = 1; si <= s; si++) {// 遍历str
			strChar = str.charAt(si - 1);
			for (tj = 1; tj <= t; tj++) {
				tagetChar = target.charAt(tj - 1);
				/*
				 * 忽略大小写 if (strChar == tagetChar || strChar == tagetChar + 32 || strChar + 32
				 * == tagetChar) {
				 */
				if (strChar == tagetChar) {
					temp = 0;
				} else {
					temp = 1;
				}
				d[si][tj] = min(d[si - 1][tj] + 1, d[si][tj - 1] + 1, d[si - 1][tj - 1] + temp);
			}
		}
		return d[s][t];
	}

	private static int min(int one, int two, int three) {
		return (one = one < two ? one : two) < three ? one : three;
	}

	/**
	 * 获取两字符串的相似度
	 */
	public static float getSimilarityRatio(String str, String target) {
		// 1-莱茵斯坦距离/最大字符串长度
		return 1 - (float) levenshiteinDistance(str, target) / Math.max(str.length(), target.length());
	}
}
package com.miaodi.api.handler;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;

/**
 * 模板提取
 * @author Administrator
 *
 */
public class TemplateDistill implements Callable<String>{

	public TemplateDistill(String source, String target, List<Future<String>> futures, ExecutorService pool, int startSourceIDX,int startTargetIDX, boolean lastIsPlaceholder,String templatePrefix,int variableNum) {
		super();
		this.source = source;
		this.target = target;
		this.futures = futures;
		this.pool = pool;
		this.startSourceIDX = startSourceIDX;
		this.startTargetIDX = startTargetIDX;
		this.lastIsPlaceholder = lastIsPlaceholder;
		this.templatePrefix = templatePrefix == null ? "":templatePrefix;
		this.variableNum = variableNum;
	}
	
	private String templatePrefix;
	
	private final int maxVariableNum = 3;
	
	private int variableNum;
	
	private int startSourceIDX;
	
	private int startTargetIDX;
	
	private boolean lastIsPlaceholder;

	private final String source;
	
	private final String target;

	private final List<Future<String>> futures;

	private final ExecutorService pool;
	
	@SuppressWarnings("unchecked")
	public String call() throws Exception {
		StringBuilder template = new StringBuilder(templatePrefix);
		while (true) {
			// 获取最大连击数
			Map<String, Object> result = carom(source, target, startSourceIDX, startTargetIDX);
			int maxCaromNum = (Integer) result.get("caromNum");
			if (maxCaromNum == 0) {
				if(!lastIsPlaceholder) {
					variableNum += 1;
					if (variableNum > maxVariableNum) {
						return null;
					}
					template.append("|");
					lastIsPlaceholder = true;
				}
				if (result.get("otherTargetIndexs") != null) {
					// 开启多线程找模板
					List<Integer> otherTargetIndexs = (List<Integer>) result.get("otherTargetIndexs");
					for (Integer otherTargetIndex : otherTargetIndexs) {
						futures.add(pool.submit(new TemplateDistill(source, target, futures, pool, startSourceIDX, otherTargetIndex, lastIsPlaceholder,template.toString(), variableNum)));
					}
				}
				startSourceIDX += 1;
			}else{
				template.append(source.substring(startSourceIDX, startSourceIDX + maxCaromNum));
				startSourceIDX += maxCaromNum;
				startTargetIDX += maxCaromNum;
				lastIsPlaceholder = false;;
			}
			if (source.length() <= startSourceIDX) {
				if (!lastIsPlaceholder && target.length() > startTargetIDX) {
					variableNum += 1;
					if (variableNum > maxVariableNum) {
						return null;
					}
					template.append("|");
				}
				break;
			}
			if (target.length() <= startTargetIDX) {
				if(!lastIsPlaceholder) {
					variableNum += 1;
					if (variableNum > maxVariableNum) {
						return null;
					}
					template.append("|");
				}
				break;
			}
		}
		return template.toString();
	}

	
	/**
	 * 获取连击数
	 * @param source 字符串s
	 * @param target 字符串t
	 * @param startSourceIDX 字符串s从什么下标开始匹配
	 * @param startTargetIDX 字符串t从什么下标开始匹配
	 * @return
	 */
	@SuppressWarnings({ "rawtypes", "unchecked" })
	private static Map<String,Object> carom(String source,String target,Integer startSourceIDX,Integer startTargetIDX){
		// 连击次数
		int attemptCaromNum = 0;
		while(true) {
			// 每次累加1,最后减1
			attemptCaromNum += 1;
			// 本次匹配的子串的结束下标
			int endSourceIDX = startSourceIDX + attemptCaromNum;
			if (endSourceIDX > source.length()) {
				break;
			}
			// 本次匹配的子串
			String attemptStr = source.substring(startSourceIDX, endSourceIDX);
			int idx = target.indexOf(attemptStr, startTargetIDX);
			// 匹配失败
			if (idx != startTargetIDX) {
				break;
			}
		}
		attemptCaromNum -= 1;
		// 结果
		Map<String, Object> result = new HashMap<String, Object>();
		result.put("caromNum", attemptCaromNum);
		// 匹配失败时
		if (attemptCaromNum == 0) {
			// 单纯的一个字符
			int endStrIDX = startSourceIDX + 1;
			if (endStrIDX <= source.length()) {
				// 匹配他失败后的字符
				List<Integer> otherTargetIndexs = new ArrayList();
				char c = source.charAt(startSourceIDX);
				char[] cs = target.substring(startTargetIDX+1).toCharArray();
				for (int i = 0; i < cs.length; i++) {
					if (c == cs[i]) {
						otherTargetIndexs.add(startTargetIDX+1+i);
					}
				}
				result.put("otherTargetIndexs", otherTargetIndexs.isEmpty() ? null : otherTargetIndexs);
			}
		}
		return result;
	}
	
}
package com.miaodi.api.handler;


import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import com.miaodi.api.util.SimilarityRatioUtils;

public class TemplateDistillUtil {
	
	
	private static final ExecutorService executorPool = new ThreadPoolExecutor(10, 30, 30, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(50),new ThreadPoolExecutor.CallerRunsPolicy());
	
	
	public static String distillTemplates(List<String> targets){
		if (targets.isEmpty()) {
			return null;
		}else if(targets.size() == 1) {
			return targets.get(0);
		}
		String source = targets.get(0);
		String template = null;
		for (int i = 1; i < targets.size(); i++) {
			if (i==1) {
				template = distillTemplates(source, targets.get(i));
			}else {
				if (template == null) {
					return null;
				}
				template = distillTemplates(template, targets.get(i));
			}
		}
		return format(template == null ? "|": template);
	}
	
	public static String distillTemplates(String source, String target) {
		
		// 符合这两字符串的全部模板
		List<Future<String>> futures = new Vector<Future<String>>();
		
		futures.add(executorPool.submit(new TemplateDistill(source,target,futures,executorPool,0,0,false,null,0)));
		
		Set<String> templates = new HashSet<String>();
		
		while (true) {
			int y = 0;
			for (int i = 0; i < futures.size(); i++) {
				try {
					y += 1;
					String template = futures.get(i).get();
					if (template != null) {
						templates.add(futures.get(i).get());
					}
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
			if (y == futures.size()) {
				break;
			}
		}
		float maxRatio = 0;
		String maxRatioTemplate = null;
		for (String template : templates) {
			float ratio = SimilarityRatioUtils.getSimilarityRatio(source, template);
			if (ratio > maxRatio) {
				maxRatio = ratio;
				maxRatioTemplate = template;
			}
		}
		return maxRatioTemplate;
	}
	
	public static String format(String template) {
		if (template == null) {
			return null;
		}
		StringBuilder sb = new StringBuilder();
		String[] vs = template.split("\\|");
		for (int i = 1; i <= vs.length; i++) {
			sb.append(vs[i-1]);
			if (i != vs.length) {
				sb.append("{"+i+"}");
			}
		}
		if (template.lastIndexOf("|") == template.length()-1) {
			sb.append("{"+((vs.length-1)<0?0:(vs.length-1))+"}");
		}
		return sb.toString();
	}
	
//	public static void main(String[] args) {
//		
//		
		System.out.println(format("123|123|"));
//		
//		long start = System.currentTimeMillis();
//		
//		String[] strs = new String[] {
				"廖兴攀-有线,在2017-10-01 11:53:22",
				"鲍冬梅-有线,在2017-10-01 11:53:11"
				"【兴业银行】尊敬的用户,于10月8日前点 https://rsp.mobi/N3eOs4 申请信用卡,可获批10万额度。退订回T",
				"【兴业银行】尊敬的用户,于10月8日前点 https://rsp.mobi/A5jk6Z 申请信用卡,可获批10万额度。退订回T"
				"【随你花】尊敬的王建辉,您的账单现在到期了,逾期将取消再借资格,请及时处理!需要链接找客服。回T退订",
				"【随你花】尊敬的项俊,您的账单现在到期了,逾期将取消再借资格,请及时处理!需要链接找客服。回T退订",
				"廖兴攀-有线,在2017-10-01 11:53:37触发了上线提醒,请留意。",
				"鲍冬梅-000有线,在2017-10-01 11:53:45触发了上线提醒,请留意。",
//				"123xxx12",
//				"123xxz12",
//				"123xxz221211",
				,"【银汉科技】尊贵的易美达分期客户:郭文娟,你总12期的第2期1866.67元,2017-12-31为本期还款日,到目前未还款,现已经逾期,请珍视个人信用,维护良好的账户记录和信誉。如有疑问,请致电4000-512-012,退订回T。"
//		};
		System.out.println(SimilarityRatioUtils.getSimilarityRatio(strs[0], strs[1]));
//		String templates = distillTemplates(strs);
		float maxRatio = 0;
		String maxRatioTemplate = null;
		for (String template : templates) {
			float ratio = SimilarityRatioUtils.getSimilarityRatio(strs[0], template);
			if (ratio > maxRatio) {
				maxRatio = ratio;
				maxRatioTemplate = template;
			}
		}
//		System.out.println("模板:"+templates);
		
		Matcher matcher = Pattern.compile("\\{\\d+\\}").matcher(maxRatioTemplate);
		String templete = matcher.replaceAll(".*");
		String regex = String.format("^%s$", templete); // 构建新的pattern字符串
		for (String string : strs) {
			System.out.println(Pattern.compile(regex).matcher(string).find());
		}
//		System.out.println("共计耗时:"+ (System.currentTimeMillis()-start) +"毫秒");
//	}

}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值