simhash算法
适用场景
检测文本相似度。
优势
对整段文字直接计算哈希值再进行比较的方式,在文中只有少量改动时,也会得到完全不同的哈希值,因此无法将两篇文章判定为相似。原因在于这种哈希结果没有体现出文本的局部信息。
simhash基于关键词信息,针对这种情况可以算出接近的哈希值,判定出两篇文章为相似内容。
实现
提取关键词-词频信息
- 去除转义
html encode去除
StringEscapeUtils.unescapeHtml4(text)
- 去除符号
text.replaceAll("[^\\u4E00-\\u9FFFa-zA-Z0-9 \\n??。;;!!\\t\\xa0\\.\\p{P}]", "")
- 提取关键词
HanLP.segment(text)
保存最终结果为Map<String, Integer>类型
计算simhash
/**
 * Computes 64-bit SimHash fingerprints from text or from a keyword-to-count map.
 *
 * <p>Each keyword is hashed with {@link #keywordHash(String)}; every one of the
 * {@code HASH_BITS} low bits of that hash votes {@code +weight} (bit set) or
 * {@code -weight} (bit clear) into a per-bit tally. A fingerprint bit is 1 when
 * its tally is {@code >= 0}.
 */
@Slf4j
public class SimHashUtil {

    /** Number of bits in a fingerprint. */
    private static final int HASH_BITS = 64;

    /** Multiplier applied at every step of {@link #keywordHash(String)}. */
    private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(1000003L);

    /** 2^128 - 1; keeps the running keyword hash within 128 bits. */
    private static final BigInteger HASH_MASK = BigInteger.ONE.shiftLeft(128).subtract(BigInteger.ONE);

    /** Matches words made up solely of characters in the range U+4E00..U+9FA5. */
    private static final java.util.regex.Pattern CHINESE_ONLY =
            java.util.regex.Pattern.compile("[\u4E00-\u9FA5]+");

    /**
     * Renders a fingerprint as a binary string, left-padded with '0' to {@code HASH_BITS} characters.
     *
     * @param simhash fingerprint to render
     * @return base-2 representation, padded to at least 64 characters
     */
    public static String simhash2Str(BigInteger simhash) {
        return StringUtils.leftPad(simhash.toString(2), HASH_BITS, "0");
    }

    /**
     * Computes the fingerprint of {@code text} and returns it as a padded binary string.
     *
     * @param text raw input text
     * @return binary-string form of {@link #simhash(String)}
     */
    public static String simhashStr(String text) {
        return simhash2Str(simhash(text));
    }

    /**
     * Computes the fingerprint of raw text: lower-cases it, unescapes HTML entities,
     * extracts keyword counts and hashes them.
     *
     * @param text raw input text; must not be null
     * @return the 64-bit fingerprint as a non-negative {@link BigInteger}
     */
    public static BigInteger simhash(String text) {
        // Locale.ROOT keeps lower-casing (and therefore the fingerprint) independent of the
        // JVM's default locale, e.g. the Turkish dotless-i mapping.
        String lowered = text.toLowerCase(java.util.Locale.ROOT);
        String unescaped = StringEscapeUtils.unescapeHtml4(lowered);
        return simhash(text2KeywordCountMap(unescaped));
    }

    /**
     * Computes the fingerprint from a keyword-to-count map.
     *
     * <p>The count is used as the keyword's weight; keywords for which
     * {@link #containsChinese(String)} is true get their weight multiplied by 10.
     * A tally of exactly zero sets the bit, so an empty map yields all 64 bits set.
     *
     * @param kcMap keyword to occurrence count
     * @return the 64-bit fingerprint as a non-negative {@link BigInteger}
     */
    public static BigInteger simhash(Map<String, Integer> kcMap) {
        int[] tally = new int[HASH_BITS];
        for (Map.Entry<String, Integer> entry : kcMap.entrySet()) {
            String keyword = entry.getKey();
            BigInteger keywordHash = keywordHash(keyword);
            boolean boosted = containsChinese(keyword);
            int weight = boosted ? entry.getValue() * 10 : entry.getValue();
            for (int bit = 0; bit < HASH_BITS; bit++) {
                if (keywordHash.testBit(bit)) {
                    tally[bit] += weight;
                } else {
                    tally[bit] -= weight;
                }
            }
        }

        BigInteger fingerprint = BigInteger.ZERO;
        for (int bit = 0; bit < HASH_BITS; bit++) {
            if (tally[bit] >= 0) {
                fingerprint = fingerprint.setBit(bit);
            }
        }
        return fingerprint;
    }

    /**
     * Hashes a single keyword with a multiply-xor scheme truncated to 128 bits.
     *
     * <p>Starts from the first character shifted left by 7, then for every character
     * multiplies by 1000003, xors in the character and masks to 128 bits; finally xors
     * in the string length. The result is always non-negative because every
     * intermediate value is masked with a positive mask.
     *
     * @param source keyword to hash; may be null or empty
     * @return the hash, or zero for a null or empty keyword
     */
    public static BigInteger keywordHash(String source) {
        if (source == null || source.isEmpty()) {
            return BigInteger.ZERO;
        }
        BigInteger hash = BigInteger.valueOf(((long) source.charAt(0)) << 7);
        for (char c : source.toCharArray()) {
            hash = hash.multiply(HASH_MULTIPLIER).xor(BigInteger.valueOf((long) c)).and(HASH_MASK);
        }
        return hash.xor(BigInteger.valueOf(source.length()));
    }

    /**
     * Cleans the input with jsoup and extracts keyword counts from the result.
     *
     * <p>If cleaning throws, the error is logged and an empty map is returned, so a
     * malformed document degrades to an "empty" fingerprint instead of failing.
     *
     * @param input text, possibly containing HTML markup
     * @return keyword to count as produced by {@code HanlpUtil.extractTags}
     *         (NOTE(review): helper is not visible here; confirm it never returns null),
     *         or an empty map when cleaning fails
     */
    public static Map<String, Integer> text2KeywordCountMap(String input) {
        String cleaned;
        try {
            cleaned = Jsoup.clean(input, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
        } catch (Exception e) {
            log.error("simhash pre-processing failed", e);
            return new HashMap<>();
        }
        return HanlpUtil.extractTags(cleaned);
    }

    /**
     * Tells whether a keyword consists entirely of characters in U+4E00..U+9FA5.
     *
     * <p>Despite the name, a word that mixes Chinese with other characters returns
     * false: the whole word must match. This is kept as-is because changing it would
     * alter the weights and therefore previously computed fingerprints.
     *
     * @param word keyword to test; rejected by {@code Assert.hasLength} when null or empty
     * @return true when every character of {@code word} is in the range above
     */
    public static boolean containsChinese(String word) {
        Assert.hasLength(word, "keyword cannot be empty");
        return CHINESE_ONLY.matcher(word).matches();
    }
}
计算海明距离
/**
 * Counts the positions at which two fingerprint strings differ.
 *
 * <p>Only the first 64 characters of each string are compared; anything beyond
 * that is ignored. Either string being shorter than 64 characters makes
 * {@code charAt} throw {@link StringIndexOutOfBoundsException}.
 *
 * @param hash1 first fingerprint string, at least 64 characters long
 * @param hash2 second fingerprint string, at least 64 characters long
 * @return number of differing positions among the first 64, in the range 0..64
 */
public int getSimhashHammingDistance(String hash1, String hash2) {
    int mismatches = 0;
    int pos = 0;
    while (pos < 64) {
        boolean same = hash1.charAt(pos) == hash2.charAt(pos);
        if (!same) {
            mismatches += 1;
        }
        pos++;
    }
    return mismatches;
}
参考
- http://blog.sina.com.cn/s/blog_4f27dbd501013ysm.html