First, add the HanLP segmentation JAR as a dependency in the pom file:
<dependency>
    <groupId>com.hankcs</groupId>
    <artifactId>hanlp</artifactId>
    <version>portable-1.6.1</version>
</dependency>
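To confirm the dependency resolves, a quick smoke test like the one below should print a list of terms tagged with their parts of speech. This is a minimal sketch (the class name is made up for illustration), assuming the portable build's bundled dictionary data:

import com.hankcs.hanlp.HanLP;

public class HanlpSmokeTest {
    public static void main(String[] args) {
        // segments "你好，世界" ("Hello, world") with the default tokenizer
        System.out.println(HanLP.segment("你好，世界"));
    }
}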
1. Create a bean that encapsulates the segmentation data
package com.qlys.frame.model.impl;

import java.io.Serializable;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Basic information about a segmented word.
 */
public class SegmentWord implements Serializable {

    private static final long serialVersionUID = 5662341029767237202L;

    // word text
    private String name;
    // part of speech
    private String pos;
    // word frequency; AtomicInteger allows thread-safe incrementing
    private AtomicInteger frequency = new AtomicInteger();

    public SegmentWord(String name, String pos) {
        this.name = name;
        this.pos = pos;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getPos() {
        return pos;
    }

    public void setPos(String pos) {
        this.pos = pos;
    }

    public AtomicInteger getFrequency() {
        return frequency;
    }

    public void setFrequency(AtomicInteger frequency) {
        this.frequency = frequency;
    }
}
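As a quick illustration (hypothetical usage, not part of the project code), the AtomicInteger counter lets callers bump the frequency in place rather than replacing the field:

SegmentWord word = new SegmentWord("香蕉", "n");
word.getFrequency().incrementAndGet(); // count the first occurrence
word.getFrequency().incrementAndGet(); // and the second
System.out.println(word.getName() + " appeared " + word.getFrequency().get() + " times");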
2. Segment the text and build the word-frequency information
package com.qlys.frame.util;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.NLPTokenizer;
import com.qlys.frame.model.impl.SegmentWord;

/**
 * Word-segmentation utility class.
 */
public class TokenizerUtil {

    private static final Logger log = LoggerFactory.getLogger(TokenizerUtil.class);

    /**
     * Segmentation strategy; NLPTokenizer can be swapped for another HanLP tokenizer.
     *
     * @param content text to segment
     * @return list of segmented terms
     */
    public List<Term> segmentCategory(String content) {
        return NLPTokenizer.segment(content);
    }

    /**
     * Segment the text and count word frequencies.
     *
     * @param content text to segment
     * @return map from each word to its SegmentWord entry (word, part of speech, frequency)
     */
    public Map<String, SegmentWord> segment(String content) {
        log.debug("Starting segmentation");
        List<Term> termList = this.segmentCategory(content);
        Map<String, SegmentWord> map = new ConcurrentHashMap<>();
        for (Term term : termList) {
            // skip punctuation (part-of-speech tag w); count every other term
            if (term.nature != null && term.nature != Nature.w) {
                map.computeIfAbsent(term.word, k -> new SegmentWord(term.word, term.nature.toString()))
                        .getFrequency().incrementAndGet();
            }
        }
        return map;
    }

    /**
     * Similarity computation.
     *
     * @param s frequency map of text 1
     * @param o frequency map of text 2
     * @return similarity of text 1 and text 2
     */
    public double similarity(Map<String, SegmentWord> s, Map<String, SegmentWord> o) {
        // only words shared by both texts contribute to the numerator (dot product)
        List<String> keys = new ArrayList<>(s.keySet());
        keys.retainAll(o.keySet());
        double dot = keys.stream()
                .mapToDouble(k -> (double) s.get(k).getFrequency().intValue() * o.get(k).getFrequency().intValue())
                .sum();
        // Euclidean norms of the two frequency vectors
        double normS = Math.sqrt(s.values().stream().mapToDouble(v -> Math.pow(v.getFrequency().intValue(), 2)).sum());
        double normO = Math.sqrt(o.values().stream().mapToDouble(v -> Math.pow(v.getFrequency().intValue(), 2)).sum());
        if (normS == 0 || normO == 0) {
            return 0; // one of the texts produced no words: define similarity as 0 instead of dividing by zero
        }
        return dot / (normS * normO);
    }

    public static void main(String[] args) {
        TokenizerUtil util = new TokenizerUtil();
        // "We gave the bananas to the monkeys because they were hungry"
        Map<String, SegmentWord> map = util.segment("我们把香蕉给猴子因为它们饿了");
        // "We can't give the bananas to the monkeys because they are not ripe yet"
        Map<String, SegmentWord> map1 = util.segment("我们不能把香蕉给猴子因为它们还没有成熟");
        System.out.println(util.similarity(map, map1));
        System.out.println(util.similarity(util.segment("abc123"), util.segment("abc")));
    }
}
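For intuition, similarity() computes the cosine similarity of the two frequency vectors: sim(s, o) = Σ fs(w)·fo(w) / (√(Σ fs(w)²) · √(Σ fo(w)²)), where the numerator sums over the words shared by both texts. For example (hypothetical counts, not actual HanLP output), if text 1 yields {香蕉: 2, 猴子: 1} and text 2 yields {香蕉: 1, 猴子: 1}, the similarity is (2·1 + 1·1) / (√5 · √2) = 3/√10 ≈ 0.949. A score near 1 means nearly identical word distributions; texts with no words in common score 0.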