Reference: "Fixing persistent relative-path issues when configuring HanLP in a Spring Boot project" (CSDN blog)
<!-- HanLP dependency (the portable artifact bundles a lightweight data set) -->
<dependency>
    <groupId>com.hankcs</groupId>
    <artifactId>hanlp</artifactId>
    <version>portable-1.8.4</version>
</dependency>
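With just this dependency on the classpath, the bundled portable data is already usable. A minimal smoke test (the class name and sample sentence here are illustrative, not from the original post):

import com.hankcs.hanlp.HanLP;

public class HanLpSmokeTest {
    public static void main(String[] args) {
        // Segmentation works immediately with the bundled mini data set
        System.out.println(HanLP.segment("商品和服务"));
    }
}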
import com.hankcs.hanlp.corpus.io.IIOAdapter;
import org.springframework.core.io.ClassPathResource;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

public class ResourceFileIoAdapter implements IIOAdapter {
    @Override
    public InputStream open(String path) throws IOException {
        ClassPathResource resource = new ClassPathResource(path);
        // resource.getFile() fails when running on Linux from the packaged JAR,
        // so read through the stream API instead of the line below:
        // return Files.newInputStream(resource.getFile().toPath());
        return resource.getInputStream();
    }

    @Override
    public OutputStream create(String path) throws IOException {
        ClassPathResource resource = new ClassPathResource(path);
        return new FileOutputStream(resource.getFile());
    }
}
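The adapter only takes effect once it is registered as HanLP's global IO adapter. A minimal sketch of one way to wire it up in Spring (HanLP.Config.IOAdapter is HanLP's global hook; the @Configuration class and @PostConstruct placement are illustrative choices):

import com.hankcs.hanlp.HanLP;
import jakarta.annotation.PostConstruct; // javax.annotation.PostConstruct on Spring Boot 2
import org.springframework.context.annotation.Configuration;

@Configuration
public class HanLpConfig {
    @PostConstruct
    public void registerIoAdapter() {
        // Route all of HanLP's resource loading through the classpath adapter above
        HanLP.Config.IOAdapter = new ResourceFileIoAdapter();
    }
}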
Full data set download: http://nlp.hankcs.com/download.php?file=data

After downloading, note that the data is split into dictionaries and models: the dictionaries are required for lexical analysis, while the models are only needed for syntactic parsing. You can add, remove, or replace files as you like; if you don't need syntactic parsing and related features, the model folder can be deleted at any time.
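When using the full data set rather than the portable one, HanLP reads a hanlp.properties file from the classpath root to locate the data. A minimal sketch (the root path is an example, and the adapter class name is the hypothetical package of the class above):

# Parent directory that contains the data/ folder
root=/opt/hanlp/
# Optionally register the IO adapter by fully-qualified class name instead of in code:
# IOAdapter=com.example.ResourceFileIoAdapter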
Practical usage
Extracting keywords from text
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;

import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

// Extract noun-only keywords from text
public static String extractOptimizedKeywords(String text, int keywordCount) {
    // Extract candidate phrases
    List<String> termList = HanLP.extractPhrase(text, 100);
    // Keep only phrases made up entirely of nouns
    List<String> termNounsList = filterOnlyNounsFromPhrases(termList);
    System.out.println(termNounsList); // debug: show the noun-only candidates
    // Compute TF-IDF scores and sort in descending order
    Map<String, Double> tfidfMap = calculateTfidf(termNounsList, text);
    List<Map.Entry<String, Double>> sortedKeywords = tfidfMap.entrySet().stream()
            .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
            .collect(Collectors.toList());
    // Take the top keywordCount compound keywords
    List<String> keywords = sortedKeywords.stream()
            .limit(keywordCount)
            .map(Map.Entry::getKey)
            .collect(Collectors.toList());
    // Join the keyword list into a comma-separated string
    return String.join(", ", keywords);
}
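A quick way to try it (a sketch; the sample text is arbitrary and the printed result will depend on your data set):

public static void main(String[] args) {
    String text = "自然语言处理是人工智能和语言学领域的分支学科,研究计算机与人类语言之间的交互。";
    // Prints a comma-separated string of the top 3 noun keywords
    System.out.println(extractOptimizedKeywords(text, 3));
}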
// Keep only phrases whose every token is a noun
public static List<String> filterOnlyNounsFromPhrases(List<String> phrases) {
    List<String> nounOnlyPhrases = new ArrayList<>();
    for (String phrase : phrases) {
        // Segment the phrase and tag parts of speech
        List<Term> terms = HanLP.segment(phrase);
        // Check whether every token in the phrase is a noun
        // (natures starting with "n" cover n, nr, ns, nt, nz, ...)
        boolean allNouns = terms.stream()
                .allMatch(term -> term.nature.toString().startsWith("n"));
        // Keep the phrase only if all of its tokens are nouns
        if (allNouns) {
            nounOnlyPhrases.add(phrase);
        }
    }
    return nounOnlyPhrases;
}
// Compute TF-IDF scores for the candidate terms
private static Map<String, Double> calculateTfidf(List<String> terms, String corpus) {
    Map<String, Integer> termFrequency = terms.stream()
            .collect(Collectors.groupingBy(Function.identity(), Collectors.summingInt(v -> 1)));
    Map<String, Integer> documentFrequency = calculateDocumentFrequency(corpus);
    Map<String, Double> tfidf = new HashMap<>();
    for (Map.Entry<String, Integer> entry : termFrequency.entrySet()) {
        String term = entry.getKey();
        int freq = entry.getValue();
        int df = documentFrequency.getOrDefault(term, 1); // keep df >= 1 to avoid division by zero
        double tf = (double) freq / terms.size();
        // Adjusted IDF: with only one document, use the corpus length as a proxy for the
        // document count; cast before dividing so the division isn't truncated to an int
        double idf = Math.log(1.0 + (double) corpus.length() / df);
        double tfidfValue = tf * idf;
        tfidf.put(term, tfidfValue);
    }
    return tfidf;
}
// Count how often each noun appears in the corpus (document-frequency proxy)
private static Map<String, Integer> calculateDocumentFrequency(String corpus) {
    List<Term> docTerms = HanLP.segment(corpus);
    return docTerms.stream()
            .filter(term -> term.nature.toString().startsWith("n")) // startsWith("n") matches any noun nature
            .collect(Collectors.groupingBy(term -> term.word, Collectors.summingInt(v -> 1)));
}
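For comparison, HanLP also ships a built-in TextRank-based keyword extractor; if the noun-only filtering above isn't needed, a one-liner from HanLP's public API does the job:

// Built-in TextRank keyword extraction, without the custom noun filter
List<String> keywords = HanLP.extractKeyword(text, 5);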