接上篇文章《使用ES对一段中文进行分词》(_zyydd_的博客-CSDN博客)。将某一段文字进行分词之后,会得到一个List<String>。本篇文章讲的是:对该List进行统计,得到每个词出现的次数,组织成一个Map<String, Long>。之后若存在多个类似的Map,则将这些Map进行合并(相同key的数量相加)。合并完成之后,得到一个总的Map,对该Map按出现次数从高到低排序,得到词语出现次数的排行(也就是热词)。废话少说,上代码:
import org.apache.commons.collections.CollectionUtils;
import java.util.*;
import java.util.stream.Collectors;
/**
* 词云工具类
*/
public class FrequencyUtils {
public static void main(String[] args) {
List<String> a = JsonMapper.fromJson("[\"点击\",\"上方\",\"蓝字\",\"关注\",\"我们\",\"全体\",\"教职员工\",\"教职员\",\"教职\",\"职员\",\"员工\",\"家长\",\"朋友们\",\"朋友\",\"们\",\"你们\",\"好\",\"快乐\",\"而\",\"充实\",\"的\",\"暑期\",\"生活\",\"即将\",\"结束\",\"新学期\",\"新学\",\"学期\",\"的\",\"各项工作\",\"各项\",\"工作\",\"即将\",\"开启\",\"鉴于\",\"目前国内\",\"目前\",\"国内\",\"省内\",\"严峻\",\"复杂\",\"的\",\"疫情\",\"情形\",\"形势\",\"为\",\"进一步\",\"进一\",\"一步\",\"一\",\"步\",\"做好\",\"幼儿园\",\"幼儿\",\"园\",\"疫情\",\"防\",\"控\",\"工作\",\"为\",\"秋季\",\"开学\",\"创造\",\"良好条件\",\"良好\",\"条件\",\"确保\",\"返\",\"园\",\"后\",\"正常\",\"的\",\"教育\",\"教学秩序\",\"教学\",\"秩序\",\"现\",\"温馨\",\"提示\",\"如下\",\"一\",\"做好\",\"返\",\"安\",\"准备\",\"广大\",\"教职员工\",\"教职员\",\"教职\",\"职员\",\"员工\",\"及\",\"幼儿\",\"根据\",\"开学\",\"学时\",\"时间\",\"以及\",\"疫情\",\"情形\",\"形势\",\"变化\",\"预留\",\"留足\",\"足够\",\"时间\",\"至少\",\"少提\",\"提前\",\"7\",\"天\",\"返\",\"安\",\"或\",\"返回\",\"居住地\",\"居住\",\"住地\",\"即\",\"全体\",\"教师\",\"于\",\"2022\",\"年\",\"8\",\"月\",\"20\",\"日\",\"零时\",\"零\",\"时\",\"前\",\"返\",\"安\",\"全体\",\"幼儿\",\"于\",\"2022\",\"年\",\"8\",\"月\",\"24\",\"日\",\"零时\",\"零\",\"时\",\"前\",\"返\",\"安\",\"并\",\"严格\",\"落实\",\"实属\",\"属地\",\"单位\",\"报备\",\"社区\",\"报备\",\"健康\",\"管理\",\"要求\",\"二\",\"做好\",\"健康\",\"监测\",\"建议\",\"从\",\"外地\",\"返\",\"安\",\"的\",\"教职工\",\"教职\",\"职工\",\"幼儿\",\"及\",\"家长\",\"自觉\",\"进行\",\"3\",\"天\",\"2\",\"次\",\"核酸\",\"检测\",\"至少\",\"少间\",\"间隔\",\"24\",\"小时\",\"时\",\"并\",\"做好\",\"7\",\"天\",\"自我\",\"健康\",\"监测\",\"前\",\"3\",\"天\",\"原则上\",\"原则\",\"上\",\"两点\",\"两\",\"点\",\"一线\",\"一\",\"线\",\"少\",\"聚集\",\"少\",\"聚会\",\"时刻\",\"关注\",\"自己\",\"和家人\",\"家人\",\"的\",\"身体状况\",\"身体\",\"状况\",\"如\",\"出现\",\"发热\",\"干咳\",\"乏力\",\"嗅\",\"味\",\"觉\",\"减退\",\"鼻塞\",\"流涕\",\"咽\",\"痛\",\"结膜炎\",\"结膜\",\"膜炎\",\"肌\",\"痛\",\"和\",\"腹泻\",\"等\",\"症状\",\"及时\",\"到\",\"附近\",\"的\",\"发热\",\"热门\",\"门诊\",\"进行\",\"排查\",\"和\",\"诊疗\",\"就医\",\"过程\",\"尽量\",\"避免\",\"乘坐\",\"公共交通\",\"公共\",\"交通工具\",\"交通\",\"工具\",\"三\",\"做好\",\"重点\",\"防\",\"控\",\"近\",\"7\",\"日内\",\"日\",\"内有\",\"中\",\"高风险\",\"高风\",\"风险\",\"险区\",\"旅居\",\"或与\",\"相关\",\"关人\",\"人员\",\"有\",\"密切接触\",\"密切
\",\"接触\",\"的\",\"教师\",\"幼儿\",\"返\",\"安\",\"前\",\"48\",\"小时\",\"向\",\"目的地\",\"目的\",\"地\",\"社区\",\"报备\",\"在\",\"抵\",\"安\",\"后\",\"12\",\"小时内\",\"小时\",\"时\",\"内向\",\"目的地\",\"目的\",\"地\",\"社区\",\"和\",\"幼儿园\",\"幼儿\",\"园\",\"报告\",\"并\",\"配合\",\"合做\",\"做好\",\"信息\",\"登记\",\"核酸\",\"检测\",\"集中\",\"中隔\",\"隔离\",\"或\",\"居家\",\"健康\",\"监测\",\"等\",\"管\",\"控\",\"措施\",\"四\",\"做好\",\"健康\",\"登记\",\"如实\",\"填写\",\"汉滨区\",\"铁路\",\"幼儿园\",\"幼儿\",\"园\",\"疫情\",\"防\",\"控\",\"返\",\"园\",\"承诺书\",\"承诺\",\"书\",\"及\",\"返\",\"园\",\"前\",\"健康\",\"监测\",\"登记表\",\"登记\",\"表\",\"并在\",\"开学\",\"当天\",\"天上\",\"上交\",\"纸质\",\"版\",\"给\",\"班级\",\"教师\",\"电子表格\",\"电子表\",\"电子\",\"子表\",\"表格\",\"已\",\"发至\",\"班级\",\"群\",\"新学期\",\"新学\",\"学期\",\"开学\",\"在即\",\"让我们\",\"我们\",\"一起\",\"一\",\"起\",\"做好\",\"返\",\"园\",\"前\",\"各项\",\"防\",\"控\",\"工作\",\"确保全\",\"确保\",\"保全\",\"全体\",\"教职工\",\"教职\",\"职工\",\"及\",\"幼儿\",\"安全\",\"返\",\"园\",\"祝\",\"大家\",\"身体健康\",\"身体\",\"健康\",\"暑假\",\"愉快\",\"汉滨区\",\"铁路\",\"幼儿园\",\"幼儿\",\"园\",\"2022\",\"年\",\"8\",\"月\",\"19\",\"日\",\"扫\",\"码\",\"关注\",\"分享\",\"给\",\"第一个\",\"第一\",\"一个\",\"一\",\"个\",\"想到\",\"的人\"]\n", List.class);
Map<String, Long> master = frequencyOfListQ(a, 3);//这个3表示,只有频率出现3次及以上的时候,才会被统计进来
System.out.println("List<String> 出现次数统计:" + JsonMapper.toJson(master));
Map<String, Long> s = new HashMap<>();
s.put("教职", 32L);
s.put("教职new_new_new", 102L);
mapMerge(master, s);//这里是s向master中合并,最后master中越来越多
System.out.println("Map合并后结果:" + master);
List<TermResult> tt = mapSort(master, 5);//这个5是说你最后想要多少个
System.out.println("Map排序后结果:" + JsonMapper.toJson(tt));
}
/**
 * Counts how often each word occurs in a token list and keeps only the words
 * that are at least two characters long and occur at least {@code min} times.
 *
 * <p>Word length is measured in Unicode code points, so a single character
 * outside the Basic Multilingual Plane (stored as a surrogate pair) is treated
 * as one character and dropped like any other single-character token.
 *
 * @param falcons tokens to count; {@code null} or empty yields an empty map,
 *                and {@code null} elements are skipped
 * @param min     minimum number of occurrences a word needs in order to be kept;
 *                {@code null} disables the occurrence threshold
 * @return a new, mutable {@link HashMap} from word to occurrence count
 */
public static Map<String, Long> frequencyOfListQ(List<String> falcons, Integer min) {
    // Always hand back a HashMap the caller owns: mapMerge mutates this result.
    Map<String, Long> counts = new HashMap<>();
    if (falcons == null || falcons.isEmpty()) {
        return counts;
    }
    for (String word : falcons) {
        if (word != null) {
            counts.merge(word, 1L, Long::sum);
        }
    }
    // Resolve the threshold once; unboxing a null Integer inside the filter would throw.
    final long threshold = (min == null) ? Long.MIN_VALUE : min;
    counts.entrySet().removeIf(entry -> {
        String word = entry.getKey();
        boolean tooShort = word.codePointCount(0, word.length()) < 2;
        return tooShort || entry.getValue() < threshold;
    });
    return counts;
}
/**
 * Merges the counts of {@code s} into {@code master} in place: a key present in
 * both maps ends up with the sum of the two counts, and a key present only in
 * {@code s} is added to {@code master}. {@code s} itself is not modified.
 *
 * <p>Empty maps are accepted: an empty {@code master} is a valid starting
 * accumulator, and merging an empty {@code s} changes nothing.
 *
 * @param master the accumulating map that receives the merged counts; must not be {@code null}
 * @param s      the map whose counts are added into {@code master}; must not be {@code null}
 * @throws IllegalArgumentException if {@code master} or {@code s} is {@code null}
 */
public static void mapMerge(Map<String, Long> master, Map<String, Long> s) {
    // IllegalArgumentException is a RuntimeException, so callers that catch the
    // previously thrown RuntimeException keep working.
    if (master == null) {
        throw new IllegalArgumentException("map合并,master为空");
    }
    if (s == null) {
        throw new IllegalArgumentException("map合并,s为空");
    }
    s.forEach((key, value) -> master.merge(key, value, Long::sum));
}
/**
 * Ranks the entries of a count map from the highest count to the lowest and
 * returns at most {@code maxSize} of them as {@code TermResult} objects.
 *
 * <p>The sort is stable, so entries with equal counts stay in the order in
 * which the map's entry set yields them.
 *
 * @param map     word-to-count map; {@code null} or empty yields an empty list
 * @param maxSize upper bound on the number of results; zero or a negative value yields an empty list
 * @return a new list with the highest-count entries first
 */
public static List<TermResult> mapSort(Map<String, Long> map, int maxSize) {
    List<TermResult> top = new ArrayList<>();
    if (map == null || map.isEmpty()) {
        return top;
    }
    // Sort a copy of the entries so the map itself is left untouched.
    List<Map.Entry<String, Long>> ranked = new ArrayList<>(map.entrySet());
    ranked.sort(Map.Entry.comparingByValue(Comparator.reverseOrder()));
    Iterator<Map.Entry<String, Long>> cursor = ranked.iterator();
    while (top.size() < maxSize && cursor.hasNext()) {
        Map.Entry<String, Long> entry = cursor.next();
        top.add(new TermResult(entry.getKey(), entry.getValue()));
    }
    return top;
}
/**
 * A word ({@code key}) paired with its occurrence count; {@code mapSort} builds
 * one instance per ranked map entry.
 *
 * <p>Instances are mutable. Two instances are equal when both key and count are
 * equal, so do not change an instance while it is stored in a hash-based collection.
 */
static class TermResult {
    private String key;
    private Long count;

    /**
     * @param key   the word
     * @param count how many times the word occurred
     */
    TermResult(String key, Long count) {
        this.key = key;
        this.count = count;
    }

    /** @return the word */
    public String getKey() {
        return key;
    }

    /** @param key the word */
    public void setKey(String key) {
        this.key = key;
    }

    /** @return how many times the word occurred */
    public Long getCount() {
        return count;
    }

    /** @param count how many times the word occurred */
    public void setCount(Long count) {
        this.count = count;
    }

    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof TermResult)) {
            return false;
        }
        TermResult that = (TermResult) other;
        return Objects.equals(key, that.key) && Objects.equals(count, that.count);
    }

    @Override
    public int hashCode() {
        return Objects.hash(key, count);
    }

    /** Readable form for logs and debugging instead of the default identity string. */
    @Override
    public String toString() {
        return "TermResult{key='" + key + "', count=" + count + "}";
    }
}
}