-
以下提示采用了ik分词器和pinyin插件配合
https://github.com/medcl/elasticsearch-analysis-ik/releases https://github.com/medcl/elasticsearch-analysis-pinyin/releases
-
检验ik分词器和拼音插件是否生效
POST /_analyze { "analyzer":"pinyin", "text":"北京东" } POST /_analyze { "analyzer":"ik_max_word", "text":"北京东" }
拼音的分析结果 { "tokens": [ { "token": "bei", "start_offset": 0, "end_offset": 0, "type": "word", "position": 0 }, { "token": "jing", "start_offset": 0, "end_offset": 0, "type": "word", "position": 1 }, { "token": "dong", "start_offset": 0, "end_offset": 0, "type": "word", "position": 2 }, { "token": "bjd", "start_offset": 0, "end_offset": 0, "type": "word", "position": 2 } ] } -------- IK分词分析结果 { "tokens": [ { "token": "北京", "start_offset": 0, "end_offset": 2, "type": "CN_WORD", "position": 0 }, { "token": "京东", "start_offset": 1, "end_offset": 3, "type": "CN_WORD", "position": 1 } ] }
-
建立索引
{ "index": { "analysis": { "analyzer": { "pinyin_analyzer": { "tokenizer": "s-pinyin" }, "first_py_letter_analyzer": { "tokenizer": "first_py_letter" }, "full_pinyin_letter_analyzer": { "tokenizer": "full_pinyin_letter" } }, "tokenizer": { "s-pinyin": { "keep_joined_full_pinyin": "true", "keep_first_letter": "true", "keep_separate_first_letter": "false", "lowercase": "true", "type": "pinyin", "limit_first_letter_length": "16", "keep_original": "true", "keep_full_pinyin": "true", "keep_none_chinese_in_joined_full_pinyin": "true" }, "first_py_letter": { "type": "pinyin", "keep_first_letter": true, "keep_full_pinyin": false, "keep_original": false, "limit_first_letter_length": 16, "lowercase": true, "trim_whitespace": true, "keep_none_chinese_in_first_letter": false, "none_chinese_pinyin_tokenize": false, "keep_none_chinese": true, "keep_none_chinese_in_joined_full_pinyin": true }, "full_pinyin_letter": { "type": "pinyin", "keep_separate_first_letter": false, "keep_full_pinyin": false, "keep_original": false, "limit_first_letter_length": 16, "lowercase": true, "keep_first_letter": false, "keep_none_chinese_in_first_letter": false, "none_chinese_pinyin_tokenize": false, "keep_none_chinese": true, "keep_joined_full_pinyin": true, "keep_none_chinese_in_joined_full_pinyin": true } } } } }
-
建立mapping
{ "suggest-word": { "properties": { "suggest": { "type": "completion", "fields": { "s-pinyin": { "type": "completion", "analyzer": "pinyin_analyzer" }, "keyword-pinyin": { "type": "completion", "analyzer": "full_pinyin_letter_analyzer" }, "keyword-first-py": { "type": "completion", "analyzer": "first_py_letter_analyzer" }, "ik-word":{ "type": "completion", "analyzer": "ik_max_word" }, "standard-word":{ "type": "completion", "analyzer": "standard" } } } } } }
-
查询suggest
{ "suggest": { "text": "美白", "keyword_pinyin": { "completion": { "field": "suggest.keyword-pinyin" } }, "s-pinyin": { "completion": { "field": "suggest.s-pinyin" } }, "standard-word": { "completion": { "field": "suggest.standard-word" } }, "keyword_first_py": { "completion": { "field": "suggest.keyword-first-py" } }, "ik-word": { "completion": { "field": "suggest.ik-word" } } } }
查询会返回 5 组建议结果;实际使用时不应全部采用,而应根据不同的情况选择使用。
ik-word->s-pinyin->keyword_pinyin->keyword_first_py->standard-word
偏差越大的应当放在越后,用于补全等操作。
-
关于词库
词库应该是库内的专业词库,或者从搜索日志里捞出搜索量很大的词汇,充当搜索词建议。
-
Java API
public List<String> suggestWord(String text) { //Set<String> results = new TreeSet<String>() String indexName = "cb_es_ext_word"; CompletionSuggestionBuilder sPinyin = SuggestBuilders.completionSuggestion("suggest_spinyin").prefix(text); CompletionSuggestionBuilder standardWord = SuggestBuilders.completionSuggestion("suggest_standard").prefix(text); CompletionSuggestionBuilder keywordPinyin = SuggestBuilders.completionSuggestion("suggest_pinyin").prefix(text); CompletionSuggestionBuilder ikWord = SuggestBuilders.completionSuggestion("suggest_ik_word").prefix(text); CompletionSuggestionBuilder keywordFirstPy = SuggestBuilders.completionSuggestion("suggest_first_py").prefix(text); CompletionSuggestionBuilder suggestFuzzy = SuggestBuilders.completionSuggestion("suggest").prefix(text,Fuzziness.TWO); SearchRequest searchRequest = new SearchRequest().indices(indexName).types(ElasticSearchConstant.DEFAULT_TYPE_STR).source(new SearchSourceBuilder().suggest( new SuggestBuilder().addSuggestion("s-pinyin", sPinyin) .addSuggestion("standard-word", standardWord) .addSuggestion("keyword_pinyin", keywordPinyin) .addSuggestion("ik-word", ikWord) .addSuggestion("keyword-first-py", keywordFirstPy) .addSuggestion("suggest-fuzzy", suggestFuzzy) )); SearchResponse searchResponse = null; try { LOGGER.debug(" SearchRequest String:" + searchRequest.source().toString()); searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT); } catch (IOException e) { e.printStackTrace(); } //System.out.println(searchResponse); Suggest suggestions = searchResponse.getSuggest(); //合并词条 Map<String,Integer> suggestMap = new LinkedHashMap<>(); handlerSuggest(suggestions,suggestMap,"ik-word"); if(suggestMap.size() < 10){ handlerSuggest(suggestions,suggestMap,"s-pinyin"); handlerSuggest(suggestions,suggestMap,"keyword_pinyin"); } if(suggestMap.size() == 0){ handlerSuggest(suggestions,suggestMap,"standard-word"); } if(suggestMap.size() == 0){ 
handlerSuggest(suggestions,suggestMap,"keyword-first-py"); } if(suggestMap.size() == 0){ // 匹配文本相似度 根据专业词库纠正文本 handlerSuggest(suggestions,suggestMap,"suggest-fuzzy"); } System.out.println(JSON.toJSONString(suggestMap)); /* 1. 全中文词汇 采用ik-word ik分词 和standard 查询。 2. 含有英文和中文的 采用 s-pinyin keyword_pinyin 3. 全英文的就使用拼音 **/ List<String> suggestList = new ArrayList<>(); suggestMap.forEach((key,value)->{ suggestList.add(key); }); return suggestList; } private void handlerSuggest(Suggest suggestions,Map<String,Integer> suggestMap,String suggestName){ List<? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>> results = suggestions.getSuggestion(suggestName).getEntries(); for (Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option> op : results) { List<? extends Suggest.Suggestion.Entry.Option> options = op.getOptions(); for (Suggest.Suggestion.Entry.Option pp : options) { if (suggestMap.containsKey(pp.getText().toString())) { suggestMap.put(pp.getText().toString(), suggestMap.get(pp.getText().toString()) + 1); } else { suggestMap.put(pp.getText().toString(), 1); } } } }
-
使用weight
数据结构
{ "suggest":{ "input":"联想词", "weight":10 } }
使用weight 不能使用 fields 字段 映射
需建立多个字段来分词
{ "mappings": { "content_bank_entity": { "properties": { "suggest_spinyin": { "max_input_length": 50, "analyzer": "pinyin_analyzer", "preserve_position_increments": true, "type": "completion", "preserve_separators": true }, "suggest_standard": { "max_input_length": 50, "analyzer": "standard", "preserve_position_increments": true, "type": "completion", "preserve_separators": true }, "suggest_first_py": { "max_input_length": 50, "analyzer": "first_py_letter_analyzer", "preserve_position_increments": true, "type": "completion", "preserve_separators": true }, "suggest": { "max_input_length": 50, "analyzer": "simple", "preserve_position_increments": true, "type": "completion", "preserve_separators": true }, "suggest_ik_word": { "max_input_length": 50, "analyzer": "ik_max_word", "preserve_position_increments": true, "type": "completion", "preserve_separators": true }, "suggest_pinyin": { "max_input_length": 50, "analyzer": "full_pinyin_letter_analyzer", "preserve_position_increments": true, "type": "completion", "preserve_separators": true } } } } }
搜索语句改成
{ "suggest": { "text": "quchensh", "keyword_pinyin": { "completion": { "field": "suggest_pinyin" } }, "s-pinyin": { "completion": { "field": "suggest_spinyin" } }, "standard-word": { "completion": { "field": "suggest_standard" } }, "keyword_first_py": { "completion": { "field": "suggest_first_py" } }, "ik-word": { "completion": { "field": "suggest_ik_word" } } } }
-
参考资料
主要参考https://blog.youkuaiyun.com/baifanwudi/article/details/88662561 https://blog.youkuaiyun.com/wwd0501/article/details/80885987
https://www.jianshu.com/p/9e2c6a8e1b54
系统学习suggest
https://blog.youkuaiyun.com/supermao1013/article/details/84311057 https://www.cnblogs.com/wangzhuxing/p/9574630.html#_label2
自动纠错:用于英文可以,用于中文不行
https://blog.youkuaiyun.com/Insightzen_xian/article/details/80692366
https://learnku.com/articles/37090
辅助
https://www.jianshu.com/p/8a6b80813a34
自定义分词器
https://www.cnblogs.com/shoufeng/p/10562746.html
ES Mapping、字段类型Field type详解
https://blog.youkuaiyun.com/ZYC88888/article/details/83059040
可以使用百度文本纠错接口
ElasticSearch Suggest 提示(生产使用)
最新推荐文章于 2024-12-31 14:31:30 发布