elasticsearch 实战-CSDN博客

本文链接：https://blog.csdn.net/weixin_45627802/article/details/112978065

Elastic Stack传统上由三个主要组件（Elasticsearch、Logstash和Kibana）组成，但如今早已不限于这种组合，还可以与名为“Beats”的第四个组件结合使用——它是一个针对不同用例的轻量级日志采集器（shipper）系列。现在网上有一种说法叫做ELKB，这里的B指的就是Beats。

在集中式日志记录中，数据管道包括三个主要阶段：聚合、处理和存储。在ELK堆栈中，前两个阶段传统上由堆栈的主力组件Logstash负责。执行这些任务需要付出一定的代价：由于Logstash设计上的固有问题，性能问题经常发生，尤其是在复杂的管道需要大量处理的情况下。于是，将Logstash的部分职责外包的想法应运而生，尤其是将数据提取任务转移到其他工具上。

实战

插入测试数据

POST forum_article/_bulk
{"index":{"_id":1}}
{"articleID":"article_1","userID":1,"hidden":false,"postDate":"2017-01-01"}
{"index":{"_id":2}}
{"articleID":"article_2","userID":1,"hidden":true,"postDate":"2017-01-02"}
{"index":{"_id":3}}
{"articleID":"article_3","userID":2,"hidden":false,"postDate":"2017-01-03"}
{"index":{"_id":4}}
{"articleID":"article_4","userID":2,"hidden":true,"postDate":"2017-01-04"}
POST forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"tag":["java","hadoop"]}}
{"update":{"_id":2}}
{"doc":{"tag":["java"]}}
{"update":{"_id":3}}
{"doc":{"tag":["hadoop"]}}
{"update":{"_id":4}}
{"doc":{"tag":["java","elasticsearch"]}}
POST /forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"title":"this is java,hadoop blog"}}
{"update":{"_id":2}}
{"doc":{"title":"this is java blog"}}
{"update":{"_id":3}}
{"doc":{"title":"this is hadoop blog"}}
{"update":{"_id":4}}
{"doc":{"title":"this is java,elasticsearch blog"}}
当type=text，默认会设置两个field，一个是field本身，是分词的；还有一个是field.keyword，默认不分词，且ignore_above为256，即超过256个字符的值不会被索引到keyword子字段中（不是截断保留前256个字符）。
{
  "forum_article" : {
    "mappings" : {
      "properties" : {
        "articleID" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "hidden" : {
          "type" : "boolean"
        },
        "postDate" : {
          "type" : "date"
        },
        "userID" : {
          "type" : "long"
        }
      }
    }
  }
}

term filter/query，对搜索文本不分词，直接拿去倒排索引中匹配
POST forum_article/_search?scroll=1m
{
  "query":{
    "constant_score": {
      "filter": {
        "term": {
          "userID": 1
        }
      },
      "boost": 1.2
    }
  },
  "size":1
}
POST forum_article/_search articleID建立倒排索引会分词，所以不分词搜索可以使用articleID.keyword
{
  "query":{
    "constant_score": {
      "filter": {
        "term": {
          "articleID.keyword": "article_1"
        }
      },
      "boost": 1.2
    }
  }
}
POST forum_article/_search   bool组合多个filter条件
{
  "query":{
    "constant_score": {
      "filter": {
        "bool": {
          "should":[
            {
              "term":{
                "postDate":"2017-01-01"
              }
            },
            { "bool":{
                "must":[
                  {
                    "term":{
                      "articleID":"article_1"
                    }
                  },
                  {
                    "term":{
                      "userID":1
                    }
                  }
                ]
              }
            }
          ]
              
        }
      },
      "boost": 1.2
    }
  }
}
POST forum_article/_search   terms搜索多个值
{
  "query": {
    "constant_score": {
      "filter": {
        "bool": {
          "must":[
            {
              "terms":{
                "tag":["java","elasticsearch"]
              }
            },
            {
              "term":{
                "articleID":"article_2"
              }
            }
          ]
        }
      },
      "boost": 1.2
    }
  }
}
POST forum_article/_search   range范围搜索，gt,lt,gte,lte
{
  "query": {
    "constant_score": {
      "filter": {
        "bool": {
          "must":[
            {
              "terms":{
                "tag":["java","elasticsearch"]
              }
            },
            {
              "range":{
                "postDate":{
                  "gt":"2017-01-01||-1m"
                }
              }
            }
          ]
        }
      },
      "boost": 1.2
    }
  }
}

POST /forum_article/_search   手动控制全文检索精准度
{
  "query": {
    "match": {
      "title": {
        "query": "java hadoop",
        "minimum_should_match": "50%"
      }
    }
  }
}
POST /forum_article/_search
{
  "query": {
    "match": {
      "title": {
        "query": "java hadoop",
        "operator": "and"
      }
    }
  }
}
POST /forum_article/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "title": "java"
          }
        },
        {
          "match": {
            "title": "hadoop"
          }
        },
        {
          "match": {
            "title": "elasticsearch"
          }
        },
        {
          "match": {
            "title": "spark"
          }
        }
      ],
      "minimum_should_match": 2
    }
  }
}
POST /forum_article/_search   dis_max实现best field策略
{
  "query": {
    "dis_max": {
      "tie_breaker": 0.7,
      "boost": 1.2,
      "queries": [
        {
          "bool": {
            "should": [
              {
                "match": {
                  "title": "java elasticsearch"
                }
              },
              {
                "match": {
                  "tag": "java elasticsearch"
                }
              }
            ],
            "minimum_should_match": 2
          }
        }
      ]
    }
  }
}
POST /forum_article/_search   multi_match实现dis_max best fields策略
{
  "query": {
    "multi_match": {
      "query": "java elasticsearch",
      "fields": ["title^2","tag"],
      "type": "best_fields",
      "minimum_should_match": 1
    }
  }
}
POST forum_article/_mapping   分词器english会在建立倒排索引时对document进行时态、大小写等转换，搜索结果会受影响，所以添加一个使用standard分词器（不进行时态等转换）的子field用于搜索
{
  "properties":{
    "sub_title":{
      "type":"text",
      "analyzer":"english",
      "fields":{
        "std":{
          "type":"text",
          "analyzer":"standard"
        }
      }
    }
  }
}
POST forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"sub_title":"learning more courses"}}
{"update":{"_id":2}}
{"doc":{"sub_title":"learned a lot of course"}}
{"update":{"_id":3}}
{"doc":{"sub_title":"learning course"}}
{"update":{"_id":4}}
{"doc":{"sub_title":"hello world"}}
POST forum_article/_search    使用most_fields策略
{
  "query": {
    "multi_match": {
      "query": "learned course",
      "type": "most_fields", 
      "fields": ["sub_title","sub_title.std"]
    }
  }
}
POST forum_article/_search  cross_fields+and实现要求每个term都在某个field出现
{
  "query": {
    "multi_match": {
      "query": "liang lily",
      "type": "cross_fields", 
      "operator": "and", 
      "fields": ["author_first_name","author_last_name"]
    }
  }
}
POST forum_article/_mapping   使用copy_to实现cross fields
{
  "properties":{
    "author_first_name":{
      "type":"text",
      "copy_to":"author_full_name"
    },
    "author_last_name":{
      "type":"text",
      "copy_to":"author_full_name"
    },
    "author_full_name":{
      "type":"text"
    }
  }
}
POST forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"author_first_name":"lily","author_last_name":"liang"}}
{"update":{"_id":2}}
{"doc":{"author_first_name":"peter","author_last_name":"ma"}}
{"update":{"_id":3}}
{"doc":{"author_first_name":"robbin","author_last_name":"li"}}
{"update":{"_id":4}}
{"doc":{"author_first_name":"smith","author_last_name":"zhou"}}

POST forum_article/_search  match_phrase通过slop决定搜索在field中的距离
{
  "query": {
    "match_phrase": {
      "title": "java blog"
    }
  }
}
POST forum_article/_search
{
  "query": {
    "match_phrase": {
      "title": {
        "query": "java blog",
        "slop": 1
      }
    }
  }
}
POST forum_article/_search  混合使用match和match_phrase实现召回率和精准度
{
  "query": {
    "bool": {
      "must": [
        {"match": {
          "title": "java blog"
        }}
      ],
      "should": [
        {
          "match_phrase": {
            "title": {
              "query": "java blog"
            }
          }
        }
      ]
    }
  }
}
POST forum_article/_search    match+rescore+match_phrase
{
  "query": {
    "bool": {
      "must": [
        {"match": {
          "title": "java blog"
        }}
      ]
    }
  },
  "rescore": {
    "query": {
      "rescore_query":{
        "match_phrase": {
          "title": "java blog"
        }
      }
    },
    "window_size": 50
  }
}
POST forum_article/_search    prefix前缀搜索，不计算score
{
  "query": {
    "bool": {
      "must": [
        {"prefix": {
          "title": "this"
        }}
      ]
    }
  }
}
POST forum_article/_search   搜索推荐
{
  "query": {
    "bool": {
      "must": [
        {"match_phrase_prefix": {
          "title": {
            "query":"this is ja*",
            "max_expansions": 1
          }
        }}
      ]
    }
  }
}
PUT forum_article1/    edge_ngram
{
  "settings": {
    "analysis": {
      "filter": {
        "autocomplete_filter":{
          "type":"edge_ngram",
          "min_gram":3,
          "max_gram":20
        }
      },
      "analyzer": {
        "autocomplete":{
          "type":"custom",
          "tokenizer":"standard",
          "filter":["lowercase","autocomplete_filter"]
        }
      }
    }
  }
}

知识点

term filter：根据exact value进行搜索，数字，boolean，date天然支持；text需要建索引时指定type为keyword，才能用term query。
filter过滤执行原理
在倒排索引中查找，得到对应的document list -> 根据document list构建bitset（一个二进制数组，每个元素都是0或1，标识一个document对一个filter条件是否匹配） -> 遍历每个过滤条件对应的bitset，查找满足所有条件的document -> 同一个filter的执行次数达到一定次数时，会将其bitset进行cache，下次搜索不用再扫描倒排索引；对于小segment(document<1000,或3%)不缓存其bitset -> filter大部分情况下在query之前执行，尽可能多地过滤掉数据 -> 如果document有新增或修改，cache bitset会自动更新。
全文检索，使用match query，should;控制精准度：operator,minimum_should_match；match底层会转换为term+should/must。
多shard场景下，relevance score不准确。默认local shard去计算relevance score。
best fields策略：某一个field中匹配到尽可能多关键词的document被排在前面，即以最匹配的那个field的分数为主。
most fields策略：尽可能让有更多field匹配到关键词的document排在前面，各field的匹配更均匀。
cross_fields策略：把多个field当作一个大的field来搜索，配合operator为and时，要求每个term都至少在其中某一个field中出现（例如姓和名分别存放在两个field中的场景）。
用copy_to,将多个字段的值复制到一个字段（隐藏字段），并建立倒排索引
match_phrase，近似匹配，尽量让离得近的搜索词的document优先返回；term position；通过slop决定搜索在field中的距离。
召回率和精准度。混合使用match和match_phrase实现召回率和精准度。
rescore。match匹配出来很多的doc，但我们一般只需要前面的几个doc，所以可以用match_phrase只对排在前面的若干个doc（由window_size指定）重新打分(rescore)。
前缀搜索prefix(扫描到了一个前缀匹配的term不能停，必须继续搜索，直到扫描完整个的倒排索引)，wildcard通配符搜索,regexp正则搜索；搜索性能都很差，都要搜索整个倒排索引。
搜索推荐，match_phrase_prefix，将最后一个term作为前缀搜索，max_expansions,指定prefix最多匹配多少个term就足够了，限定性能
ngram，edge ngram(解决前缀搜索)：在index time就把每个term按前缀切分成多个ngram并建立倒排索引，搜索时直接匹配term即可，不需要再扫描整个倒排索引。