elasticsearch 实战

Elastic Stack 传统上由三个主要组件(Elasticsearch、Logstash 和 Kibana)组成,但如今早已不限于这种组合,还可以与名为“Beats”的第四个组件结合使用——这是一个针对不同用例的轻量级日志采集器(log shipper)系列。现在网上有一种说法叫做ELKB,这里的B指的就是Beats。

在集中式日志记录中,数据管道包括三个主要阶段:聚合、处理和存储。在ELK堆栈中,前两个阶段传统上由堆栈的主力组件Logstash负责,但执行这些任务需要付出一定的代价。由于Logstash设计上的内在问题,性能问题经常发生,尤其是在复杂管道需要大量处理的情况下。因此,将Logstash的部分职责外包的想法应运而生,尤其是将数据采集任务转移到其他工具上。

实战

  1. 插入测试数据
POST forum_article/_bulk
{"index":{"_id":1}}
{"articleID":"article_1","userID":1,"hidden":false,"postDate":"2017-01-01"}
{"index":{"_id":2}}
{"articleID":"article_2","userID":1,"hidden":true,"postDate":"2017-01-02"}
{"index":{"_id":3}}
{"articleID":"article_3","userID":2,"hidden":false,"postDate":"2017-01-03"}
{"index":{"_id":4}}
{"articleID":"article_4","userID":2,"hidden":true,"postDate":"2017-01-04"}
POST forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"tag":["java","hadoop"]}}
{"update":{"_id":2}}
{"doc":{"tag":["java"]}}
{"update":{"_id":3}}
{"doc":{"tag":["hadoop"]}}
{"update":{"_id":4}}
{"doc":{"tag":["java","elasticsearch"]}}
POST /forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"title":"this is java,hadoop blog"}}
{"update":{"_id":2}}
{"doc":{"title":"this is java blog"}}
{"update":{"_id":3}}
{"doc":{"title":"this is hadoop blog"}}
{"update":{"_id":4}}
{"doc":{"title":"this is java,elasticsearch blog"}}
字符串字段在动态映射(dynamic mapping)下默认为type=text,并且默认会设置两个field:一个是field本身,是分词的;还有一个是field.keyword,默认不分词,且ignore_above为256,即长度超过256个字符的值不会被索引到该keyword子字段中。
{
  "forum_article" : {
    "mappings" : {
      "properties" : {
        "articleID" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "hidden" : {
          "type" : "boolean"
        },
        "postDate" : {
          "type" : "date"
        },
        "userID" : {
          "type" : "long"
        }
      }
    }
  }
}

term filter/query,对搜索文本不分词,直接拿去倒排索引中匹配
POST forum_article/_search?scroll=1m
{
  "query":{
    "constant_score": {
      "filter": {
        "term": {
          "userID": 1
        }
      },
      "boost": 1.2
    }
  },
  "size":1
}
POST forum_article/_search articleID建立倒排索引会分词,所以不分词搜索可以使用articleID.keyword
{
  "query":{
    "constant_score": {
      "filter": {
        "term": {
          "articleID.keyword": "article_1"
        }
      },
      "boost": 1.2
    }
  }
}
POST forum_article/_search   bool组合多个filter条件
{
  "query":{
    "constant_score": {
      "filter": {
        "bool": {
          "should":[
            {
              "term":{
                "postDate":"2017-01-01"
              }
            },
            { "bool":{
                "must":[
                  {
                    "term":{
                      "articleID":"article_1"
                    }
                  },
                  {
                    "term":{
                      "userID":1
                    }
                  }
                ]
              }
            }
          ]
              
        }
      },
      "boost": 1.2
    }
  }
}
POST forum_article/_search   terms搜索多个值
{
  "query": {
    "constant_score": {
      "filter": {
        "bool": {
          "must":[
            {
              "terms":{
                "tag":["java","elasticsearch"]
              }
            },
            {
              "term":{
                "articleID":"article_2"
              }
            }
          ]
        }
      },
      "boost": 1.2
    }
  }
}
POST forum_article/_search   range范围搜索,gt,lt,gte,lte
{
  "query": {
    "constant_score": {
      "filter": {
        "bool": {
          "must":[
            {
              "terms":{
                "tag":["java","elasticsearch"]
              }
            },
            {
              "range":{
                "postDate":{
                  "gt":"2017-01-01||-1m"
                }
              }
            }
          ]
        }
      },
      "boost": 1.2
    }
  }
}

POST /forum_article/_search   手动控制全文检索精准度
{
  "query": {
    "match": {
      "title": {
        "query": "java hadoop",
        "minimum_should_match": "50%"
      }
    }
  }
}
POST /forum_article/_search
{
  "query": {
    "match": {
      "title": {
        "query": "java hadoop",
        "operator": "and"
      }
    }
  }
}
POST /forum_article/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "title": "java"
          }
        },
        {
          "match": {
            "title": "hadoop"
          }
        },
        {
          "match": {
            "title": "elasticsearch"
          }
        },
        {
          "match": {
            "title": "spark"
          }
        }
      ],
      "minimum_should_match": 2
    }
  }
}
POST /forum_article/_search   dis_max实现best field策略
{
  "query": {
    "dis_max": {
      "tie_breaker": 0.7,
      "boost": 1.2,
      "queries": [
        {
          "match": {
            "title": "java elasticsearch"
          }
        },
        {
          "match": {
            "tag": "java elasticsearch"
          }
        }
      ]
    }
  }
}
POST /forum_article/_search   multi_match实现dis_max best fields策略
{
  "query": {
    "multi_match": {
      "query": "java elasticsearch",
      "fields": ["title^2","tag"],
      "type": "best_fields",
      "minimum_should_match": 1
    }
  }
}
POST forum_article/_mapping   分词器english会对document的倒排索引进行时态、大小写等转换,搜索结果会受到影响,所以添加一个使用standard分词器(不进行时态等转换)的子field用于搜索
{
  "properties":{
    "sub_title":{
      "type":"text",
      "analyzer":"english",
      "fields":{
        "std":{
          "type":"text",
          "analyzer":"standard"
        }
      }
    }
  }
}
POST forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"sub_title":"learning more courses"}}
{"update":{"_id":2}}
{"doc":{"sub_title":"learned a lot of course"}}
{"update":{"_id":3}}
{"doc":{"sub_title":"learning course"}}
{"update":{"_id":4}}
{"doc":{"sub_title":"hello world"}}
POST forum_article/_search    使用most_fields策略
{
  "query": {
    "multi_match": {
      "query": "learned course",
      "type": "most_fields", 
      "fields": ["sub_title","sub_title.std"]
    }
  }
}
POST forum_article/_search  cross_fields+and实现要求每个term都在某个field出现
{
  "query": {
    "multi_match": {
      "query": "liang lily",
      "type": "cross_fields", 
      "operator": "and", 
      "fields": ["author_first_name","author_last_name"]
    }
  }
}
POST forum_article/_mapping   使用copy_to实现cross fields
{
  "properties":{
    "author_first_name":{
      "type":"text",
      "copy_to":"author_full_name"
    },
    "author_last_name":{
      "type":"text",
      "copy_to":"author_full_name"
    },
    "author_full_name":{
      "type":"text"
    }
  }
}
POST forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"author_first_name":"lily","author_last_name":"liang"}}
{"update":{"_id":2}}
{"doc":{"author_first_name":"peter","author_last_name":"ma"}}
{"update":{"_id":3}}
{"doc":{"author_first_name":"robbin","author_last_name":"li"}}
{"update":{"_id":4}}
{"doc":{"author_first_name":"smith","author_last_name":"zhou"}}

POST forum_article/_search  match_phrase通过slop决定搜索在field中的距离
{
  "query": {
    "match_phrase": {
      "title": "java blog"
    }
  }
}
POST forum_article/_search
{
  "query": {
    "match_phrase": {
      "title": {
        "query": "java blog",
        "slop": 1
      }
    }
  }
}
POST forum_article/_search  混合使用match和match_phrase实现召回率和精准度
{
  "query": {
    "bool": {
      "must": [
        {"match": {
          "title": "java blog"
        }}
      ],
      "should": [
        {
          "match_phrase": {
            "title": {
              "query": "java blog"
            }
          }
        }
      ]
    }
  }
}
POST forum_article/_search    match+rescore+match_phrase
{
  "query": {
    "bool": {
      "must": [
        {"match": {
          "title": "java blog"
        }}
      ]
    }
  },
  "rescore": {
    "query": {
      "rescore_query":{
        "match_phrase": {
          "title": "java blog"
        }
      }
    },
    "window_size": 50
  }
}
POST forum_article/_search    prefix前缀搜索,不计算score
{
  "query": {
    "bool": {
      "must": [
        {"prefix": {
          "title": "this"
        }}
      ]
    }
  }
}
POST forum_article/_search   搜索推荐
{
  "query": {
    "bool": {
      "must": [
        {"match_phrase_prefix": {
          "title": {
            "query":"this is ja",
            "max_expansions": 1
          }
        }}
      ]
    }
  }
}
PUT forum_article1/    edge_ngram
{
  "settings": {
    "analysis": {
      "filter": {
        "autocomplete_filter":{
          "type":"edge_ngram",
          "min_gram":3,
          "max_gram":20
        }
      },
      "analyzer": {
        "autocomplete":{
          "type":"custom",
          "tokenizer":"standard",
          "filter":["lowercase","autocomplete_filter"]
        }
      }
    }
  }
}

知识点

  1. term filter:根据exact value进行搜索,数字,boolean,date天然支持;text需要建索引时指定type为keyword,才能用term query。
  2. filter过滤执行原理
    倒排索引中发现对应的document list -> 根据document list构建bitset(一个二进制数组,每个元素都是0或1,标识一个document对一个filter条件是否匹配)-> 遍历每个过滤条件对应的bitset,查找满足所有条件的document -> 某个filter的使用次数达到一定次数时,会将其bitset进行cache,下次搜索就不用再扫描倒排索引;但对于小segment(document<1000,或占比<3%)不缓存其bitset -> filter大部分情况下,在query之前执行,尽量过滤掉尽可能多的数据 -> 如果document有新增或修改,cache bitset会自动更新。
  3. 全文检索,使用match query,should;控制精准度:operator,minimum_should_match;match底层会转换为term+should/must。
  4. 多shard场景下,relevance score不准确。默认local shard去计算relevance score。
  5. best fields策略,某个field中匹配到了尽可能多的关键词,被排在前面,最匹配的在前面
  6. most fields策略,尽可能返回更多field匹配到的某个关键词的document,匹配均匀。
  7. cross_fields策略,把多个field当作一个大field来搜索;配合operator=and,可以要求每个term都至少在某一个field中出现。
  8. 用copy_to,将多个字段的值复制到一个字段(隐藏字段),并建立倒排索引
  9. match_phrase,近似匹配,尽量让离得近的搜索词的document优先返回;term position;通过slop决定搜索在field中的距离。
  10. 召回率和精准度。混合使用match和match_phrase实现召回率和精准度。
  11. rescore。match匹配出来很多的doc,但我们一般只需要前面的几个doc,所以可以用match_phrase只对排在前面的若干个doc(由window_size指定)重新打分(rescore)。
  12. 前缀搜索prefix(扫描到了一个前缀匹配的term不能停,必须继续搜索,直到扫描完整个的倒排索引),wildcard通配符搜索,regexp正则搜索;搜索性能都很差,都要搜索整个倒排索引。
  13. 搜索推荐,match_phrase_prefix,将最后一个term作为前缀搜索,max_expansions,指定prefix最多匹配多少个term就足够了,限定性能
  14. ngram,edge ngram(解决前缀搜索):在index time就把每个term的各个前缀切分出来并建入倒排索引,搜索时直接匹配,无需再做前缀扫描。
    在这里插入图片描述
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值