elasticsearch高级查询进阶

最新推荐文章于 2024-07-11 17:34:34 发布

成长的小牛233

最新推荐文章于 2024-07-11 17:34:34 发布

阅读量761

点赞数 2

分类专栏： # 搜索 ElasticSearch

本文链接：https://blog.youkuaiyun.com/dreamzuora/article/details/112606325

版权

搜索同时被 2 个专栏收录

32 篇文章

订阅专栏

ElasticSearch

6 篇文章

订阅专栏

文章目录

前期准备
应用场景
监控
- _stats索引监控

前期准备

索引mappings：

{
  "shop_titled_index": {
    "mappings": {
      "properties": {
        "brand": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "price": {
          "type": "long"
        },
        "region": {
          "type": "long"
        },
        "shopId": {
          "type": "long"
        },
        "skuId": {
          "type": "long"
        },
        "title": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        }
      }
    }
  }
}

准备数据：

      {
        "_index": "shop_titled_index",
        "_type": "_doc",
        "_id": "dJAM3HYByj_ONITHr0gq",
        "_score": 1,
        "_source": {
          "brand": "iphone",
          "price": 8000,
          "title": "iphone 12 64G red 5G",
          "skuId": 2020122201,
          "shopId": 2,
          "region": 1001
        }
      }

      {
        "_index": "shop_titled_index",
        "_type": "_doc",
        "_id": "9ZA6inYByj_ONITHT0bH",
        "_score": 1,
        "_source": {
          "brand": "iphone",
          "price": 8000,
          "title": "iphone 12 64G red 5G",
          "skuId": 2020122201,
          "shopId": 1,
          "region": 1001
        }
      }

应用场景

1.constant_score查询——不考虑TF/IDF文档频率得分，命中搜索关键字越多的文档排名越靠前

{
  "query": {
    "bool": {
      "should": [
        {
          "constant_score": {
            "filter": {
              "match": {
                "title": "iphone"
              }
            },
            "boost": 1
          }
        },
        {
          "constant_score": {
            "filter": {
              "match": {
                "title": "12"
              }
            }
          }
        }
      ]
    }
  }
}

2.sort排序-分数相同情况下，按照指定价格域排序

{
  "query": {
    "bool": {
      "should": [
        {
          "constant_score": {
            "filter": {
              "match": {
                "title": "iphone"
              }
            },
            "boost": 1
          }
        },
        {
          "constant_score": {
            "filter": {
              "match": {
                "title": "12"
              }
            }
          }
        }
      ]
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    },
    {
      "price": {
        "order": "asc"
      }
    }
  ]
}

3.不考虑文档频率TF/IDF情况下，不同域打分权重不同进行召回

{
  "query": {
    "bool": {
      "should": [
        {
          "constant_score": {
            "filter": {
              "match": {
                "title": "red"
              }
            },
            "boost": 1
          }
        },
        {
          "constant_score": {
            "filter": {
              "match": {
                "brand": "iphone"
              }
            },
            "boost":3
          }
        }
      ]
    }
  },
  "sort":[
      {
          "_score":{
              "order":"desc"
          },
          "price":{
              "order":"asc"
          }
      }
      ]
}

4.不考虑文档频率TF/IDF情况下，不同域打分权重不同，再加上指定field的分数，得到最终得分并返回，eg：title^3+content^1+time

{
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "should": [
            {
              "constant_score": {
                "filter": {
                  "match": {
                    "title": "red"
                  }
                },
                "boost": 1
              }
            },
            {
              "constant_score": {
                "filter": {
                  "match": {
                    "brand": "iphone"
                  }
                },
                "boost": 3
              }
            }
          ]
        }
      },
      "field_value_factor": {
        "field": "shopId"
      },
      "boost_mode": "sum"
    }
  }
}

5.不考虑TFIDF得分，同一区域下，不同品牌权重不同

文档：https://www.elastic.co/guide/cn/elasticsearch/guide/current/function-score-filters.html

{
  "query": {
    "function_score": {
      "query": {
        "term": {
            "region":1002
        }
      },
      "boost": "1",
      "functions": [
        {
          "filter": {
            "term": {
              "brand.keyword": "huawei"
            }
          },
          "weight": 3
        },
        {
            "filter":{
                "match":{
                    "brand":"xiaomi"
                }
            },
            "weight":1
        }
      ],
      "score_mode": "sum",
      "boost_mode": "sum"
    }
  }
}

使用注意：以下查询由于function_score没有主query，会返回所有文档

{
  "query": {
    "function_score": {
      "functions": [
        {
          "filter": {
            "term": {
              "brand.keyword": "huawei"
            }
          },
          "weight": 3
        },
        {
            "filter":{
                "match":{
                    "brand":"xiaomi"
                }
            },
            "weight":1
        }
      ],
      "score_mode": "sum",
      "boost_mode": "sum"
    }
  }
}

6.如何基于地理位置查询，并且类似于自如租房查找周边价格便宜并且距离近的搜索，但是距离不会完全限定死？

参考文档：https://www.cnblogs.com/xiaoxiaoliu/p/11054405.html

1.新建索引
2.创建mappings

post geo_index/_mappings
{
  "properties": {
    "location": {
      "type": "geo_point"
    },
    "price": {
      "type": "double"
    },
    "name": {
      "type": "text"
    }
  }
}

3.准备数据

{
    "location":{
        "lon":"116.488781",
        "lat":"39.950565"
    },
    "price":"4000",
    "name":"朝阳公园 两室一厅 12m"
}

{
    "location":{
        "lon":"116.327805",
        "lat":"39.900988"
    },
    "price":"2400",
    "name":"北京西站 三室一厅 9m"
}

{
    "location": {
        "lon": "116.403981",
        "lat": "39.916485"
    },
    "price": "88888",
    "name": "故宫 无价之宝"
}

{
    "location": {
        "lon": "116.341316",
        "lat": "39.948795"
    },
    "price": "3700",
    "name": "北京动物园 三室一厅 19m"
}

4.geo_distance：找出附近两公里以内数据

GET geo_index/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "geo_distance": {
          "distance": "2km",
          "location": {
            "lat": 39.93869837,
            "lon": 116.48357391
          }
        }
      },
      "boost": 1.2
    }
  }
}

输出

{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 1,
      "relation": "eq"
    },
    "max_score": 1.2,
    "hits": [
      {
        "_index": "geo_index",
        "_type": "_doc",
        "_id": "1JC14HYByj_ONITHikiw",
        "_score": 1.2,
        "_source": {
          "location": {
            "lon": "116.488781",
            "lat": "39.950565"
          },
          "price": "4000",
          "name": "朝阳公园 两室一厅 12m"
        }
      }
    ]
  }
}

5.找出数据，并按照距离排序

文档：https://www.elastic.co/guide/cn/elasticsearch/guide/current/sorting-by-distance.html

{
  "query": {
    "constant_score": {
      "filter": {
        "geo_distance": {
          "distance": "10km",
          "location": {
            "lat": 39.93869837,
            "lon": 116.48357391
          }
        }
      },
      "boost": 1.2
    }
  },
  "sort": {
    "_geo_distance": {
      "location": [
        {
          "lat": 39.93869837,
          "lon": 116.48357391
        }
      ],
      "unit": "km",
      "distance_type": "arc",
      "order": "asc"
    }
  }
}

6.根据附近租房和价格查找数据

我更偏向距离更近的房源，因此将距离（location）衰减函数的权重调高（weight为2.0）
参考：https://www.elastic.co/guide/cn/elasticsearch/guide/current/decay-functions.html#CO119-4

{
  "query": {
    "function_score": {
      "query": {
          "range":{
              "price":{
                  "gte":2000,
                  "lte":5000
              }
          }
      },
      "functions": [
        {
          "gauss": {
            "location": {
              "origin": {
                "lon": "116.47464752",
                "lat": "39.94606859"
              },
              "offset": "100m",
              "scale": "1000m"
            }
          },
          "weight":2.0
        },
        {
          "gauss": {
            "price": {
              "origin": 3000,
              "offset": 100,
              "scale":500
            }
          }
        }
      ],
      "score_mode": "sum",
      "boost_mode": "replace"
    }
  }
}

结果：

{
  "took": 5,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 4,
      "relation": "eq"
    },
    "max_score": 0.7460326,
    "hits": [
      {
        "_index": "geo_index",
        "_type": "_doc",
        "_id": "95A14XYByj_ONITHg0if",
        "_score": 0.7460326,
        "_source": {
          "location": {
            "lon": "116.47155762",
            "lat": "39.9523853"
          },
          "price": "3500",
          "name": "亮马桥 两室一厅 12m"
        }
      },
      {
        "_index": "geo_index",
        "_type": "_doc",
        "_id": "1JC14HYByj_ONITHikiw",
        "_score": 0.36586136,
        "_source": {
          "location": {
            "lon": "116.488781",
            "lat": "39.950565"
          },
          "price": "4000",
          "name": "朝阳公园 两室一厅 12m"
        }
      },
      {
        "_index": "geo_index",
        "_type": "_doc",
        "_id": "1ZC34HYByj_ONITHRkht",
        "_score": 5.823735e-39,
        "_source": {
          "location": {
            "lon": "116.341316",
            "lat": "39.948795"
          },
          "price": "3700",
          "name": "北京动物园 三室一厅 19m"
        }
      },
      {
        "_index": "geo_index",
        "_type": "_doc",
        "_id": "1pC44HYByj_ONITHAkgJ",
        "_score": 0,
        "_source": {
          "location": {
            "lon": "116.327805",
            "lat": "39.900988"
          },
          "price": "2400",
          "name": "北京西站 三室一厅 9m"
        }
      }
    ]
  }
}

7.有些场景需要根据配置参数值进行排序，例如在所有手机中xiaomi手机得分最高？

function_score结合script_score排序

{
  "query": {
    "function_score": {
      "query": {
          "match_all":{}
      },
      "functions": [
        {
          "script_score": {
            "script": {
              "lang": "painless",
              "params": {
                "brand": "xiaomi"
              },
              "source": "if(doc['brand.keyword'].size() == 0)return 0f; String brandStr = doc['brand.keyword'].value ?: new String();if(params.brand.compareTo(brandStr) == 0){return 1f}return 0"
            }
          }
        }
      ],
      "score_mode":"sum",
      "boost_mode":"replace"
    }
  }
}

score_mode定义的是如何将各个function的分值合并成一个综合的分值； boost_mode则定义如何将这个综合的分值作用在原始query产生的分值上

8.bm25相似度调优，禁用归一化

BM25:bm25提供两个调参因子
k1:这个参数控制着词频结果在词频饱和度中的上升速度。默认值为 1.2 。值越小饱和度变化越快，值越大饱和度变化越慢。词频饱和度可以参看下面官方文档的截图，图中反映了词频对应的得分曲线，k1 控制 tf of BM25 这条曲线。

b:这个参数控制着字段长归一值所起的作用， 0.0 会禁用归一化， 1.0 会启用完全归一化。默认值为 0.75

mapping设置

{
  "settings": {
    "index": {
      "number_of_shards": "1",
      "provided_name": "my_sim_index",
      "similarity": {
        "cbm25": {
          "type": "BM25",
          "b": "0"
        }
      },
      "creation_date": "1610181315498",
      "number_of_replicas": "1",
      "uuid": "V8NhMRofQRu-oPFt6hheWA",
      "version": {
        "created": "7070099"
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "body": {
          "similarity": "BM25",
          "type": "text"
        },
        "title": {
          "similarity": "cbm25",
          "type": "text"
        }
      }
    }
  }
}

数据准备

{
  "title": "Elasticsearch allows you to configure a scoring algorithm or similarity per field. The similarity setting provides a simple way of choosing a similarity algorithm other than the default BM25, such as TF/IDF.",
  "body": "Elasticsearch allows you to configure a scoring algorithm or similarity per field. The similarity setting provides a simple way of choosing a similarity algorithm other than the default BM25, such as TF/IDF."
}

{
  "title": "A simple boolean similarity, which is used when full-text ranking is not needed and the score should only be based on whether the query terms match or not. Boolean similarity gives terms a score equal to their query boost.",
  "body": "A simple boolean similarity, which is used when full-text ranking is not needed and the score should only be based on whether the query terms match or not. Boolean similarity gives terms a score equal to their query boost."
}

{
  "title": "or similarity per field. The similarity setting provides a simple way of choosing a similarity",
  "body": "or similarity per field. The similarity setting provides a simple way of choosing a similarity"
}

搜索
title字段使用自定义的cbm25相似度（b=0），忽略文档长度归一化，搜索得分与文档长度无关

GET my_sim_index/_search
{
    "query":{
        "match":{
            "title":"similarity"
        }
    }
}

输出：

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 3,
      "relation": "eq"
    },
    "max_score": 0.20983505,
    "hits": [
      {
        "_index": "my_sim_index",
        "_type": "_doc",
        "_id": "nZBO5nYByj_ONITHhknJ",
        "_score": 0.20983505,
        "_source": {
          "title": "Elasticsearch allows you to configure a scoring algorithm or similarity per field. The similarity setting provides a simple way of choosing a similarity algorithm other than the default BM25, such as TF/IDF.",
          "body": "Elasticsearch allows you to configure a scoring algorithm or similarity per field. The similarity setting provides a simple way of choosing a similarity algorithm other than the default BM25, such as TF/IDF."
        }
      },
      {
        "_index": "my_sim_index",
        "_type": "_doc",
        "_id": "oZBW5nYByj_ONITHkEli",
        "_score": 0.20983505,
        "_source": {
          "title": "or similarity per field. The similarity setting provides a simple way of choosing a similarity",
          "body": "or similarity per field. The similarity setting provides a simple way of choosing a similarity"
        }
      },
      {
        "_index": "my_sim_index",
        "_type": "_doc",
        "_id": "npBP5nYByj_ONITHK0mo",
        "_score": 0.18360566,
        "_source": {
          "title": "A simple boolean similarity, which is used when full-text ranking is not needed and the score should only be based on whether the query terms match or not. Boolean similarity gives terms a score equal to their query boost.",
          "body": "A simple boolean similarity, which is used when full-text ranking is not needed and the score should only be based on whether the query terms match or not. Boolean similarity gives terms a score equal to their query boost."
        }
      }
    ]
  }
}

0.20983505得分相同，尽管文档长度不一样

利用body搜索：

GET my_sim_index/_search
{
    "query":{
        "match":{
            "body":"similarity"
        }
    }
}

可以看出，虽然命中similarity的次数相同，但是得分会受到文档长度影响

9.query_string使用：

{
    "query":{
        "query_string":{
            "query":"(title:red)^1.0 AND (brand:iphone)"
        }
    }
}

10.“黄桃罐头”badcase的解决方案——同时命中“黄桃”和“罐头”的商品排在前面，没有完全命中的排在后面

方案一：利用constant_score
添加一个忽略TFIDF得分并且自定义得分的查询过滤器，用来让完全命中的商品排在前面

       "should": [
        {
          "constant_score": {
            "filter": {
              "query_string": {
                "query": "allWord:(+(黄桃) AND +(罐头))"
              }
            },
            "boost": 500
          }
        }
      ]

方案二
在原function_score查询语句下的functions里面添加过滤器并添加权重

          "function_score" : {
            "query" : {
              "bool" : {
                "must" : [
                  {
                    "query_string" : {
                      "query" : "(title:(+(黄桃 罐头))^2.4 OR catBrand:(+(黄桃 罐头))^0.6 OR facet:(+(黄桃 罐头))^0.6 OR allWord:(+(黄桃 罐头))^0.0)",
                      "fields" : [ ],
                      "use_dis_max" : true,
                      "tie_breaker" : 0.0,
                      "default_operator" : "or",
                      "auto_generate_phrase_queries" : false,
                      "max_determinized_states" : 10000,
                      "enable_position_increments" : true,
                      "fuzziness" : "AUTO",
                      "fuzzy_prefix_length" : 0,
                      "fuzzy_max_expansions" : 50,
                      "phrase_slop" : 0,
                      "escape" : false,
                      "split_on_whitespace" : true,
                      "boost" : 1.0
                    }
                  }
                ],
                "filter" : [
                  {
                    "term" : {
                      "skuDocType" : {
                        "value" : 1,
                        "boost" : 1.0
                      }
                    }
                  },
                  {
                    "bool" : {
                      "must_not" : [
                        {
                          "term" : {
                            "spMask" : {
                              "value" : 1,
                              "boost" : 1.0
                            }
                          }
                        }
                      ],
                      "disable_coord" : false,
                      "adjust_pure_negative" : true,
                      "boost" : 1.0
                    }
                  }
                ],
                "disable_coord" : false,
                "adjust_pure_negative" : true,
                "boost" : 1.0
              }
            },
            "functions" : [
              {
                "filter": {
                  "query_string": {
                    "query":"allWord:(黄桃 AND 罐头)"
                  }
                },
                "weight":400
              },
              {
                "filter" : {
                  "match_all" : {
                    "boost" : 1.0
                  }
                },
                "script_score" : {
                  "script" : {
                    "id" : "osop_score_script",
                    "lang" : "painless",
                    "params" : {
                      "catSearch" : false,
                      "fakeCat" : "cat16035591",
                      "weight" : true,
                      "topSku" : {
                        "pop8013634719" : 300.0,
                        "1130765898" : 300.0
                      },
                      "hotCatIds" : {
                        "cat16035591" : 0.9666818804198996
                      }
                    }
                  }
                }
              }
            ],
            "score_mode" : "sum",
            "boost_mode" : "sum",
            "max_boost" : 3.4028235E38,
            "boost" : 1.0
          }

监控

_stats索引监控

Elasticsearch Index Monitoring(索引监控)之Index Stats API详解
请求方式：

GET 索引名/_stats

参数解释：

  1 {
  2   "_nodes": {
  3     "total": 1,
  4     "successful": 1,
  5     "failed": 0
  6   },
  7   "cluster_name": "ELKTEST",
  8   "nodes": {
  9     "lnlHC8yERCKXCuAc_2DPCQ": {
 10       "timestamp": 1534242595995,
 11       "name": "OPS01-ES01",
 12       "transport_address": "10.9.125.148:9300",
 13       "host": "10.9.125.148",
 14       "ip": "10.9.125.148:9300",
 15       "roles": [
 16         "master",
 17         "data",
 18         "ingest"
 19       ],
 20       "attributes": {
 21         "ml.machine_memory": "8203104256",
 22         "xpack.installed": "true",
 23         "ml.max_open_jobs": "20",
 24         "ml.enabled": "true"
 25       },
 26       "indices": {
 27         "docs": {
 28           "count": 8111612,   # 显示节点上有多少文档
 29           "deleted": 16604    # 有多少已删除的文档还未从数据段中删除
 30         },
 31         "store": {
 32           "size_in_bytes": 2959876263  # 显示该节点消耗了多少物理存储
 33         },
 34         "indexing": {       #表示索引文档的次数，这个是通过一个计数器累加计数的。当文档被删除时，它不会减少。注意这个值永远是递增的，发生在内部索引数据的时候，包括那些更新操作
 35           "index_total": 17703152,
 36           "index_time_in_millis": 2801934,
 37           "index_current": 0,
 38           "index_failed": 0,
 39           "delete_total": 46242,
 40           "delete_time_in_millis": 2130,
 41           "delete_current": 0,
 42           "noop_update_total": 0,
 43           "is_throttled": false,
 44           "throttle_time_in_millis": 0    # 这个值高的时候，说明磁盘流量设置太低
 45         },
 46         "get": {
 47           "total": 185179,
 48           "time_in_millis": 22341,
 49           "exists_total": 185178,
 50           "exists_time_in_millis": 22337,
 51           "missing_total": 1,
 52           "missing_time_in_millis": 4,
 53           "current": 0
 54         },
 55         "search": {   
 56           "open_contexts": 0,   # 主动检索的次数，
 57           "query_total": 495447,    # 查询总数
 58           "query_time_in_millis": 298344,   # 节点启动到此查询消耗总时间，  query_time_in_millis / query_total的比值可以作为你的查询效率的粗略指标。比值越大，每个查询用的时间越多，你就需要考虑调整或者优化。
 59           "query_current": 0,    # 后面关于fetch的统计，描述了查询的第二个过程（也就是query_then_fetch里的fetch）。fetch花的时间比query的越多，表示你的磁盘很慢，或者你要fetch的文档太多，或者你的查询参数分页条件太大（例如size等于1万）
 60           "fetch_total": 130194,
 61           "fetch_time_in_millis": 51211,
 62           "fetch_current": 0,
 63           "scroll_total": 22,
 64           "scroll_time_in_millis": 2196665,
 65           "scroll_current": 0,
 66           "suggest_total": 0,
 67           "suggest_time_in_millis": 0,
 68           "suggest_current": 0
 69         },
 70         "merges": { # 包含lucene段合并的信息，它会告诉你有多少段合并正在进行，参与的文档数，这些正在合并的段的总大小，以及花在merge上的总时间。如果你的集群写入比较多，这个merge的统计信息就很重要。merge操作会消耗大量的磁盘io和cpu资源。如果你的索引写入很多，你会看到大量的merge操作
 71           "current": 0,
 72           "current_docs": 0,
 73           "current_size_in_bytes": 0,
..