文章目录
- 前期准备
- 应用场景
- 1.constant_score查询-不考虑文档频率得分,与搜索关键字命中更多的返回结果
- 2.sort排序-分数相同情况下,按照指定价格域排序
- 3.不考虑文档频率TF/IDF情况下,不同域打分权重不同进行召回
- 4.不考虑文档频率TF/IDF情况下,不同域打分权重不同,再加上指定field的分数,最后最终得分返回,eg:title^3+content^1+time
- 5.不考虑TFIDF得分,同一区域下,不同品牌权重不同
- 6.如何基于地理位置查询,并且类似于自如租房查找周边价格便宜并且距离近的搜索,但是距离不会完全限定死?
- 7.有些场景需要根据配置参数值进行排序,例如在所有手机中xiaomi手机得分最高?
- 8.bm25相似度调优,禁用归一化
- 9.query_string使用:
- 10.黄桃、罐头badcase-命中黄桃和罐头商品排在前面,没有完全命中排在后面解决方案
- 监控
前期准备
索引mappings:
{
"shop_titled_index": {
"mappings": {
"properties": {
"brand": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"price": {
"type": "long"
},
"region": {
"type": "long"
},
"shopId": {
"type": "long"
},
"skuId": {
"type": "long"
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
准备数据:
{
"_index": "shop_titled_index",
"_type": "_doc",
"_id": "dJAM3HYByj_ONITHr0gq",
"_score": 1,
"_source": {
"brand": "iphone",
"price": 8000,
"title": "iphone 12 64G red 5G",
"skuId": 2020122201,
"shopId": 2,
"region": 1001
}
}
{
"_index": "shop_titled_index",
"_type": "_doc",
"_id": "9ZA6inYByj_ONITHT0bH",
"_score": 1,
"_source": {
"brand": "iphone",
"price": 8000,
"title": "iphone 12 64G red 5G",
"skuId": 2020122201,
"shopId": 1,
"region": 1001
}
}
应用场景
1.constant_score查询-不考虑文档频率得分,与搜索关键字命中更多的返回结果
{
"query": {
"bool": {
"should": [
{
"constant_score": {
"filter": {
"match": {
"title": "iphone"
}
},
"boost": 1
}
},
{
"constant_score": {
"filter": {
"match": {
"title": "12"
}
}
}
}
]
}
}
}
2.sort排序-分数相同情况下,按照指定价格域排序
{
"query": {
"bool": {
"should": [
{
"constant_score": {
"filter": {
"match": {
"title": "iphone"
}
},
"boost": 1
}
},
{
"constant_score": {
"filter": {
"match": {
"title": "12"
}
}
}
}
]
}
},
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"price": {
"order": "asc"
}
}
]
}
3.不考虑文档频率TF/IDF情况下,不同域打分权重不同进行召回
{
"query": {
"bool": {
"should": [
{
"constant_score": {
"filter": {
"match": {
"title": "red"
}
},
"boost": 1
}
},
{
"constant_score": {
"filter": {
"match": {
"brand": "iphone"
}
},
"boost":3
}
}
]
}
},
"sort":[
{
"_score":{
"order":"desc"
}
},
{
"price":{
"order":"asc"
}
}
]
}
4.不考虑文档频率TF/IDF情况下,不同域打分权重不同,再加上指定field的分数,最后最终得分返回,eg:title^3+content^1+time
{
"query": {
"function_score": {
"query": {
"bool": {
"should": [
{
"constant_score": {
"filter": {
"match": {
"title": "red"
}
},
"boost": 1
}
},
{
"constant_score": {
"filter": {
"match": {
"brand": "iphone"
}
},
"boost": 3
}
}
]
}
},
"field_value_factor": {
"field": "shopId"
},
"boost_mode": "sum"
}
}
}
5.不考虑TFIDF得分,同一区域下,不同品牌权重不同
文档:https://www.elastic.co/guide/cn/elasticsearch/guide/current/function-score-filters.html
{
"query": {
"function_score": {
"query": {
"term": {
"region":1002
}
},
"boost": "1",
"functions": [
{
"filter": {
"term": {
"brand.keyword": "huawei"
}
},
"weight": 3
},
{
"filter":{
"match":{
"brand":"xiaomi"
}
},
"weight":1
}
],
"score_mode": "sum",
"boost_mode": "sum"
}
}
}
使用注意:以下查询由于function_score没有指定主query,默认会按match_all处理,因此会返回所有文档(未命中任何filter的文档也会被召回)
{
"query": {
"function_score": {
"functions": [
{
"filter": {
"term": {
"brand.keyword": "huawei"
}
},
"weight": 3
},
{
"filter":{
"match":{
"brand":"xiaomi"
}
},
"weight":1
}
],
"score_mode": "sum",
"boost_mode": "sum"
}
}
}
6.如何基于地理位置查询,并且类似于自如租房查找周边价格便宜并且距离近的搜索,但是距离不会完全限定死?
参考文档:https://www.cnblogs.com/xiaoxiaoliu/p/11054405.html
1.新建索引
2.创建mappings
POST geo_index/_mappings
{
"properties": {
"location": {
"type": "geo_point"
},
"price": {
"type": "double"
},
"name": {
"type": "text"
}
}
}
3.准备数据
{
"location":{
"lon":"116.488781",
"lat":"39.950565"
},
"price":"4000",
"name":"朝阳公园 两室一厅 12m"
}
{
"location":{
"lon":"116.327805",
"lat":"39.900988"
},
"price":"2400",
"name":"北京西站 三室一厅 9m"
}
{
"location": {
"lon": "116.403981",
"lat": "39.916485"
},
"price": "88888",
"name": "故宫 无价之宝"
}
{
"location": {
"lon": "116.341316",
"lat": "39.948795"
},
"price": "3700",
"name": "北京动物园 三室一厅 19m"
}
4.geo_distance:找出附近两公里以内数据
GET geo_index/_search
{
"query": {
"constant_score": {
"filter": {
"geo_distance": {
"distance": "2km",
"location": {
"lat": 39.93869837,
"lon": 116.48357391
}
}
},
"boost": 1.2
}
}
}
输出
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.2,
"hits": [
{
"_index": "geo_index",
"_type": "_doc",
"_id": "1JC14HYByj_ONITHikiw",
"_score": 1.2,
"_source": {
"location": {
"lon": "116.488781",
"lat": "39.950565"
},
"price": "4000",
"name": "朝阳公园 两室一厅 12m"
}
}
]
}
}
5.找出数据,并按照距离排序
文档:https://www.elastic.co/guide/cn/elasticsearch/guide/current/sorting-by-distance.html
{
"query": {
"constant_score": {
"filter": {
"geo_distance": {
"distance": "10km",
"location": {
"lat": 39.93869837,
"lon": 116.48357391
}
}
},
"boost": 1.2
}
},
"sort": {
"_geo_distance": {
"location": [
{
"lat": 39.93869837,
"lon": 116.48357391
}
],
"unit": "km",
"distance_type": "arc",
"order": "asc"
}
}
}
6.根据附近租房和价格查找数据
我更偏向距离更近,因此将权重调高
参考:https://www.elastic.co/guide/cn/elasticsearch/guide/current/decay-functions.html#CO119-4
{
"query": {
"function_score": {
"query": {
"range":{
"price":{
"gte":2000,
"lte":5000
}
}
},
"functions": [
{
"gauss": {
"location": {
"origin": {
"lon": "116.47464752",
"lat": "39.94606859"
},
"offset": "100m",
"scale": "1000m"
}
},
"weight":2.0
},
{
"gauss": {
"price": {
"origin": 3000,
"offset": 100,
"scale":500
}
}
}
],
"score_mode": "sum",
"boost_mode": "replace"
}
}
}
结果:
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4,
"relation": "eq"
},
"max_score": 0.7460326,
"hits": [
{
"_index": "geo_index",
"_type": "_doc",
"_id": "95A14XYByj_ONITHg0if",
"_score": 0.7460326,
"_source": {
"location": {
"lon": "116.47155762",
"lat": "39.9523853"
},
"price": "3500",
"name": "亮马桥 两室一厅 12m"
}
},
{
"_index": "geo_index",
"_type": "_doc",
"_id": "1JC14HYByj_ONITHikiw",
"_score": 0.36586136,
"_source": {
"location": {
"lon": "116.488781",
"lat": "39.950565"
},
"price": "4000",
"name": "朝阳公园 两室一厅 12m"
}
},
{
"_index": "geo_index",
"_type": "_doc",
"_id": "1ZC34HYByj_ONITHRkht",
"_score": 5.823735e-39,
"_source": {
"location": {
"lon": "116.341316",
"lat": "39.948795"
},
"price": "3700",
"name": "北京动物园 三室一厅 19m"
}
},
{
"_index": "geo_index",
"_type": "_doc",
"_id": "1pC44HYByj_ONITHAkgJ",
"_score": 0,
"_source": {
"location": {
"lon": "116.327805",
"lat": "39.900988"
},
"price": "2400",
"name": "北京西站 三室一厅 9m"
}
}
]
}
}
7.有些场景需要根据配置参数值进行排序,例如在所有手机中xiaomi手机得分最高?
function_score结合script_score排序
{
"query": {
"function_score": {
"query": {
"match_all":{}
},
"functions": [
{
"script_score": {
"script": {
"lang": "painless",
"params": {
"brand": "xiaomi"
},
"source": "if(doc['brand.keyword'].size() == 0)return 0f; String brandStr = doc['brand.keyword'].value ?: new String();if(params.brand.compareTo(brandStr) == 0){return 1f}return 0"
}
}
}
],
"score_mode":"sum",
"boost_mode":"replace"
}
}
}
score_mode定义的是如何将各个function的分值合并成一个综合的分值; boost_mode则定义如何将这个综合的分值作用在原始query产生的分值上
8.bm25相似度调优,禁用归一化
BM25:bm25提供两个调参因子
k1:k1 这个参数控制着词频结果在词频饱和度中的上升速度。默认值为 1.2 。值越小饱和度变化越快,值越大饱和度变化越慢。词频饱和度可以参看下面官方文档的截图,图中反映了词频对应的得分曲线,k1 控制 tf of BM25 这条曲线。
b:这个参数控制着字段长归一值所起的作用, 0.0 会禁用归一化, 1.0 会启用完全归一化。默认值为 0.75
- mapping设置
{
"settings": {
"index": {
"number_of_shards": "1",
"provided_name": "my_sim_index",
"similarity": {
"cbm25": {
"type": "BM25",
"b": "0"
}
},
"creation_date": "1610181315498",
"number_of_replicas": "1",
"uuid": "V8NhMRofQRu-oPFt6hheWA",
"version": {
"created": "7070099"
}
}
},
"mappings": {
"_doc": {
"properties": {
"body": {
"similarity": "BM25",
"type": "text"
},
"title": {
"similarity": "cbm25",
"type": "text"
}
}
}
}
}
- 数据准备
{
"title": "Elasticsearch allows you to configure a scoring algorithm or similarity per field. The similarity setting provides a simple way of choosing a similarity algorithm other than the default BM25, such as TF/IDF.",
"body": "Elasticsearch allows you to configure a scoring algorithm or similarity per field. The similarity setting provides a simple way of choosing a similarity algorithm other than the default BM25, such as TF/IDF."
}
{
"title": "A simple boolean similarity, which is used when full-text ranking is not needed and the score should only be based on whether the query terms match or not. Boolean similarity gives terms a score equal to their query boost.",
"body": "A simple boolean similarity, which is used when full-text ranking is not needed and the score should only be based on whether the query terms match or not. Boolean similarity gives terms a score equal to their query boost."
}
{
"title": "or similarity per field. The similarity setting provides a simple way of choosing a similarity",
"body": "or similarity per field. The similarity setting provides a simple way of choosing a similarity"
}
- 搜索
title字段使用自定义的cbm25相似度(b=0),忽略文档长度归一化,搜索得分与文档长度无关
GET my_sim_index/_search
{
"query":{
"match":{
"title":"similarity"
}
}
}
输出:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 0.20983505,
"hits": [
{
"_index": "my_sim_index",
"_type": "_doc",
"_id": "nZBO5nYByj_ONITHhknJ",
"_score": 0.20983505,
"_source": {
"title": "Elasticsearch allows you to configure a scoring algorithm or similarity per field. The similarity setting provides a simple way of choosing a similarity algorithm other than the default BM25, such as TF/IDF.",
"body": "Elasticsearch allows you to configure a scoring algorithm or similarity per field. The similarity setting provides a simple way of choosing a similarity algorithm other than the default BM25, such as TF/IDF."
}
},
{
"_index": "my_sim_index",
"_type": "_doc",
"_id": "oZBW5nYByj_ONITHkEli",
"_score": 0.20983505,
"_source": {
"title": "or similarity per field. The similarity setting provides a simple way of choosing a similarity",
"body": "or similarity per field. The similarity setting provides a simple way of choosing a similarity"
}
},
{
"_index": "my_sim_index",
"_type": "_doc",
"_id": "npBP5nYByj_ONITHK0mo",
"_score": 0.18360566,
"_source": {
"title": "A simple boolean similarity, which is used when full-text ranking is not needed and the score should only be based on whether the query terms match or not. Boolean similarity gives terms a score equal to their query boost.",
"body": "A simple boolean similarity, which is used when full-text ranking is not needed and the score should only be based on whether the query terms match or not. Boolean similarity gives terms a score equal to their query boost."
}
}
]
}
}
前两条结果得分相同(均为0.20983505),尽管它们的文档长度不一样
利用body搜索:
GET my_sim_index/_search
{
"query":{
"match":{
"body":"similarity"
}
}
}
可以看出,body字段使用默认BM25,即使命中similarity的次数相同,得分也会受到文档长度影响
9.query_string使用:
{
"query":{
"query_string":{
"query":"(title:red)^1.0 AND (brand:iphone)"
}
}
}
10.黄桃、罐头badcase-命中黄桃和罐头商品排在前面,没有完全命中排在后面解决方案
方案一:利用constant_score
添加一个忽略TFIDF得分并且自定义得分的查询过滤器用来给完全命中的商品排在前面
"should": [
{
"constant_score": {
"filter": {
"query_string": {
"query": "allWord:(+(黄桃) AND +(罐头))"
}
},
"boost": 500
}
}
]
方案二
在原function_score查询语句下的functions里面添加过滤器并添加权重
"function_score" : {
"query" : {
"bool" : {
"must" : [
{
"query_string" : {
"query" : "(title:(+(黄桃 罐头))^2.4 OR catBrand:(+(黄桃 罐头))^0.6 OR facet:(+(黄桃 罐头))^0.6 OR allWord:(+(黄桃 罐头))^0.0)",
"fields" : [ ],
"use_dis_max" : true,
"tie_breaker" : 0.0,
"default_operator" : "or",
"auto_generate_phrase_queries" : false,
"max_determinized_states" : 10000,
"enable_position_increments" : true,
"fuzziness" : "AUTO",
"fuzzy_prefix_length" : 0,
"fuzzy_max_expansions" : 50,
"phrase_slop" : 0,
"escape" : false,
"split_on_whitespace" : true,
"boost" : 1.0
}
}
],
"filter" : [
{
"term" : {
"skuDocType" : {
"value" : 1,
"boost" : 1.0
}
}
},
{
"bool" : {
"must_not" : [
{
"term" : {
"spMask" : {
"value" : 1,
"boost" : 1.0
}
}
}
],
"disable_coord" : false,
"adjust_pure_negative" : true,
"boost" : 1.0
}
}
],
"disable_coord" : false,
"adjust_pure_negative" : true,
"boost" : 1.0
}
},
"functions" : [
{
"filter": {
"query_string": {
"query":"allWord:(黄桃 AND 罐头)"
}
},
"weight":400
},
{
"filter" : {
"match_all" : {
"boost" : 1.0
}
},
"script_score" : {
"script" : {
"id" : "osop_score_script",
"lang" : "painless",
"params" : {
"catSearch" : false,
"fakeCat" : "cat16035591",
"weight" : true,
"topSku" : {
"pop8013634719" : 300.0,
"1130765898" : 300.0
},
"hotCatIds" : {
"cat16035591" : 0.9666818804198996
}
}
}
}
}
],
"score_mode" : "sum",
"boost_mode" : "sum",
"max_boost" : 3.4028235E38,
"boost" : 1.0
}
监控
_stats索引监控
Elasticsearch Index Monitoring(索引监控)之Index Stats API详解
请求方式:
GET 索引名/_stats
参数解释(注意:下面的示例输出实际来自节点级统计接口 GET _nodes/stats,其中 indices 部分的字段与索引级 _stats 返回的字段含义相近):
1 {
2 "_nodes": {
3 "total": 1,
4 "successful": 1,
5 "failed": 0
6 },
7 "cluster_name": "ELKTEST",
8 "nodes": {
9 "lnlHC8yERCKXCuAc_2DPCQ": {
10 "timestamp": 1534242595995,
11 "name": "OPS01-ES01",
12 "transport_address": "10.9.125.148:9300",
13 "host": "10.9.125.148",
14 "ip": "10.9.125.148:9300",
15 "roles": [
16 "master",
17 "data",
18 "ingest"
19 ],
20 "attributes": {
21 "ml.machine_memory": "8203104256",
22 "xpack.installed": "true",
23 "ml.max_open_jobs": "20",
24 "ml.enabled": "true"
25 },
26 "indices": {
27 "docs": {
28 "count": 8111612, # 显示节点上有多少文档
29 "deleted": 16604 # 有多少已删除的文档还未从数据段中删除
30 },
31 "store": {
32 "size_in_bytes": 2959876263 # 显示该节点消耗了多少物理存储
33 },
34 "indexing": { #表示索引文档的次数,这个是通过一个计数器累加计数的。当文档被删除时,它不会减少。注意这个值永远是递增的,发生在内部索引数据的时候,包括那些更新操作
35 "index_total": 17703152,
36 "index_time_in_millis": 2801934,
37 "index_current": 0,
38 "index_failed": 0,
39 "delete_total": 46242,
40 "delete_time_in_millis": 2130,
41 "delete_current": 0,
42 "noop_update_total": 0,
43 "is_throttled": false,
44 "throttle_time_in_millis": 0 # 这个值高的时候,说明磁盘流量设置太低
45 },
46 "get": {
47 "total": 185179,
48 "time_in_millis": 22341,
49 "exists_total": 185178,
50 "exists_time_in_millis": 22337,
51 "missing_total": 1,
52 "missing_time_in_millis": 4,
53 "current": 0
54 },
55 "search": {
56 "open_contexts": 0, # 当前打开的搜索上下文(search context)数量,即正在进行中的搜索数
57 "query_total": 495447, # 查询总数
58 "query_time_in_millis": 298344, # 节点启动到此查询消耗总时间, query_time_in_millis / query_total的比值可以作为你的查询效率的粗略指标。比值越大,每个查询用的时间越多,你就需要考虑调整或者优化。
59 "query_current": 0, # 后面关于fetch的统计,描述的是查询的第二个阶段(也就是query_then_fetch里的fetch)。fetch花的时间比query多得越多,表示你的磁盘很慢,或者你要fetch的文档太多,或者你的查询分页参数太大(例如size等于1万)
60 "fetch_total": 130194,
61 "fetch_time_in_millis": 51211,
62 "fetch_current": 0,
63 "scroll_total": 22,
64 "scroll_time_in_millis": 2196665,
65 "scroll_current": 0,
66 "suggest_total": 0,
67 "suggest_time_in_millis": 0,
68 "suggest_current": 0
69 },
70 "merges": { # 包含lucene段合并的信息,它会告诉你有多少段合并正在进行,参与的文档数,这些正在合并的段的总大小,以及花在merge上的总时间。 如果你的集群写入比较多,这个merge的统计信息就很重要。merge操作会消耗大量的磁盘io和cpu资源。如果你的索引写入很多,你会看到大量的merge操作
71 "current": 0,
72 "current_docs": 0,
73 "current_size_in_bytes": 0,
..