Elastic Stack 传统上由三个主要组件(Elasticsearch、Logstash 和 Kibana)组成,但如今早已不限于这种组合,还可以与名为“Beats”的第四个组件结合使用——Beats 是一个针对不同用例的日志采集器(log shipper)系列。现在网上有一种说法叫做 ELKB,这里的 B 指的就是 Beats。
在集中式日志记录中,数据管道包括三个主要阶段:聚合、处理和存储。在 ELK 堆栈中,前两个阶段传统上由堆栈中的主力组件 Logstash 负责,而执行这些任务需要付出一定的代价。由于 Logstash 设计上的内在问题,性能问题频繁出现,尤其是在复杂的管道需要大量处理的情况下。于是,将 Logstash 的部分职责外包出去的想法应运而生,尤其是将数据提取任务转移到其他工具上。
实战
- 插入测试数据
POST forum_article/_bulk
{"index":{"_id":1}}
{"articleID":"article_1","userID":1,"hidden":false,"postDate":"2017-01-01"}
{"index":{"_id":2}}
{"articleID":"article_2","userID":1,"hidden":true,"postDate":"2017-01-02"}
{"index":{"_id":3}}
{"articleID":"article_3","userID":2,"hidden":false,"postDate":"2017-01-03"}
{"index":{"_id":4}}
{"articleID":"article_4","userID":2,"hidden":true,"postDate":"2017-01-04"}
POST forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"tag":["java","hadoop"]}}
{"update":{"_id":2}}
{"doc":{"tag":["java"]}}
{"update":{"_id":3}}
{"doc":{"tag":["hadoop"]}}
{"update":{"_id":4}}
{"doc":{"tag":["java","elasticsearch"]}}
POST /forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"title":"this is java,hadoop blog"}}
{"update":{"_id":2}}
{"doc":{"title":"this is java blog"}}
{"update":{"_id":3}}
{"doc":{"title":"this is hadoop blog"}}
{"update":{"_id":4}}
{"doc":{"title":"this is java,elasticsearch blog"}}
当字符串字段被动态映射为type=text时,默认会设置两个field:一个是field本身,会分词;另一个是field.keyword,默认不分词,且超过256个字符的值不会被索引到该子字段(ignore_above: 256)。
{
"forum_article" : {
"mappings" : {
"properties" : {
"articleID" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"hidden" : {
"type" : "boolean"
},
"postDate" : {
"type" : "date"
},
"userID" : {
"type" : "long"
}
}
}
}
}
term filter/query,对搜索文本不分词,直接拿去倒排索引中匹配
POST forum_article/_search?scroll=1m
{
"query":{
"constant_score": {
"filter": {
"term": {
"userID": 1
}
},
"boost": 1.2
}
},
"size":1
}
POST forum_article/_search articleID建立倒排索引会分词,所以不分词搜索可以使用articleID.keyword
{
"query":{
"constant_score": {
"filter": {
"term": {
"articleID.keyword": "article_1"
}
},
"boost": 1.2
}
}
}
POST forum_article/_search bool组合多个filter条件
{
"query":{
"constant_score": {
"filter": {
"bool": {
"should":[
{
"term":{
"postDate":"2017-01-01"
}
},
{ "bool":{
"must":[
{
"term":{
"articleID":"article_1"
}
},
{
"term":{
"userID":1
}
}
]
}
}
]
}
},
"boost": 1.2
}
}
}
POST forum_article/_search terms搜索多个值
{
"query": {
"constant_score": {
"filter": {
"bool": {
"must":[
{
"terms":{
"tag":["java","elasticsearch"]
}
},
{
"term":{
"articleID":"article_2"
}
}
]
}
},
"boost": 1.2
}
}
}
POST forum_article/_search range范围搜索,gt,lt,gte,lte
{
"query": {
"constant_score": {
"filter": {
"bool": {
"must":[
{
"terms":{
"tag":["java","elasticsearch"]
}
},
{
"range":{
"postDate":{
"gt":"2017-01-01||-1m"
}
}
}
]
}
},
"boost": 1.2
}
}
}
POST /forum_article/_search 手动控制全文检索精准度
{
"query": {
"match": {
"title": {
"query": "java hadoop",
"minimum_should_match": "50%"
}
}
}
}
POST /forum_article/_search
{
"query": {
"match": {
"title": {
"query": "java hadoop",
"operator": "and"
}
}
}
}
POST /forum_article/_search
{
"query": {
"bool": {
"should": [
{
"match": {
"title": "java"
}
},
{
"match": {
"title": "hadoop"
}
},
{
"match": {
"title": "elasticsearch"
}
},
{
"match": {
"title": "spark"
}
}
],
"minimum_should_match": 2
}
}
}
POST /forum_article/_search dis_max实现best field策略
{
"query": {
"dis_max": {
"tie_breaker": 0.7,
"boost": 1.2,
"queries": [
{
"bool": {
"should": [
{
"match": {
"title": "java elasticsearch"
}
},
{
"match": {
"tag": "java elasticsearch"
}
}
],
"minimum_should_match": 2
}
}
]
}
}
}
POST /forum_article/_search multi_match实现dis_max best fields策略
{
"query": {
"multi_match": {
"query": "java elasticsearch",
"fields": ["title^2","tag"],
"type": "best_fields",
"minimum_should_match": 1
}
}
}
POST forum_article/_mapping english分词器会对document的倒排索引做时态、大小写等转换,搜索结果会受到影响,所以再添加一个使用standard分词器(不进行时态等转换)的子field用于搜索
{
"properties":{
"sub_title":{
"type":"text",
"analyzer":"english",
"fields":{
"std":{
"type":"text",
"analyzer":"standard"
}
}
}
}
}
POST forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"sub_title":"learning more courses"}}
{"update":{"_id":2}}
{"doc":{"sub_title":"learned a lot of course"}}
{"update":{"_id":3}}
{"doc":{"sub_title":"learning course"}}
{"update":{"_id":4}}
{"doc":{"sub_title":"hello world"}}
POST forum_article/_search 使用most_fields策略
{
"query": {
"multi_match": {
"query": "learned course",
"type": "most_fields",
"fields": ["sub_title","sub_title.std"]
}
}
}
POST forum_article/_search cross_fields+and实现要求每个term都在某个field出现
{
"query": {
"multi_match": {
"query": "liang lily",
"type": "cross_fields",
"operator": "and",
"fields": ["author_first_name","author_last_name"]
}
}
}
POST forum_article/_mapping 使用copy_to实现cross fields
{
"properties":{
"author_first_name":{
"type":"text",
"copy_to":"author_full_name"
},
"author_last_name":{
"type":"text",
"copy_to":"author_full_name"
},
"author_full_name":{
"type":"text"
}
}
}
POST forum_article/_bulk
{"update":{"_id":1}}
{"doc":{"author_first_name":"lily","author_last_name":"liang"}}
{"update":{"_id":2}}
{"doc":{"author_first_name":"peter","author_last_name":"ma"}}
{"update":{"_id":3}}
{"doc":{"author_first_name":"robbin","author_last_name":"li"}}
{"update":{"_id":4}}
{"doc":{"author_first_name":"smith","author_last_name":"zhou"}}
POST forum_article/_search match_phrase通过slop决定搜索在field中的距离
{
"query": {
"match_phrase": {
"title": "java blog"
}
}
}
POST forum_article/_search
{
"query": {
"match_phrase": {
"title": {
"query": "java blog",
"slop": 1
}
}
}
}
POST forum_article/_search 混合使用match和match_phrase实现召回率和精准度
{
"query": {
"bool": {
"must": [
{"match": {
"title": "java blog"
}}
],
"should": [
{
"match_phrase": {
"title": {
"query": "java blog"
}
}
}
]
}
}
}
POST forum_article/_search match+rescore+match_phrase
{
"query": {
"bool": {
"must": [
{"match": {
"title": "java blog"
}}
]
}
},
"rescore": {
"query": {
"rescore_query":{
"match_phrase": {
"title": "java blog"
}
}
},
"window_size": 50
}
}
POST forum_article/_search prefix前缀搜索,不计算score
{
"query": {
"bool": {
"must": [
{"prefix": {
"title": "this"
}}
]
}
}
}
POST forum_article/_search 搜索推荐
{
"query": {
"bool": {
"must": [
{"match_phrase_prefix": {
"title": {
"query":"this is ja*",
"max_expansions": 1
}
}}
]
}
}
}
PUT forum_article1/ edge_ngram
{
"settings": {
"analysis": {
"filter": {
"autocomplete_filter":{
"type":"edge_ngram",
"min_gram":3,
"max_gram":20
}
},
"analyzer": {
"autocomplete":{
"type":"custom",
"tokenizer":"standard",
"filter":["lowercase","autocomplete_filter"]
}
}
}
}
}
知识点
- term filter:根据exact value进行搜索,数字,boolean,date天然支持;text需要建索引时指定type为keyword,才能用term query。
- filter过滤执行原理
在倒排索引中找到对应的document list -> 根据document list构建bitset(一个二进制数组,每个元素都是0或1,标识一个document对一个filter条件是否匹配) -> 遍历每个过滤条件对应的bitset,查找满足所有条件的document -> 同一个filter的执行次数达到一定次数时,会将其bitset进行cache,下次搜索就不用再扫描倒排索引;但对于小segment(document<1000,或3%)不缓存其bitset -> 大部分情况下filter在query之前执行,以尽可能多地过滤掉数据 -> 如果document有新增或修改,cache的bitset会自动更新。
- 全文检索,使用match query,should;控制精准度:operator,minimum_should_match;match底层会转换为term+should/must。
- 多shard场景下,relevance score不准确。默认local shard去计算relevance score。
- best fields策略,某个field中匹配到了尽可能多的关键词,被排在前面,最匹配的在前面
- most fields策略,尽可能返回有更多field匹配到关键词的document,匹配更均匀。
- cross_fields策略,把多个field当作一个整体来搜索;配合operator=and,可以要求每个term都至少在某个field中出现。
- 用copy_to,将多个字段的值复制到一个字段(隐藏字段),并建立倒排索引
- match_phrase,近似匹配,搜索词离得越近的document越优先返回;基于term position;通过slop指定搜索词在field中允许的最大移动距离。
- 召回率和精准度。混合使用match和match_phrase实现召回率和精准度。
- rescore。match会匹配出很多doc,但我们一般只需要排在前面的几个doc,所以可以用match_phrase只对前面的若干个doc(window_size)重新打分(rescore)。
- 前缀搜索prefix(扫描到了一个前缀匹配的term不能停,必须继续搜索,直到扫描完整个的倒排索引),wildcard通配符搜索,regexp正则搜索;搜索性能都很差,都要搜索整个倒排索引。
- 搜索推荐,match_phrase_prefix,将最后一个term作为前缀搜索;max_expansions指定prefix最多匹配多少个term就停止,用于限制性能开销。
- ngram,edge ngram(解决前缀搜索):在index time就把每个term按前缀切分成多个gram并建入倒排索引,搜索时直接匹配即可,不必在search time扫描倒排索引做前缀匹配。