[Elasticsearch Notes] Analysis - Analyzer

This article covers the Elasticsearch Analyzer in detail: its role in both indexing and search, the components of an analyzer (character filters, tokenizer, and token filters), and the usage and characteristics of the custom, standard, simple, whitespace, stop, keyword, pattern, and fingerprint analyzers.

note

  • The analyzers used at index time and at search time should be kept consistent.
  • An analyzer is made up of: character filters, a tokenizer, and token filters.
  • Character filters: transform or filter characters before tokenization (e.g. strip HTML, map characters).
  • An analyzer has 0 or more character filters, applied in order.
  • Tokenizer: splits the input into individual terms (tokens) and also records each term's position information.
  • An analyzer must have exactly one tokenizer.
  • Token filters: add, remove, or modify tokens. The lowercase token filter lowercases terms; the stop token filter removes stop words; the synonym token filter adds synonyms. Token filters do not change the character offsets and similar information of the tokens.
  • An analyzer has 0 or more token filters.
  • Analyzers demonstrated below: custom, standard, simple, whitespace, stop, keyword, pattern, and fingerprint. (To watch the pipeline stage by stage, see the _analyze sketch right after this list.)
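
The _analyze API accepts "explain": true, which reports the tokens emitted by the tokenizer and then by each token filter, making the pipeline above visible. A minimal sketch (the tokenizer, filter, and text here are arbitrary examples):

GET _analyze
{
  "tokenizer": "standard",
  "filter": [ "lowercase" ],
  "text": "The Quick Fox",
  "explain": true
}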

demo

custom

GET analyzer_index/_mapping

POST _analyze
{
  "analyzer": "whitespace",
  "text":     "The quick brown fox."
}

POST _analyze
{
  "tokenizer": "standard",
  "filter":  [ "lowercase", "asciifolding" ],
  "text":      "Is this déja vu?"
}
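
Per the reference docs, the whitespace demo should return [ The, quick, brown, fox. ], and the standard + lowercase + asciifolding combination should return [ is, this, deja, vu ].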

DELETE analyzer_index
#
# define a custom analyzer
#
PUT analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "std_folded": { 
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "my_text": {
        "type": "text",
        "analyzer": "std_folded" 
      }
    }
  }
}

GET analyzer_index/_analyze 
{
  "analyzer": "std_folded", 
  "text":     "Is this déjà vu?"
}

GET analyzer_index/_analyze 
{
  "field": "my_text", 
  "text":  "Is this déjà vu?"
}
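
Both requests should return the same terms, [ is, this, deja, vu ], since the my_text field is mapped to the std_folded analyzer.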


DELETE custom_analyzer_index
PUT custom_analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type":      "custom", 
          "tokenizer": "standard",
          "char_filter": [
            "html_strip"
          ],
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    }
  }
}

POST custom_analyzer_index/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "Is this <b>déjà vu</b>?"
}
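
This should return [ is, this, deja, vu ]: the html_strip char filter removes the <b> tags before tokenization, then lowercase and asciifolding normalize the terms.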

PUT custom_analyzer_index_v1
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "char_filter": [
            "emoticons" 
          ],
          "tokenizer": "punctuation", 
          "filter": [
            "lowercase",
            "english_stop" 
          ]
        }
      },
      "tokenizer": {
        "punctuation": { 
          "type": "pattern",
          "pattern": "[ .,!?]"
        }
      },
      "char_filter": {
        "emoticons": { 
          "type": "mapping",
          "mappings": [
            ":) => _happy_",
            ":( => _sad_"
          ]
        }
      },
      "filter": {
        "english_stop": { 
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}

POST custom_analyzer_index_v1/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text":     "I'm a :) person, and you?"
}
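
Per the reference docs, this should return [ i'm, _happy_, person, you ]: the emoticon is mapped to _happy_ before tokenization, the punctuation tokenizer splits on the listed characters, and english_stop drops "a" and "and".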

standard

  • Parameters: max_token_length, stopwords, stopwords_path

PUT analyzer_index_v1
{
  "settings": {
    "analysis": {
      "analyzer": {
        "std_english": { 
          "type":      "standard",
          "stopwords": "_english_"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "my_text": {
        "type":     "text",
        "analyzer": "standard", 
        "fields": {
          "english": {
            "type":     "text",
            "analyzer": "std_english" 
          }
        }
      }
    }
  }
}

POST analyzer_index_v1/_analyze
{
  "field": "my_text", 
  "text": "The old brown cow"
}

POST analyzer_index_v1/_analyze
{
  "field": "my_text.english", 
  "text": "The old brown cow"
}
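
The my_text request (standard analyzer) should return [ the, old, brown, cow ], while my_text.english (std_english) should drop the stop word: [ old, brown, cow ].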

PUT standard_analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_english_analyzer": {
          "type": "standard",
          "max_token_length": 5,
          "stopwords": "_english_"
        }
      }
    }
  }
}

POST standard_analyzer_index/_analyze
{
  "analyzer": "my_english_analyzer",
  "text": "The 2 QUICK testing jumped over the lazy dog's bone."
}
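
With max_token_length: 5, terms longer than 5 characters are split at that interval, so this should return something like [ 2, quick, testi, ng, jumpe, d, over, lazy, dog's, bone ] (the leading "The" is removed as an English stop word).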

# rebuild the built-in standard analyzer from its parts (fresh index name, since standard_analyzer_index already exists above)
PUT /standard_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_standard": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"       
          ]
        }
      }
    }
  }
}

simple

POST _analyze
{
  "analyzer": "simple",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
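
The simple analyzer lowercases and splits on any non-letter, so this should return [ the, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ] (note that "2" is dropped).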

PUT /simple_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_simple": {
          "tokenizer": "lowercase",
          "filter": [         
          ]
        }
      }
    }
  }
}

whitespace

POST _analyze
{
  "analyzer": "whitespace",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
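
The whitespace analyzer splits only on whitespace and does not lowercase, so this should return [ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ].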

PUT /whitespace_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_whitespace": {
          "tokenizer": "whitespace",
          "filter": [         
          ]
        }
      }
    }
  }
}

stop

POST _analyze
{
  "analyzer": "stop",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
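
The built-in stop analyzer is the simple analyzer plus the _english_ stop word list, so this should return [ quick, brown, foxes, jumped, over, lazy, dog, s, bone ].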

PUT stop_analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_stop_analyzer": {
          "type": "stop",
          "stopwords": ["the", "over"]
        }
      }
    }
  }
}

POST stop_analyzer_index/_analyze
{
  "analyzer": "my_stop_analyzer",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
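
With the custom stop word list, "over" is now removed as well: [ quick, brown, foxes, jumped, lazy, dog, s, bone ].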

keyword

POST _analyze
{
  "analyzer": "keyword",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
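
The keyword analyzer emits the whole input as a single term: [ The 2 QUICK Brown-Foxes jumped over the lazy dog's bone. ].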

PUT /keyword_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_keyword": {
          "tokenizer": "keyword",
          "filter": [         
          ]
        }
      }
    }
  }
}

pattern

  • default: \W+ (all non-word characters)

POST _analyze
{
  "analyzer": "pattern",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
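
With the default \W+ pattern (and lowercasing, which the pattern analyzer applies by default), this should return [ the, 2, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ].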

PUT pattern_analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_email_analyzer": {
          "type":      "pattern",
          "pattern":   "\\W|_", 
          "lowercase": true
        }
      }
    }
  }
}

POST pattern_analyzer_index/_analyze
{
  "analyzer": "my_email_analyzer",
  "text": "John_Smith@foo-bar.com"
}
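
Splitting on non-word characters or underscores should return [ john, smith, foo, bar, com ].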

#
# pattern: CamelCase tokenizer
# L: letter, Lu: uppercase Letter
#  ([^\p{L}\d]+)                 # swallow non letters and numbers,
# | (?<=\D)(?=\d)                 # or non-number followed by number,
# | (?<=\d)(?=\D)                 # or number followed by non-number,
# | (?<=[ \p{L} && [^\p{Lu}]])    # or lower case (letter but not uppercase letter)
#  (?=\p{Lu})                    #   followed by upper case,
# | (?<=\p{Lu})                   # or upper case
#  (?=\p{Lu}                     #   followed by upper case
#     [\p{L}&&[^\p{Lu}]]          #   then lower case
#  )
PUT pattern_analyzer_camel_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "camel": {
          "type": "pattern",
          "pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
        }
      }
    }
  }
}

GET pattern_analyzer_camel_index/_analyze
{
  "analyzer": "camel",
  "text": "MooseX::FTPClass2_beta"
}
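
Per the reference docs, this should return [ moose, x, ftp, class, 2, beta ].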

PUT /pattern_example
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "split_on_non_word": {
          "type":       "pattern",
          "pattern":    "\\W+" 
        }
      },
      "analyzer": {
        "rebuilt_pattern": {
          "tokenizer": "split_on_non_word",
          "filter": [
            "lowercase"       
          ]
        }
      }
    }
  }
}

fingerprint

  • Input text is lowercased, normalized to remove extended characters, sorted, deduplicated and concatenated into a single token.

POST _analyze
{
  "analyzer": "fingerprint",
  "text": "Yes yes, Gödel said this sentence is consistent and."
}
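
The fingerprint analyzer emits a single normalized token; this should return [ and consistent godel is said sentence this yes ].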

PUT fingerprint_analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_fingerprint_analyzer": {
          "type": "fingerprint",
          "stopwords": "_english_"
        }
      }
    }
  }
}

POST fingerprint_analyzer_index/_analyze
{
  "analyzer": "my_fingerprint_analyzer",
  "text": "Yes yes, Gödel said this sentence is consistent and."
}
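
With _english_ stop words configured, this should return [ consistent godel said sentence yes ].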

PUT /fingerprint_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_fingerprint": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding",
            "fingerprint"
          ]
        }
      }
    }
  }
}