[Elasticsearch Notes] Analysis - Analyzer

This article covers the Elasticsearch Analyzer in detail: its role in both indexing and search, the components of an analyzer (character filters, tokenizer, and token filters), and the usage and characteristics of the custom, standard, simple, whitespace, stop, keyword, pattern, and fingerprint analyzers.

note

  • The analyzers used at index time and at search time should be kept consistent.
  • An analyzer is made up of: character filters, a tokenizer, and token filters.
  • Character filters: transform or filter characters before tokenization (e.g. strip HTML, map characters).
  • An analyzer has 0 or more character filters, applied in order.
  • Tokenizer: splits the input into individual terms (tokens) and also records each term's position information.
  • An analyzer must have exactly one tokenizer.
  • Token filters: add, remove, or modify tokens. The lowercase token filter lowercases terms; the stop token filter removes stop words; the synonym token filter adds synonyms. Token filters do not change the character offsets and similar information of the tokens.
  • An analyzer has 0 or more token filters.
  • Analyzers demonstrated below: custom, standard, simple, whitespace, stop, keyword, pattern, and fingerprint. (To watch the pipeline stage by stage, see the _analyze sketch right after this list.)
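
The _analyze API accepts "explain": true, which reports the tokens emitted by the tokenizer and then by each token filter, making the pipeline above visible. A minimal sketch (the tokenizer, filter, and text here are arbitrary examples):

GET _analyze
{
  "tokenizer": "standard",
  "filter": [ "lowercase" ],
  "text": "The Quick Fox",
  "explain": true
}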

demo

custom

GET analyzer_index/_mapping

POST _analyze
{
  "analyzer": "whitespace",
  "text":     "The quick brown fox."
}

POST _analyze
{
  "tokenizer": "standard",
  "filter":  [ "lowercase", "asciifolding" ],
  "text":      "Is this déja vu?"
}
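
Per the reference docs, the whitespace demo should return [ The, quick, brown, fox. ], and the standard + lowercase + asciifolding combination should return [ is, this, deja, vu ].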

DELETE analyzer_index
#
# define a custom analyzer
#
PUT analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "std_folded": { 
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "my_text": {
        "type": "text",
        "analyzer": "std_folded" 
      }
    }
  }
}

GET analyzer_index/_analyze 
{
  "analyzer": "std_folded", 
  "text":     "Is this déjà vu?"
}

GET analyzer_index/_analyze 
{
  "field": "my_text", 
  "text":  "Is this déjà vu?"
}
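
Both requests should return the same terms, [ is, this, deja, vu ], since the my_text field is mapped to the std_folded analyzer.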


DELETE custom_analyzer_index
PUT custom_analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type":      "custom", 
          "tokenizer": "standard",
          "char_filter": [
            "html_strip"
          ],
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    }
  }
}

POST custom_analyzer_index/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "Is this <b>déjà vu</b>?"
}
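
This should return [ is, this, deja, vu ]: the html_strip char filter removes the <b> tags before tokenization, then lowercase and asciifolding normalize the terms.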

PUT custom_analyzer_index_v1
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "char_filter": [
            "emoticons" 
          ],
          "tokenizer": "punctuation", 
          "filter": [
            "lowercase",
            "english_stop" 
          ]
        }
      },
      "tokenizer": {
        "punctuation": { 
          "type": "pattern",
          "pattern": "[ .,!?]"
        }
      },
      "char_filter": {
        "emoticons": { 
          "type": "mapping",
          "mappings": [
            ":) => _happy_",
            ":( => _sad_"
          ]
        }
      },
      "filter": {
        "english_stop": { 
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}

POST custom_analyzer_index_v1/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text":     "I'm a :) person, and you?"
}
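
Per the reference docs, this should return [ i'm, _happy_, person, you ]: the emoticon is mapped to _happy_ before tokenization, the punctuation tokenizer splits on the listed characters, and english_stop drops "a" and "and".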

standard

  • Parameters: max_token_length, stopwords, stopwords_path

PUT analyzer_index_v1
{
  "settings": {
    "analysis": {
      "analyzer": {
        "std_english": { 
          "type":      "standard",
          "stopwords": "_english_"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "my_text": {
        "type":     "text",
        "analyzer": "standard", 
        "fields": {
          "english": {
            "type":     "text",
            "analyzer": "std_english" 
          }
        }
      }
    }
  }
}

POST analyzer_index_v1/_analyze
{
  "field": "my_text", 
  "text": "The old brown cow"
}

POST analyzer_index_v1/_analyze
{
  "field": "my_text.english", 
  "text": "The old brown cow"
}
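
The my_text request (standard analyzer) should return [ the, old, brown, cow ], while my_text.english (std_english) should drop the stop word: [ old, brown, cow ].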

PUT standard_analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_english_analyzer": {
          "type": "standard",
          "max_token_length": 5,
          "stopwords": "_english_"
        }
      }
    }
  }
}

POST standard_analyzer_index/_analyze
{
  "analyzer": "my_english_analyzer",
  "text": "The 2 QUICK testing jumped over the lazy dog's bone."
}
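
With max_token_length: 5, terms longer than 5 characters are split at that interval, so this should return something like [ 2, quick, testi, ng, jumpe, d, over, lazy, dog's, bone ] (the leading "The" is removed as an English stop word).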

# rebuild the built-in standard analyzer from its parts (fresh index name, since standard_analyzer_index already exists above)
PUT /standard_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_standard": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"       
          ]
        }
      }
    }
  }
}

simple

POST _analyze
{
  "analyzer": "simple",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
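
The simple analyzer lowercases and splits on any non-letter, so this should return [ the, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ] (note that "2" is dropped).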

PUT /simple_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_simple": {
          "tokenizer": "lowercase",
          "filter": [         
          ]
        }
      }
    }
  }
}

whitespace

POST _analyze
{
  "analyzer": "whitespace",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
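
The whitespace analyzer splits only on whitespace and does not lowercase, so this should return [ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ].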

PUT /whitespace_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_whitespace": {
          "tokenizer": "whitespace",
          "filter": [         
          ]
        }
      }
    }
  }
}

stop

POST _analyze
{
  "analyzer": "stop",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
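
The built-in stop analyzer is the simple analyzer plus the _english_ stop word list, so this should return [ quick, brown, foxes, jumped, over, lazy, dog, s, bone ].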

PUT stop_analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_stop_analyzer": {
          "type": "stop",
          "stopwords": ["the", "over"]
        }
      }
    }
  }
}

POST stop_analyzer_index/_analyze
{
  "analyzer": "my_stop_analyzer",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
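
With the custom stop word list, "over" is now removed as well: [ quick, brown, foxes, jumped, lazy, dog, s, bone ].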

keyword

POST _analyze
{
  "analyzer": "keyword",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
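
The keyword analyzer emits the whole input as a single term: [ The 2 QUICK Brown-Foxes jumped over the lazy dog's bone. ].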

PUT /keyword_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_keyword": {
          "tokenizer": "keyword",
          "filter": [         
          ]
        }
      }
    }
  }
}

pattern

  • default: \W+ (all non-word characters)

POST _analyze
{
  "analyzer": "pattern",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
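
With the default \W+ pattern (and lowercasing, which the pattern analyzer applies by default), this should return [ the, 2, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ].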

PUT pattern_analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_email_analyzer": {
          "type":      "pattern",
          "pattern":   "\\W|_", 
          "lowercase": true
        }
      }
    }
  }
}

POST pattern_analyzer_index/_analyze
{
  "analyzer": "my_email_analyzer",
  "text": "John_Smith@foo-bar.com"
}
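
Splitting on non-word characters or underscores should return [ john, smith, foo, bar, com ].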

#
# pattern: CamelCase tokenizer
# L: letter, Lu: uppercase Letter
#  ([^\p{L}\d]+)                 # swallow non letters and numbers,
# | (?<=\D)(?=\d)                 # or non-number followed by number,
# | (?<=\d)(?=\D)                 # or number followed by non-number,
# | (?<=[ \p{L} && [^\p{Lu}]])    # or lower case (letter but not uppercase letter)
#  (?=\p{Lu})                    #   followed by upper case,
# | (?<=\p{Lu})                   # or upper case
#  (?=\p{Lu}                     #   followed by upper case
#     [\p{L}&&[^\p{Lu}]]          #   then lower case
#  )
PUT pattern_analyzer_camel_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "camel": {
          "type": "pattern",
          "pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
        }
      }
    }
  }
}

GET pattern_analyzer_camel_index/_analyze
{
  "analyzer": "camel",
  "text": "MooseX::FTPClass2_beta"
}
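
Per the reference docs, this should return [ moose, x, ftp, class, 2, beta ].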

PUT /pattern_example
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "split_on_non_word": {
          "type":       "pattern",
          "pattern":    "\\W+" 
        }
      },
      "analyzer": {
        "rebuilt_pattern": {
          "tokenizer": "split_on_non_word",
          "filter": [
            "lowercase"       
          ]
        }
      }
    }
  }
}

fingerprint

  • Input text is lowercased, normalized to remove extended characters, sorted, deduplicated and concatenated into a single token.

POST _analyze
{
  "analyzer": "fingerprint",
  "text": "Yes yes, Gödel said this sentence is consistent and."
}
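
The fingerprint analyzer emits a single normalized token; this should return [ and consistent godel is said sentence this yes ].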

PUT fingerprint_analyzer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_fingerprint_analyzer": {
          "type": "fingerprint",
          "stopwords": "_english_"
        }
      }
    }
  }
}

POST fingerprint_analyzer_index/_analyze
{
  "analyzer": "my_fingerprint_analyzer",
  "text": "Yes yes, Gödel said this sentence is consistent and."
}
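
With _english_ stop words configured, this should return [ consistent godel said sentence yes ].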

PUT /fingerprint_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_fingerprint": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding",
            "fingerprint"
          ]
        }
      }
    }
  }
}