note
- The analyzers used at index time and at search time should match.
- An analyzer consists of: character filters, tokenizers, and token filters.
- character filters: transform or strip characters in the raw input (e.g. strip HTML, map characters).
- An analyzer has 0 or more character filters, applied in order.
- tokenizer: breaks the input into individual terms (tokens), and also records each term's position information.
- An analyzer must have exactly one tokenizer.
- Token filters: add, remove, or modify tokens. The lowercase token filter lowercases tokens; the stop token filter removes stop words; the synonym token filter adds synonyms (a combined demo follows the ad-hoc _analyze examples below). Token filters do not change character offsets or other position information.
- An analyzer has 0 or more token filters.
- Analyzers
demo
custom
GET analyzer_index/_mapping
POST _analyze
{
"analyzer": "whitespace",
"text": "The quick brown fox."
}
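# expected tokens: [ The, quick, brown, fox. ] (whitespace only splits on spaces; case and punctuation are kept)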
POST _analyze
{
"tokenizer": "standard",
"filter": [ "lowercase", "asciifolding" ],
"text": "Is this déja vu?"
}
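# expected tokens: [ is, this, deja, vu ] (asciifolding strips the accent)
#
# A throwaway _analyze request can exercise all three stages
# (char_filter -> tokenizer -> token filters) at once. This is a
# minimal sketch: html_strip, standard, and lowercase are built-ins,
# but the inline synonym list is made up for this demo.
POST _analyze
{
  "char_filter": [ "html_strip" ],
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    { "type": "synonym", "synonyms": [ "quick, fast" ] }
  ],
  "text": "<b>The QUICK fox</b>"
}
# expected tokens: [ the, quick, fast, fox ] (fast is added as a synonym of quick)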
DELETE analyzer_index
#
# Custom (user-defined) analyzers
#
PUT analyzer_index
{
"settings": {
"analysis": {
"analyzer": {
"std_folded": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
},
"mappings": {
"properties": {
"my_text": {
"type": "text",
"analyzer": "std_folded"
}
}
}
}
GET analyzer_index/_analyze
{
"analyzer": "std_folded",
"text": "Is this déjà vu?"
}
GET analyzer_index/_analyze
{
"field": "my_text",
"text": "Is this déjà vu?"
}
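# both requests above use std_folded and should produce: [ is, this, deja, vu ]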
DELETE custom_analyzer_index
PUT custom_analyzer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_custom_analyzer": {
"type": "custom",
"tokenizer": "standard",
"char_filter": [
"html_strip"
],
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
}
}
POST custom_analyzer_index/_analyze
{
"analyzer": "my_custom_analyzer",
"text": "Is this <b>déjà vu</b>?"
}
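# expected tokens: [ is, this, deja, vu ] (html_strip removes the <b> tags before tokenization)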
PUT custom_analyzer_index_v1
{
"settings": {
"analysis": {
"analyzer": {
"my_custom_analyzer": {
"type": "custom",
"char_filter": [
"emoticons"
],
"tokenizer": "punctuation",
"filter": [
"lowercase",
"english_stop"
]
}
},
"tokenizer": {
"punctuation": {
"type": "pattern",
"pattern": "[ .,!?]"
}
},
"char_filter": {
"emoticons": {
"type": "mapping",
"mappings": [
":) => _happy_",
":( => _sad_"
]
}
},
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
}
}
}
}
}
POST custom_analyzer_index_v1/_analyze
{
"analyzer": "my_custom_analyzer",
"text": "I'm a :) person, and you?"
}
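# expected tokens: [ i'm, _happy_, person, you ]
# (the emoticons char filter maps :) to _happy_; "a" and "and" are removed as stop words)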
standard
- Parameters: max_token_length, stopwords, stopwords_path
PUT analyzer_index_v1
{
"settings": {
"analysis": {
"analyzer": {
"std_english": {
"type": "standard",
"stopwords": "_english_"
}
}
}
},
"mappings": {
"properties": {
"my_text": {
"type": "text",
"analyzer": "standard",
"fields": {
"english": {
"type": "text",
"analyzer": "std_english"
}
}
}
}
}
}
POST analyzer_index_v1/_analyze
{
"field": "my_text",
"text": "The old brown cow"
}
POST analyzer_index_v1/_analyze
{
"field": "my_text.english",
"text": "The old brown cow"
}
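# the first request (standard) should produce: [ the, old, brown, cow ]
# the second (std_english) drops English stop words: [ old, brown, cow ]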
PUT standard_analyzer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_english_analyzer": {
"type": "standard",
"max_token_length": 5,
"stopwords": "_english_"
}
}
}
}
}
POST standard_analyzer_index/_analyze
{
"analyzer": "my_english_analyzer",
"text": "The 2 QUICK testing jumped over the lazy dog's bone."
}
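# expected tokens (roughly): [ 2, quick, testi, ng, jumpe, d, lazy, dog's, bone ]
# (tokens longer than 5 chars are split; English stop words are removed)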
PUT /standard_example
{
"settings": {
"analysis": {
"analyzer": {
"rebuilt_standard": {
"tokenizer": "standard",
"filter": [
"lowercase"
]
}
}
}
}
}
simple
POST _analyze
{
"analyzer": "simple",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
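# expected tokens: [ the, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ]
# (simple splits on anything that is not a letter, and lowercases)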
PUT /simple_example
{
"settings": {
"analysis": {
"analyzer": {
"rebuilt_simple": {
"tokenizer": "lowercase",
"filter": [
]
}
}
}
}
}
whitespace
POST _analyze
{
"analyzer": "whitespace",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
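# expected tokens: [ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ]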
PUT /whitespace_example
{
"settings": {
"analysis": {
"analyzer": {
"rebuilt_whitespace": {
"tokenizer": "whitespace",
"filter": [
]
}
}
}
}
}
stop
POST _analyze
{
"analyzer": "stop",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
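# expected tokens: [ quick, brown, foxes, jumped, over, lazy, dog, s, bone ]
# (like simple, plus English stop-word removal)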
PUT stop_analyzer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_stop_analyzer": {
"type": "stop",
"stopwords": ["the", "over"]
}
}
}
}
}
POST stop_analyzer_index/_analyze
{
"analyzer": "my_stop_analyzer",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
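# expected tokens: [ quick, brown, foxes, jumped, lazy, dog, s, bone ]
# (only "the" and "over" are treated as stop words here)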
keyword
POST _analyze
{
"analyzer": "keyword",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
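# expected output: a single token, [ The 2 QUICK Brown-Foxes jumped over the lazy dog's bone. ]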
PUT /keyword_example
{
"settings": {
"analysis": {
"analyzer": {
"rebuilt_keyword": {
"tokenizer": "keyword",
"filter": [
]
}
}
}
}
}
pattern
- default: \W+ (all non-word characters)
POST _analyze
{
"analyzer": "pattern",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
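# expected tokens: [ the, 2, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ]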
PUT pattern_analyzer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_email_analyzer": {
"type": "pattern",
"pattern": "\\W|_",
"lowercase": true
}
}
}
}
}
POST pattern_analyzer_index/_analyze
{
"analyzer": "my_email_analyzer",
"text": "John_Smith@foo-bar.com"
}
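# expected tokens: [ john, smith, foo, bar, com ]
# (splits on non-word characters and underscores, then lowercases)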
#
# pattern: CamelCase tokenizer
# \p{L}: any letter, \p{Lu}: uppercase letter
#   ([^\p{L}\d]+)               # swallow non-letters and non-digits,
# | (?<=\D)(?=\d)               # or non-digit followed by digit,
# | (?<=\d)(?=\D)               # or digit followed by non-digit,
# | (?<=[\p{L}&&[^\p{Lu}]])     # or lowercase (a letter that is not uppercase)
#   (?=\p{Lu})                  #    followed by uppercase,
# | (?<=\p{Lu})                 # or uppercase
#   (?=\p{Lu}                   #    followed by uppercase
#     [\p{L}&&[^\p{Lu}]]        #    then lowercase
#   )
PUT pattern_analyzer_camel_index
{
"settings": {
"analysis": {
"analyzer": {
"camel": {
"type": "pattern",
"pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
}
}
}
}
}
GET pattern_analyzer_camel_index/_analyze
{
"analyzer": "camel",
"text": "MooseX::FTPClass2_beta"
}
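# expected tokens: [ moose, x, ftp, class, 2, beta ]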
PUT /pattern_example
{
"settings": {
"analysis": {
"tokenizer": {
"split_on_non_word": {
"type": "pattern",
"pattern": "\\W+"
}
},
"analyzer": {
"rebuilt_pattern": {
"tokenizer": "split_on_non_word",
"filter": [
"lowercase"
]
}
}
}
}
}
fingerprint
- Input text is lowercased, normalized to remove extended characters, sorted, deduplicated and concatenated into a single token.
POST _analyze
{
"analyzer": "fingerprint",
"text": "Yes yes, Gödel said this sentence is consistent and."
}
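# expected output: a single token, [ and consistent godel is said sentence this yes ]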
PUT fingerprint_analyzer_index
{
"settings": {
"analysis": {
"analyzer": {
"my_fingerprint_analyzer": {
"type": "fingerprint",
"stopwords": "_english_"
}
}
}
}
}
POST fingerprint_analyzer_index/_analyze
{
"analyzer": "my_fingerprint_analyzer",
"text": "Yes yes, Gödel said this sentence is consistent and."
}
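# expected output: [ consistent godel said sentence yes ] (English stop words removed before concatenation)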
PUT /fingerprint_example
{
"settings": {
"analysis": {
"analyzer": {
"rebuilt_fingerprint": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding",
"fingerprint"
]
}
}
}
}
}