The building blocks of an analyzer:
Character Filter, Tokenizer, Token Filter
1. Character Filters include:
HTML strip (removes HTML tags); Mapping (string replacement); Pattern replace (regex match-and-replace)
2. Tokenizers include:
whitespace / standard / uax_url_email / pattern / keyword / path_hierarchy (path tokenizer)
3. Token Filters include:
lowercase / stop / synonym
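Token filters can be tried directly in the _analyze API by chaining them after a tokenizer. A quick sketch (the sample sentence is made up): lowercase runs first, so the stop filter can then drop "the":
POST _analyze
{
  "tokenizer": "standard",
  "filter": ["lowercase", "stop"],
  "text": "The QUICK Brown Foxes"
}
This returns quick, brown, foxes.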
POST _analyze
{
  "tokenizer": "keyword",
  "char_filter": ["html_strip"],
  "text": "<b>hello world</b>"
}
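The keyword tokenizer emits the entire input as a single token, so after html_strip removes the <b> tags this returns one term: hello world.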
POST _analyze
{
  "tokenizer": "standard",
  "char_filter": [
    {
      "type": "mapping",
      "mappings": [":) => happy", ":( => sad"]
    }
  ],
  "text": ["I am feeling :)", "Feeling :( today"]
}
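The mapping char filter rewrites :) to happy and :( to sad before the standard tokenizer runs, so the emoticons survive tokenization as ordinary words.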
POST _analyze
{
  "tokenizer": "standard",
  "char_filter": [
    {
      "type": "pattern_replace",
      "pattern": "http://(.*)",
      "replacement": "$1"
    }
  ],
  "text": "http://www.elasti.co"
}
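For the opposite treatment of URLs, the uax_url_email tokenizer listed above keeps URLs and e-mail addresses intact as single tokens instead of splitting them. A quick sketch, reusing the URL above plus a made-up address:
POST _analyze
{
  "tokenizer": "uax_url_email",
  "text": "mail me at john@example.com or visit http://www.elasti.co"
}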
POST _analyze
{
  "tokenizer": "path_hierarchy",
  "text": "/user/ymruan/a/b"
}
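path_hierarchy emits one token per path prefix, so this request returns /user, /user/ymruan, /user/ymruan/a, and /user/ymruan/a/b.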
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "char_filter": ["mamj_char_filter"],
          "tokenizer": "mamj_tokenizer",
          "filter": ["lowercase", "english_stop"]
        }
      },
      "tokenizer": {
        "mamj_tokenizer": {
          "type": "pattern",
          "pattern": "[.,!?]"
        }
      },
      "char_filter": {
        "mamj_char_filter": {
          "type": "mapping",
          "mappings": [":) => happy", ":( => sad"]
        }
      },
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}
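This wires the three stages together: mamj_char_filter maps the emoticons to words, mamj_tokenizer splits on the punctuation in [.,!?], and the lowercase and english_stop token filters post-process the tokens. Testing the new analyzer against the index: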
POST my_index/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "I'm a :) person, and you?"
}
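The synonym token filter from the list above is the one component not exercised here; a minimal sketch, using an inline filter definition and a made-up synonym pair:
POST _analyze
{
  "tokenizer": "standard",
  "filter": [
    {
      "type": "synonym",
      "synonyms": ["happy, glad"]
    }
  ],
  "text": "I am happy"
}
Both happy and glad are emitted at the same position, so a search for either term would match.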