Elasticsearch IK Analyzer Source Code Study 01
I. Overview
This article is a brief introduction to the IK analyzer's source code and its overall structure. It does not go into detailed, line-by-line source analysis; please keep that in mind.
1. Source Code Reference
Repository: git@github.com:medcl/elasticsearch-analysis-ik.git
Tag: 5.3.2 (clone the repository and check out this tag to follow along)
2. IDE
IntelliJ IDEA with Maven 3.0
3. Demo
The following unit test drives IKTokenizer directly, outside of Elasticsearch, and prints each term together with its position:
package iktest;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.common.settings.Settings;
import org.junit.Test;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.lucene.IKTokenizer;

import java.io.IOException;
import java.io.StringReader;

public class IKTestOne {

    public final String TEXT_Chinese = "上海迪士尼乐园";

    @Test
    public void test1() throws IOException {
        Settings settings = Settings.EMPTY;
        Configuration conf = new Configuration(null, settings);
        // use ik_smart (coarse-grained) segmentation
        conf.setUseSmart(true);
        IKTokenizer tokenizer = new IKTokenizer(conf);
        tokenizer.setReader(new StringReader(TEXT_Chinese));
        tokenizer.reset();
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute postIncr = tokenizer.addAttribute(PositionIncrementAttribute.class);
        int position = 0;
        // iterate the token stream, printing each term with its position
        while (tokenizer.incrementToken()) {
            position += postIncr.getPositionIncrement();
            System.out.println(position + " : " + term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}
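For comparison, the same tokenizer can be switched to ik_max_word mode by passing false to setUseSmart. The sketch below is a minimal variant of the test above (test2 is my own name for it, and the exact terms emitted depend on the dictionaries bundled with your IK version):

    @Test
    public void test2() throws IOException {
        // ik_max_word: setUseSmart(false) emits every candidate word,
        // not just the single coarse-grained split chosen by ik_smart.
        Configuration conf = new Configuration(null, Settings.EMPTY);
        conf.setUseSmart(false);
        IKTokenizer tokenizer = new IKTokenizer(conf);
        tokenizer.setReader(new StringReader(TEXT_Chinese));
        tokenizer.reset();
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }

Under ik_max_word the stream typically contains overlapping terms (for example, both a compound word and its sub-words), whereas ik_smart keeps only one coarse-grained segmentation; a common convention is to use ik_max_word when indexing and ik_smart when analyzing search queries.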