分词器对文本资源进行切分,即将文本按规则切分为一个个可以进行索引的最小单位(关键词)。下面来介绍几种常用的分词器。
单字分词器(StandardAnalyzer)的结果为
(不,0,1,type=<CJ>)
(管,1,2,type=<CJ>)
(你,2,3,type=<CJ>)
(信,3,4,type=<CJ>)
(不,4,5,type=<CJ>)
(信,5,6,type=<CJ>)
(反,7,8,type=<CJ>)
(正,8,9,type=<CJ>)
(我,9,10,type=<CJ>)
(是,10,11,type=<CJ>)
(信,11,12,type=<CJ>)
(了,12,13,type=<CJ>)
二分法分词器(CJKAnalyzer)的结果为
(不管,0,2,type=double)
(管你,1,3,type=double)
(你信,2,4,type=double)
(信不,3,5,type=double)
(不信,4,6,type=double)
(反正,7,9,type=double)
(正我,8,10,type=double)
(我是,9,11,type=double)
(是信,10,12,type=double)
(信了,11,13,type=double)
词典分词器(MMAnalyzer)的结果为
(管你,1,3)
(信不信,3,6)
(反正,7,9)
(我是,9,11)
(信,11,12)
package com.lamp.lucene.analyzer;
import java.io.StringReader;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.junit.Test;
/**
 * Demonstrates how different analyzers split the same Chinese sentence into tokens.
 *
 * <p>Three analyzers are prepared as fields; {@link #testAnalyzer()} runs one of them
 * against {@link #content} and prints every token to standard output.
 */
public class AnalyzerTest {
    // Single-character tokenizer.
    public Analyzer analyzer = new StandardAnalyzer();
    // Bigram tokenizer (overlapping two-character tokens).
    public Analyzer analyzer2 = new CJKAnalyzer();
    // Dictionary-based tokenizer.
    public Analyzer analyzer3 = new MMAnalyzer();
    // Sample sentence fed to the analyzers.
    public String content = "不管你信不信,反正我是信了";

    /**
     * Tokenizes {@link #content} with the single-character analyzer and prints the tokens.
     * Un-comment one of the other calls to see the output of the other analyzers.
     *
     * @throws Exception if tokenizing fails
     */
    @Test
    public void testAnalyzer() throws Exception {
        analyzer(analyzer, content);
        //analyzer(analyzer2,content);
        //analyzer(analyzer3,content);
    }

    /**
     * Runs {@code text} through {@code analyzer} and prints each produced token on its own line.
     *
     * @param analyzer the analyzer used to create the token stream
     * @param text     the text to tokenize
     * @throws Exception if reading from or closing the token stream fails
     */
    public void analyzer(Analyzer analyzer, String text) throws Exception {
        // The field name is passed as null, exactly as in the original demo.
        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
        try {
            Token token;
            // next() returns null once the stream is exhausted.
            while ((token = tokenStream.next()) != null) {
                System.out.println(token);
            }
        } finally {
            // Always release the stream (and the reader it wraps), even if tokenizing fails.
            tokenStream.close();
        }
    }
}
单字分词器的结果为
(不,0,1,type=<CJ>)
(管,1,2,type=<CJ>)
(你,2,3,type=<CJ>)
(信,3,4,type=<CJ>)
(不,4,5,type=<CJ>)
(信,5,6,type=<CJ>)
(反,7,8,type=<CJ>)
(正,8,9,type=<CJ>)
(我,9,10,type=<CJ>)
(是,10,11,type=<CJ>)
(信,11,12,type=<CJ>)
(了,12,13,type=<CJ>)
二分法分词结果为
(不管,0,2,type=double)
(管你,1,3,type=double)
(你信,2,4,type=double)
(信不,3,5,type=double)
(不信,4,6,type=double)
(反正,7,9,type=double)
(正我,8,10,type=double)
(我是,9,11,type=double)
(是信,10,12,type=double)
(信了,11,13,type=double)
词典分词结果为
(管你,1,3)
(信不信,3,6)
(反正,7,9)
(我是,9,11)
(信,11,12)