package com.jiepu.lucene_49;
// Comparison test of different Lucene Chinese tokenizers, adapted from: http://blog.zzzhc.com/blogs/86/
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Iterator;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.Version;
import org.lionsoul.jcseg.analyzer.JcsegAnalyzer4X;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
/**
 * Benchmarks several Chinese-capable Lucene analyzers against the same sample
 * text. For each analyzer it prints the attribute classes its TokenStream
 * exposes, every token with its character offsets, and a rough throughput
 * figure in chars/second.
 */
public class CNAnalyzerBenchmark {

    public static void main(String[] args) throws IOException {
        runAndClose(new CJKAnalyzer(Version.LUCENE_43));
        runAndClose(new IKAnalyzer());
        runAndClose(new PaodingAnalyzer());
        // false presumably disables the default stopword set — TODO confirm
        // against this SmartChineseAnalyzer version's constructor javadoc.
        runAndClose(new SmartChineseAnalyzer(false));
        runAndClose(new StandardAnalyzer(Version.LUCENE_31));
        runAndClose(new MMSegAnalyzer());
        // 0 was the original mode argument; which jcseg segmentation mode it
        // selects is not visible here — TODO confirm against jcseg docs.
        runAndClose(new JcsegAnalyzer4X(0));
        runAndClose(new SimpleAnalyzer(Version.LUCENE_30));
    }

    /**
     * Runs the benchmark on {@code a} and always releases the analyzer
     * afterwards. Analyzer implements Closeable in Lucene 4.x and may hold
     * reusable per-thread TokenStream components.
     */
    private static void runAndClose(Analyzer a) throws IOException {
        try {
            testAnalyzer(a);
        } finally {
            a.close();
        }
    }

    /** Backward-compatible entry point: benchmarks one copy of the sample text. */
    static void testAnalyzer(Analyzer a) throws IOException {
        testAnalyzer(a, 1);
    }

    /**
     * Tokenizes the sample text (repeated {@code repeat} times) with the given
     * analyzer, printing attribute classes, tokens with offsets, and throughput.
     *
     * @param a      analyzer under test; the caller owns it and must close it
     * @param repeat how many copies of the sample text to concatenate (>= 1);
     *               larger values give steadier timings
     * @throws IllegalArgumentException if {@code repeat < 1}
     * @throws IOException if the token stream fails during consumption
     */
    static void testAnalyzer(Analyzer a, int repeat) throws IOException {
        if (repeat < 1) {
            throw new IllegalArgumentException("repeat must be >= 1, got " + repeat);
        }
        String data = "中文(chinese)与西方语言最大的区别" + "就在于语句的词汇之间没有明显的分词界限,"
                + "但是计算机自然语言处理是按词汇来进行分析的," + "因此中文分词的效果直接影响中文检索和自然语言处理的准确性。";
        StringBuilder ss = new StringBuilder(data.length() * repeat);
        for (int i = 0; i < repeat; i++) {
            ss.append(data);
        }
        String s = ss.toString();
        long startTime = System.currentTimeMillis();
        TokenStream stream = a.tokenStream("", new StringReader(s));
        try {
            // List which attribute implementations this stream carries.
            Iterator<Class<? extends Attribute>> iterator = stream
                    .getAttributeClassesIterator();
            while (iterator.hasNext()) {
                Class<? extends Attribute> attrClass = iterator.next();
                System.out.println(attrClass.getSimpleName());
            }
            // Start/end character offsets of each token.
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            // Token term text.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            // Token type (registered but not printed; kept from the original code).
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
            // TokenStream contract: reset() before the first incrementToken().
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print(offset.startOffset() + "-"
                        + offset.endOffset() + ":" + term.toString());
            }
            System.out.println();
            long endTime = System.currentTimeMillis();
            // Clamp to 1 ms so a very fast run cannot divide by zero below.
            double seconds = Math.max(endTime - startTime, 1) / 1000.0;
            System.out.println("chars=" + s.length() + ",time=" + seconds
                    + "seconds" + ",speed=" + (int) (s.length() / seconds)
                    + " chars/second");
            // end() finalizes offsets for the stream; must precede close().
            stream.end();
        } finally {
            stream.close();
        }
    }
}
// Lucene tokenizer demo
// (Original post last published 2022-01-24 13:51:28)