Lucene version: 3.1.0
For English text, the standard analyzer, StandardAnalyzer, is used;
for Chinese word segmentation, SmartChineseAnalyzer is used.
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.junit.Test;
public class Analyzertest {

    // Swap in StandardAnalyzer (and the Chinese sample text) to compare output.
    // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_31);

    // String text = "我是中国人";
    String text = "IndexWriter javadoc a apach2.0.txt";

    @Test
    public void test() {
        try {
            analyze(analyzer, text);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Runs the given analyzer over the text and prints one token per line.
    public void analyze(Analyzer al, String text) throws Exception {
        TokenStream tokenStream = al.tokenStream("content", new StringReader(text));
        // TermAttribute is deprecated; the javadoc recommends CharTermAttribute instead.
        CharTermAttribute termAttr = tokenStream.addAttribute(CharTermAttribute.class);
        // Per the TokenStream contract: reset, consume, end, close.
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            System.out.println(termAttr.toString());
        }
        tokenStream.end();
        tokenStream.close();
    }
}
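
To make the per-language choice concrete, the analyze() helper above can be pointed at the same Chinese sample with both analyzers. This is a minimal sketch to drop into the test class; the tokens shown in the comments are approximate and depend on the dictionary bundled with lucene-smartcn 3.1.0, so verify them against your own output.

    @Test
    public void compareAnalyzers() throws Exception {
        String chinese = "我是中国人";
        // StandardAnalyzer has no Chinese dictionary and emits CJK text
        // one character per token, roughly: 我 / 是 / 中 / 国 / 人
        analyze(new StandardAnalyzer(Version.LUCENE_31), chinese);
        // SmartChineseAnalyzer segments into dictionary words, roughly: 我 / 是 / 中国 / 人
        analyze(new SmartChineseAnalyzer(Version.LUCENE_31), chinese);
    }

This difference is why SmartChineseAnalyzer is preferred for Chinese text: single-character tokens from StandardAnalyzer match far too broadly at search time, while word-level tokens keep queries precise.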