Code:
Step 1: Create your own analyzer
package com.lk;

import org.apache.lucene.analysis.*;
import org.apache.lucene.util.Version;

import java.io.Reader;
import java.util.Set;

public final class MyStopAnalyzer extends Analyzer { // extend the Analyzer base class
    private Set stops;

    public MyStopAnalyzer(String[] sws) { // constructor: build the stop-word set (ignoreCase = true)
        stops = StopFilter.makeStopSet(Version.LUCENE_35, sws, true);
    }

    public final TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(Version.LUCENE_35,           // 3. drop the stop words
                new LowerCaseFilter(Version.LUCENE_35,     // 2. lowercase each token
                        new LetterTokenizer(Version.LUCENE_35, reader)), stops); // 1. split on non-letters
    }
}
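The chain inside tokenStream runs from the inside out: LetterTokenizer splits the input on non-letter characters, LowerCaseFilter lowercases each token, and StopFilter drops every token found in the stop set. As a minimal sketch of the stop set itself (my own illustration, not from the original post, assuming Lucene 3.5 on the classpath): makeStopSet builds a CharArraySet, and passing ignoreCase=true makes lookups case-insensitive:

// Sketch (assumption: Lucene 3.5, same imports as the class above; put inside any method).
Set stops = StopFilter.makeStopSet(Version.LUCENE_35, new String[]{"I", "are"}, true);
System.out.println(stops.contains("i"));    // true: matches "I" case-insensitively
System.out.println(stops.contains("how"));  // false: not a stop word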
Step 2: Display the contents of the TokenStream.
package com.lk;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.IOException;
import java.io.StringReader;

/**
 * Created by LK on 2016/12/24.
 */
public class AnalyzerUtils {
    public static void displayAllTokenInfo(String str, Analyzer a) {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class); // position increment between terms
        OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);       // start/end character offsets of each term
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);  // the term text itself
        TypeAttribute ta = stream.addAttribute(TypeAttribute.class);           // the token type (e.g. "word")
        try {
            while (stream.incrementToken()) {
                System.out.println(pia.getPositionIncrement() + ":");
                System.out.println(cta + "[" + oa.startOffset() + "-" + oa.endOffset() + "]-->" + ta.type() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
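If you only care about the term texts, a trimmed-down helper is enough. This displayTokens method is my own sketch (not in the original post) and assumes the same imports as AnalyzerUtils above:

// Hypothetical helper: prints just the term texts, one after another.
public static void displayTokens(String str, Analyzer a) throws IOException {
    TokenStream stream = a.tokenStream("content", new StringReader(str));
    CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        System.out.print("[" + cta + "] "); // CharTermAttribute.toString() returns the term text
    }
    System.out.println();
}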
Step 3: Run the test
package com.lk;

import org.apache.lucene.analysis.Analyzer;
import org.junit.Test;

public class TestMyAnalyzer {
    @Test
    public void test001() {
        Analyzer a1 = new MyStopAnalyzer(new String[]{"I", "are", "you", "me", "is"}); // the words we want stopped
        String str = "how are you my name is lk thanks you i hope you like me";
        AnalyzerUtils.displayAllTokenInfo(str, a1);
    }
}
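To see what the custom stop words add, you could also run the same string through Lucene's stock StopAnalyzer for comparison (a sketch, not part of the original test; the built-in English set removes "are" and "is" but keeps "you", "me", and "i"):

// Sketch: append to test001 above; assumes org.apache.lucene.analysis.StopAnalyzer
// and org.apache.lucene.util.Version are imported.
Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
AnalyzerUtils.displayAllTokenInfo(str, a2);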
Step 4: Test results. The number before each term is its position increment: a value greater than 1 means stop words were removed at the skipped positions (for example, "my" follows the stopped "are" and "you", hence the increment of 3).
1:
how[0-3]-->word
3:
my[13-15]-->word
1:
name[16-20]-->word
2:
lk[24-26]-->word
1:
thanks[28-34]-->word
3:
hope[42-46]-->word
2:
like[51-55]-->word
Adding StopAnalyzer's built-in English stop words to the set:
package com.lk;

import org.apache.lucene.analysis.*;
import org.apache.lucene.util.Version;

import java.io.Reader;
import java.util.Set;

/**
 * Created by LK on 2016/12/24.
 */
public final class MyStopAnalyzer extends Analyzer {
    private Set stops;

    public MyStopAnalyzer(String[] sws) {
        stops = StopFilter.makeStopSet(Version.LUCENE_35, sws, true);
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET); // merge in the built-in English stop words
    }

    public MyStopAnalyzer() {
        stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET; // built-in set only (note: it is unmodifiable)
    }

    public final TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(Version.LUCENE_35,
                new LowerCaseFilter(Version.LUCENE_35,
                        new LetterTokenizer(Version.LUCENE_35, reader)), stops);
    }
}
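A quick usage sketch of the two constructors (my own illustration, not from the original post): the no-arg constructor filters only the built-in English stop words, so custom words like "you" and "me" pass through it:

// Sketch: inside a test method, with AnalyzerUtils from Step 2 available.
Analyzer custom  = new MyStopAnalyzer(new String[]{"you", "me"}); // built-in + custom stop words
Analyzer builtIn = new MyStopAnalyzer();                          // built-in stop words only
AnalyzerUtils.displayAllTokenInfo("are you with me", custom);     // all four tokens are stopped
AnalyzerUtils.displayAllTokenInfo("are you with me", builtIn);    // "you" and "me" survive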