Lucene自定义分词器

package org.lucene.util;


import java.io.Reader;
import java.util.Set;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;


/**
 * 停用词分词器
 * @author 
 *
 */
public class MyStopAnalyzer extends Analyzer {
@SuppressWarnings("rawtypes")
private Set stops;
@SuppressWarnings("unchecked")
public MyStopAnalyzer(String[] sws) {
//会自动将字符串数组转换为Set
stops = StopFilter.makeStopSet(Version.LUCENE_4_9, sws, true);
//将原有的停用词加入到现有的停用词中
stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}

public MyStopAnalyzer() {
//获取原有的停用词
stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}


/**
public TokenStream tokenStream(String fieldName, Reader reader) {
//为这个分词器设置过滤链和Tokenizer
//使用的Tokenizer是 LetterTokenizer,先经过过滤器LowerCaseFilter(转换为小写的过滤器),再经过过滤器StopFilter(停用词过滤器)
return new StopFilter(Version.LUCENE_4_9,
  new LowerCaseFilter(Version.LUCENE_4_9, 
  new LetterTokenizer(Version.LUCENE_4_9,reader)), new CharArraySet(Version.LUCENE_4_9,stops,true));
}
    */
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
//创建Tokenizer
Tokenizer  tokenizer=new LetterTokenizer(Version.LUCENE_4_9,reader);
//创建过滤器链,先经过过滤器LowerCaseFilter(转换为小写的过滤器),再经过过滤器StopFilter(停用词过滤器)
TokenStream ts= new StopFilter(Version.LUCENE_4_9,
  new LowerCaseFilter(Version.LUCENE_4_9, tokenizer),new CharArraySet(Version.LUCENE_4_9,stops,true));
//创建TokenStreamComponents
TokenStreamComponents  tscs=new TokenStreamComponents(tokenizer,ts);
return tscs;
}



@Test
public void test04() {
Analyzer a1 = new MyStopAnalyzer(new String[]{"I","you","hate","how"});
Analyzer a2 = new MyStopAnalyzer();
String txt = "how are you thank you I hate you";
AnalyzerUtils.displayToken(txt, a1);
AnalyzerUtils.displayToken(txt, a2);
}


}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值