package org.lucene.util;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
 * A custom stop-word analyzer: tokenizes with LetterTokenizer, lower-cases
 * the tokens, and then removes the words in the stop set.
 */
public class MyStopAnalyzer extends Analyzer {

	private CharArraySet stops;

	public MyStopAnalyzer(String[] sws) {
		// makeStopSet converts the String array into a CharArraySet (ignoring case)
		stops = StopFilter.makeStopSet(Version.LUCENE_4_9, sws, true);
		// add the default English stop words to the custom set
		stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
	}

	public MyStopAnalyzer() {
		// use only the default English stop words
		stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
	}

	/*
	 * The legacy (pre-Lucene 4) API built the chain directly in tokenStream():
	 *
	 * public TokenStream tokenStream(String fieldName, Reader reader) {
	 *     // Tokenizer is LetterTokenizer, followed by LowerCaseFilter
	 *     // (lower-casing) and then StopFilter (stop-word removal)
	 *     return new StopFilter(Version.LUCENE_4_9,
	 *             new LowerCaseFilter(Version.LUCENE_4_9,
	 *                     new LetterTokenizer(Version.LUCENE_4_9, reader)),
	 *             new CharArraySet(Version.LUCENE_4_9, stops, true));
	 * }
	 */

	@Override
	protected TokenStreamComponents createComponents(String fieldName,
			Reader reader) {
		// create the Tokenizer
		Tokenizer tokenizer = new LetterTokenizer(Version.LUCENE_4_9, reader);
		// build the filter chain: LowerCaseFilter (lower-casing) first,
		// then StopFilter (stop-word removal)
		TokenStream ts = new StopFilter(Version.LUCENE_4_9,
				new LowerCaseFilter(Version.LUCENE_4_9, tokenizer),
				new CharArraySet(Version.LUCENE_4_9, stops, true));
		// wrap the tokenizer and the filtered stream into TokenStreamComponents
		return new TokenStreamComponents(tokenizer, ts);
	}
}
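
// AnalyzerUtils is not defined in this listing; below is a minimal sketch of
// what its displayToken helper could look like (the class and method names are
// taken from the call site in the test; the implementation is an assumption):
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerUtils {

	public static void displayToken(String text, Analyzer analyzer) {
		try {
			// obtain the analyzer's TokenStream for an arbitrary field name
			TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
			CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
			ts.reset();
			// print every token that survives the filter chain
			while (ts.incrementToken()) {
				System.out.print("[" + term.toString() + "]");
			}
			System.out.println();
			ts.end();
			ts.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}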
// JUnit test exercising both constructors (placed in a separate test class):
import org.junit.Test;
import org.apache.lucene.analysis.Analyzer;

public class MyStopAnalyzerTest {

	@Test
	public void test04() {
		Analyzer a1 = new MyStopAnalyzer(new String[] { "I", "you", "hate", "how" });
		Analyzer a2 = new MyStopAnalyzer();
		String txt = "how are you thank you I hate you";
		AnalyzerUtils.displayToken(txt, a1);
		AnalyzerUtils.displayToken(txt, a2);
	}
}
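
// Expected output, assuming displayToken prints the surviving terms: a1 drops
// the four custom words plus the default English stop words (including "are"),
// leaving only [thank]; a2 drops just the English stop words, producing
// [how][you][thank][you][i][hate][you].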