import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import com.lietu.seg.result.CnTokenizer;
//需要lucene-core-2.3.2.jar和猎兔分词seg.jar和字典目录dic
/**
 * Small command-line demo that runs a Chinese sentence through the Lietu
 * {@code CnTokenizer} and additionally indexes every character of each
 * multi-character word as its own token.
 *
 * <p>Requires lucene-core-2.3.2.jar, the Lietu segmenter jar (seg.jar) and its
 * dictionary directory ({@code dic}) to be available at run time.
 */
public class MyCnAnalyzerTest {

    /**
     * Analyzer that chains {@code CnTokenizer} with {@link MySingleFilter}.
     */
    static class MyCnAnalyzer extends Analyzer {

        /**
         * Creates the analyzer.
         *
         * <p>NOTE(review): this assigns the static field
         * {@code CnTokenizer.makeTag}, so it affects every {@code CnTokenizer}
         * in the JVM, not only the ones created by this analyzer. Presumably
         * the flag turns on tagging of the produced tokens; confirm against
         * the Lietu documentation.
         */
        public MyCnAnalyzer() {
            CnTokenizer.makeTag = true;
        }

        /**
         * Builds the token stream for a field.
         *
         * @param fieldName name of the field being analyzed (not used here)
         * @param reader    source of the text to tokenize
         * @return the segmenter output wrapped in a {@link MySingleFilter}
         */
        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            return new MySingleFilter(new CnTokenizer(reader));
        }
    }

    /**
     * Filter that passes every incoming token through and, for tokens made of
     * more than one character, follows the token with one extra token per
     * character ("unigram" tokens).
     *
     * <p>The per-character tokens carry a position increment of 0. A zero
     * increment places a token on the same position as the token emitted just
     * before it, so the word must be emitted first for its characters to share
     * the word's position; that way a search for the word and a search for one
     * of its characters both hit the same position.
     */
    static class MySingleFilter extends TokenFilter {

        /** Token type assigned to the per-character tokens. */
        private static final String TOKEN_TYPE = "1word";

        /** Text of the word whose characters are still being emitted, or null. */
        private String pendingText = null;

        /** Start offset of the pending word in the original input. */
        private int pendingStart = 0;

        /** Index (in UTF-16 units) of the next character of the pending word. */
        private int offset = 0;

        /**
         * @param in the upstream token stream to expand
         */
        public MySingleFilter(TokenStream in) {
            super(in);
        }

        /**
         * Returns the next token: either a pending per-character token of the
         * previously returned word, or the next token from the wrapped stream.
         *
         * @return the next token, or {@code null} at end of stream
         * @throws IOException if the wrapped stream fails
         */
        @Override
        public Token next() throws IOException {
            if (pendingText != null) {
                if (offset < pendingText.length()) {
                    return nextCharToken();
                }
                // All characters of the word have been emitted.
                pendingText = null;
            }

            Token word = input.next();
            if (word == null) {
                return null;
            }

            String text = word.termText();
            // Count code points, not chars, so that a token holding a single
            // supplementary-plane character is not treated as a multi-char word.
            if (text.codePointCount(0, text.length()) > 1) {
                // Remember text and offset instead of the Token itself, so the
                // expansion does not depend on what the caller does with it.
                pendingText = text;
                pendingStart = word.startOffset();
                offset = 0;
            }
            return word;
        }

        /**
         * Builds the token for the character at {@link #offset} of the pending
         * word and advances {@link #offset} past it.
         *
         * <p>Offsets are computed as the word's start offset plus the index
         * inside the term text, which assumes the term text maps one-to-one
         * onto the original input characters.
         */
        private Token nextCharToken() {
            // Step by code point so surrogate pairs are kept together.
            int width = Character.charCount(pendingText.codePointAt(offset));
            int start = pendingStart + offset;
            Token single = new Token(
                    pendingText.substring(offset, offset + width),
                    start,
                    start + width,
                    TOKEN_TYPE);
            // Same position as the word emitted just before these characters.
            single.setPositionIncrement(0);
            offset += width;
            return single;
        }
    }

    /**
     * Tokenizes a fixed sample sentence and prints, for every token, its text,
     * start offset, end offset, type and position increment.
     *
     * @param args ignored
     * @throws IOException if tokenizing fails
     */
    public static void main(String[] args) throws IOException {
        MyCnAnalyzer cna = new MyCnAnalyzer();
        String input = "由广东省公安厅和广东卫视合办的《南粤警视》栏目,广西南宁,北部湾";
        // Original author's note: calling tokenStream on the analyzer already
        // initializes many related variables and classes in memory.
        TokenStream ts = cna.tokenStream("asd", new StringReader(input));
        try {
            for (Token t = ts.next(); t != null; t = ts.next()) {
                System.out.println(t.termText() + " " + t.startOffset() + " "
                        + t.endOffset() + " " + t.type() + " "
                        + t.getPositionIncrement());
            }
        } finally {
            // Release the tokenizer and the underlying reader.
            ts.close();
        }
    }
}
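// Source article: "搜索引擎之猎兔分词实例" (Search engine: a Lietu word-segmentation example).
// Page footer: "最新推荐文章于 2022-07-11 06:55:56 发布" (latest recommended article published 2022-07-11 06:55:56).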