LUCENE-第二课分词(自己写的分词器)

本文介绍了一个基于Lucene的英语分词器实现,该分词器能够按空白字符进行分词,并通过过滤器将所有词汇转换为小写形式。
package com.scott.analizer;


import java.io.IOException;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;


/**
 * 需求: 
 * 1. Tokenizer: 实现对英文按空白字符进行分词。 需要记录的属性信息有: 词 
 * 2. TokenFilter: 要进行的处理:转为小写
 */
public class EnglishAnalizerMain {
/**
 * Demo entry point: runs a fixed English sentence through {@link EnglishAnalizer}
 * and prints every produced token followed by a {@code |} separator, then a newline.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    String sample = "The surviving remnants of columns and gates in Beijing's Yuanmingyuan - or Old Summer Palace - will be reinforced to prevent them from collapsing1, Thursday's China Daily reported";
    // Both the analyzer and the token stream are closed automatically when the try block exits.
    try (Analyzer analyzer = new EnglishAnalizer();
         TokenStream stream = analyzer.tokenStream("aa", sample)) {
        // The attribute instance is shared with the stream; its content changes on every incrementToken().
        CharAttribute term = stream.getAttribute(CharAttribute.class);
        stream.reset();
        for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
            System.out.print(term.getString() + "|");
        }
        stream.end();
        System.out.println();
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
}

/**
* 空白字符分词器

* @author Scott
*/
public static class EnglishAnalizer extends Analyzer {
// 需要继承 Analyzer, 重写 createComponents 方法
@Override
protected TokenStreamComponents createComponents(String fieldName) {
// 1. 分词
Tokenizer source = new EnglishTokenizer();
// 2. 转小写过滤器
TokenStream filter = new EnglishCaseTokenFilter(source);
/* 
如果有多个 Filter 可以在这里增加
FirstUpTokenFilter filter2 = new FirstUpTokenFilter(filter);
return new TokenStreamComponents(source, filter2);
*/ 
// 3. TokenStreamComponents 提供了实现
return new TokenStreamComponents(source, filter);
}
}


/**
 * Tokenizer that splits the input character stream on whitespace.
 *
 * <p>The only attribute recorded per token is its text, stored in a
 * {@link CharAttribute}. Runs of whitespace (leading, trailing or repeated)
 * never become part of a token. A run of non-whitespace characters longer than
 * the internal buffer is split into several tokens of at most
 * {@code buffer.length} characters each.
 */
public static class EnglishTokenizer extends Tokenizer {
    // Attribute that receives the text of the current token.
    CharAttribute charAttr = this.addAttribute(CharAttribute.class);

    // Scratch buffer for the token being assembled; its size is also the maximum token length.
    char[] buffer = new char[255];
    // Number of characters of the current token collected in buffer so far.
    int length = 0;
    // Last value returned by input.read(); -1 means end of input.
    int c;

    /**
     * Reads characters from the input until one token has been assembled.
     *
     * @return {@code true} if a token was produced and stored in the
     *         {@link CharAttribute}; {@code false} when the input is exhausted
     * @throws IOException if reading from the underlying reader fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        // All attributes must be cleared before a new token is produced.
        clearAttributes();
        length = 0;
        while (true) {
            c = this.input.read();
            if (c == -1) {
                // End of input: whatever has been collected (possibly nothing) is the last token.
                break;
            }
            if (Character.isWhitespace(c)) {
                if (length > 0) {
                    // Whitespace terminates the token in progress.
                    break;
                }
                // No token started yet: skip leading / repeated whitespace instead of storing it.
                continue;
            }
            buffer[length++] = (char) c;
            if (length == buffer.length) {
                // Buffer is full: emit what we have rather than overflow; the rest
                // of the run is returned by the following call(s).
                break;
            }
        }
        if (length == 0) {
            return false;
        }
        // Copy the collected characters into the attribute.
        this.charAttr.setChars(buffer, length);
        return true;
    }
}


/**
 * Token filter that converts the text of every token to lower case.
 */
public static class EnglishCaseTokenFilter extends TokenFilter {
    // Same attribute instance the upstream tokenizer writes the token text into.
    CharAttribute charAttr = this.addAttribute(CharAttribute.class);

    /**
     * @param input the upstream token stream whose tokens are lower-cased
     */
    protected EnglishCaseTokenFilter(TokenStream input) {
        super(input);
    }

    /**
     * Advances the wrapped stream and lower-cases the new token in place.
     *
     * @return {@code true} if the wrapped stream produced a token, {@code false} otherwise
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!this.input.incrementToken()) {
            return false;
        }
        // The characters are modified directly in the array handed out by getChars().
        char[] term = charAttr.getChars();
        int end = charAttr.getLength();
        int pos = 0;
        while (pos < end) {
            term[pos] = Character.toLowerCase(term[pos]);
            pos++;
        }
        return true;
    }
}

/**
 * Attribute holding the text of the current token.
 *
 * <p>When this interface is passed to {@code addAttribute}, the implementation
 * class is looked up by name as {@code CharAttribute} + {@code "Impl"}, so a
 * {@code CharAttributeImpl} class must exist next to this interface.
 */
public interface CharAttribute extends Attribute {

    /** Returns the character array that stores the token text. */
    char[] getChars();

    /** Returns the number of valid characters in the array returned by {@link #getChars()}. */
    int getLength();

    /** Returns the token text as a {@code String}. */
    String getString();

    /**
     * Stores the first {@code length} characters of {@code buffer} as the token text.
     *
     * @param buffer source characters
     * @param length number of characters to take from the start of {@code buffer}
     */
    void setChars(char[] buffer, int length);
}

/**
 * Default implementation of {@link CharAttribute}.
 *
 * <p>{@code addAttribute(CharAttribute.class)} locates this class by appending
 * {@code "Impl"} to the attribute interface's class name and loading the result
 * with the interface's class loader; if no such class exists an
 * {@code IllegalArgumentException} ("Cannot find implementing class for: ...")
 * is thrown. The name of this class must therefore stay
 * {@code CharAttributeImpl}.
 */
public static class CharAttributeImpl extends AttributeImpl implements CharAttribute {
    // Backing storage for the token text; grown on demand by setChars.
    private char[] termBuffer = new char[255];
    // Number of valid characters in termBuffer.
    private int length = 0;

    /**
     * Copies the first {@code length} characters of {@code buffer} into this attribute,
     * enlarging the internal array when it is too small.
     *
     * @param buffer source characters
     * @param length number of characters to copy from the start of {@code buffer}
     */
    @Override
    public void setChars(char[] buffer, int length) {
        if (length > this.termBuffer.length) {
            // Old content is about to be overwritten, so it does not need to be preserved.
            this.termBuffer = new char[Math.max(length, this.termBuffer.length * 2)];
        }
        this.length = length;
        if (length > 0) {
            System.arraycopy(buffer, 0, this.termBuffer, 0, length);
        }
    }

    /**
     * Returns the internal character array itself (not a copy); only the first
     * {@link #getLength()} characters are valid. Callers may modify it in place.
     */
    @Override
    public char[] getChars() {
        return this.termBuffer;
    }

    /** Returns the number of valid characters in {@link #getChars()}. */
    @Override
    public int getLength() {
        return this.length;
    }

    /**
     * Returns the token text, or {@code null} when the attribute is empty.
     */
    @Override
    public String getString() {
        if (this.length > 0) {
            return new String(this.termBuffer, 0, length);
        }
        return null;
    }

    /** Resets the attribute to the empty state; the backing array is kept for reuse. */
    @Override
    public void clear() {
        this.length = 0;
    }

    /** Reports the current token text under the key {@code "term"}. */
    @Override
    public void reflectWith(AttributeReflector reflector) {
        reflector.reflect(CharAttribute.class, "term", getString());
    }

    /**
     * Copies the token text into {@code target}.
     *
     * @param target destination attribute; must implement {@link CharAttribute}
     */
    @Override
    public void copyTo(AttributeImpl target) {
        ((CharAttribute) target).setChars(this.termBuffer, this.length);
    }

    /**
     * Returns a copy with its own backing array, so later changes to this
     * attribute do not leak into the clone.
     */
    @Override
    public CharAttributeImpl clone() {
        CharAttributeImpl copy = (CharAttributeImpl) super.clone();
        copy.termBuffer = this.termBuffer.clone();
        return copy;
    }
}
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值