package com.scott.analizer;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/**
* 需求:
* 1. Tokenizer: 实现对英文按空白字符进行分词。 需要记录的属性信息有: 词
* 2. TokenFilter: 要进行的处理:转为小写
*/
public class EnglishAnalizerMain {
/**
 * Demo entry point: runs a sample English sentence through
 * {@link EnglishAnalizer} and prints every produced token followed by
 * {@code |} on a single line.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String text = "The surviving remnants of columns and gates in Beijing's Yuanmingyuan - or Old Summer Palace - will be reinforced to prevent them from collapsing1, Thursday's China Daily reported";
    // Both the analyzer and the stream are closed automatically, in reverse order.
    try (Analyzer analyzer = new EnglishAnalizer();
            TokenStream stream = analyzer.tokenStream("aa", text)) {
        // Fetch the attribute once; the stream updates it on every incrementToken().
        final CharAttribute term = stream.getAttribute(CharAttribute.class);
        stream.reset();
        for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
            System.out.print(term.getString() + "|");
        }
        stream.end();
        System.out.println();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Analyzer that splits text on whitespace and lower-cases each token.
 *
 * @author Scott
 */
public static class EnglishAnalizer extends Analyzer {

    /**
     * Builds the analysis chain: {@link EnglishTokenizer} feeding
     * {@link EnglishCaseTokenFilter}.
     *
     * <p>Further filters can be chained by wrapping the previous stream and
     * passing the outermost one as the second constructor argument of
     * {@code TokenStreamComponents}.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @return the tokenizer together with the final stream of the chain
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer tokenizer = new EnglishTokenizer();
        final TokenStream lowerCased = new EnglishCaseTokenFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, lowerCased);
    }
}
/**
 * Tokenizer that splits the input on whitespace and records each word in a
 * {@link CharAttribute}.
 *
 * <p>Runs of whitespace (leading, trailing or repeated) are skipped and never
 * become part of a token. A word longer than {@link #MAX_TOKEN_LENGTH}
 * characters is emitted as several consecutive tokens instead of overflowing
 * the internal buffer.
 */
public static class EnglishTokenizer extends Tokenizer {

    /** Maximum number of characters in a single emitted token. */
    private static final int MAX_TOKEN_LENGTH = 255;

    /** The attribute that receives the text of the current token. */
    private final CharAttribute charAttr = this.addAttribute(CharAttribute.class);

    /** Scratch space in which the current token is accumulated. */
    private final char[] buffer = new char[MAX_TOKEN_LENGTH];

    /**
     * Reads characters one at a time until a complete token is available.
     *
     * <p>Declared {@code final} because Lucene requires every concrete
     * TokenStream to be final or to have a final {@code incrementToken()};
     * this is checked when assertions are enabled.
     *
     * @return {@code true} if a token was stored in the attribute,
     *         {@code false} once the input is exhausted
     * @throws IOException if reading from the underlying reader fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        // Attributes must be cleared before a new token is produced.
        clearAttributes();
        int length = 0;
        while (true) {
            final int c = this.input.read();
            if (c == -1) {
                // End of input: whatever has been collected is the last token.
                break;
            }
            if (Character.isWhitespace(c)) {
                if (length > 0) {
                    // Whitespace terminates the token collected so far.
                    break;
                }
                // Nothing collected yet: skip the separator instead of storing it.
                continue;
            }
            buffer[length++] = (char) c;
            if (length == buffer.length) {
                // Buffer is full: emit now, the rest of the word becomes the next token.
                break;
            }
        }
        if (length == 0) {
            return false;
        }
        this.charAttr.setChars(buffer, length);
        return true;
    }
}
/**
 * Token filter that converts every token of the wrapped stream to lower case.
 *
 * <p>The conversion is done in place on the character array exposed by
 * {@link CharAttribute#getChars()}, one {@code char} at a time.
 */
public static class EnglishCaseTokenFilter extends TokenFilter {

    /** Shared with the upstream tokenizer through the common attribute source. */
    private final CharAttribute charAttr = this.addAttribute(CharAttribute.class);

    /**
     * Creates a filter on top of the given stream.
     *
     * @param input the stream whose tokens are lower-cased
     */
    protected EnglishCaseTokenFilter(TokenStream input) {
        super(input);
    }

    /**
     * Advances the wrapped stream and lower-cases the token it produced.
     *
     * <p>Declared {@code final} because Lucene requires every concrete
     * TokenStream to be final or to have a final {@code incrementToken()};
     * this is checked when assertions are enabled.
     *
     * @return {@code true} if a token is available, {@code false} at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!this.input.incrementToken()) {
            return false;
        }
        final char[] chars = charAttr.getChars();
        final int length = charAttr.getLength();
        // Only the first `length` entries belong to the current token.
        for (int i = 0; i < length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return true;
    }
}
/**
* Attribute holding the characters of the current token.
*
* Lucene looks up the implementation class named CharAttributeImpl
* (the interface name plus "Impl") and instantiates it, so that class must exist.
*/
public interface CharAttribute extends Attribute {
/**
* Stores the first {@code length} characters of {@code buffer} as the current token.
*
* @param buffer source characters
* @param length number of characters to take from the start of {@code buffer}
*/
void setChars(char[] buffer, int length);
/**
* Returns the character array holding the token. The implementation in this
* file returns its internal array, so changes made by the caller are visible
* in the attribute; only the first {@link #getLength()} entries are meaningful.
*
* @return the backing character array
*/
char[] getChars();
/**
* @return the number of valid characters in {@link #getChars()}
*/
int getLength();
/**
* @return the current token as a string; the implementation in this file
* returns {@code null} when the length is zero
*/
String getString();
}
/**
 * Implementation of {@link CharAttribute}.
 *
 * <p>Calling {@code addAttribute(CharAttribute.class)} makes Lucene create an
 * instance of this class: the implementation is located by name, as the
 * attribute interface name plus {@code "Impl"}. The Lucene lookup quoted by
 * the original author is essentially
 * {@code Class.forName(attClass.getName() + "Impl", true, attClass.getClassLoader())},
 * and an {@code IllegalArgumentException} is thrown when no such class exists.
 */
public static class CharAttributeImpl extends AttributeImpl implements CharAttribute {

    /** Backing storage; only the first {@link #length} entries are valid. */
    private char[] termBuffer = new char[255];

    /** Number of valid characters in {@link #termBuffer}. */
    private int length = 0;

    /**
     * Copies the first {@code length} characters of {@code buffer} into this
     * attribute, enlarging the backing array when it is too small.
     *
     * <p>When the array is enlarged, a reference previously obtained from
     * {@link #getChars()} no longer reflects this attribute.
     *
     * @param buffer source characters
     * @param length number of characters to copy from the start of {@code buffer}
     */
    @Override
    public void setChars(char[] buffer, int length) {
        if (length > this.termBuffer.length) {
            // Old content is about to be overwritten, so it need not be preserved.
            this.termBuffer = new char[length];
        }
        this.length = length;
        if (length > 0) {
            System.arraycopy(buffer, 0, this.termBuffer, 0, length);
        }
    }

    /**
     * @return the internal array itself (not a copy), so callers may modify
     *         the token in place
     */
    @Override
    public char[] getChars() {
        return this.termBuffer;
    }

    /** @return the number of valid characters in {@link #getChars()} */
    @Override
    public int getLength() {
        return this.length;
    }

    /**
     * @return the token text, or {@code null} when the attribute is empty
     */
    @Override
    public String getString() {
        if (this.length > 0) {
            return new String(this.termBuffer, 0, this.length);
        }
        return null;
    }

    /** Resets the attribute to the empty state; the array is kept for reuse. */
    @Override
    public void clear() {
        this.length = 0;
    }

    /**
     * Reports the current token text to the reflector under the key
     * {@code "chars"}.
     *
     * @param reflector receiver of the attribute's key/value pairs
     */
    @Override
    public void reflectWith(AttributeReflector reflector) {
        reflector.reflect(CharAttribute.class, "chars", getString());
    }

    /**
     * Copies the current token into {@code target}.
     *
     * @param target the attribute to overwrite; must implement {@link CharAttribute}
     */
    @Override
    public void copyTo(AttributeImpl target) {
        ((CharAttribute) target).setChars(this.termBuffer, this.length);
    }

    /**
     * Returns a copy with its own backing array, so that later changes to
     * this attribute do not affect the copy.
     *
     * @return an independent copy of this attribute
     */
    @Override
    public CharAttributeImpl clone() {
        final CharAttributeImpl copy = (CharAttributeImpl) super.clone();
        // The inherited clone is shallow and would share the array.
        copy.termBuffer = this.termBuffer.clone();
        return copy;
    }
}
}
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/**
* 需求:
* 1. Tokenizer: 实现对英文按空白字符进行分词。 需要记录的属性信息有: 词
* 2. TokenFilter: 要进行的处理:转为小写
*/
public class EnglishAnalizerMain {
/**
 * Demo entry point: runs a sample English sentence through
 * {@link EnglishAnalizer} and prints every produced token followed by
 * {@code |} on a single line.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String text = "The surviving remnants of columns and gates in Beijing's Yuanmingyuan - or Old Summer Palace - will be reinforced to prevent them from collapsing1, Thursday's China Daily reported";
    // Both the analyzer and the stream are closed automatically, in reverse order.
    try (Analyzer analyzer = new EnglishAnalizer();
            TokenStream stream = analyzer.tokenStream("aa", text)) {
        // Fetch the attribute once; the stream updates it on every incrementToken().
        final CharAttribute term = stream.getAttribute(CharAttribute.class);
        stream.reset();
        for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
            System.out.print(term.getString() + "|");
        }
        stream.end();
        System.out.println();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Analyzer that splits text on whitespace and lower-cases each token.
 *
 * @author Scott
 */
public static class EnglishAnalizer extends Analyzer {

    /**
     * Builds the analysis chain: {@link EnglishTokenizer} feeding
     * {@link EnglishCaseTokenFilter}.
     *
     * <p>Further filters can be chained by wrapping the previous stream and
     * passing the outermost one as the second constructor argument of
     * {@code TokenStreamComponents}.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @return the tokenizer together with the final stream of the chain
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer tokenizer = new EnglishTokenizer();
        final TokenStream lowerCased = new EnglishCaseTokenFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, lowerCased);
    }
}
/**
 * Tokenizer that splits the input on whitespace and records each word in a
 * {@link CharAttribute}.
 *
 * <p>Runs of whitespace (leading, trailing or repeated) are skipped and never
 * become part of a token. A word longer than {@link #MAX_TOKEN_LENGTH}
 * characters is emitted as several consecutive tokens instead of overflowing
 * the internal buffer.
 */
public static class EnglishTokenizer extends Tokenizer {

    /** Maximum number of characters in a single emitted token. */
    private static final int MAX_TOKEN_LENGTH = 255;

    /** The attribute that receives the text of the current token. */
    private final CharAttribute charAttr = this.addAttribute(CharAttribute.class);

    /** Scratch space in which the current token is accumulated. */
    private final char[] buffer = new char[MAX_TOKEN_LENGTH];

    /**
     * Reads characters one at a time until a complete token is available.
     *
     * <p>Declared {@code final} because Lucene requires every concrete
     * TokenStream to be final or to have a final {@code incrementToken()};
     * this is checked when assertions are enabled.
     *
     * @return {@code true} if a token was stored in the attribute,
     *         {@code false} once the input is exhausted
     * @throws IOException if reading from the underlying reader fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        // Attributes must be cleared before a new token is produced.
        clearAttributes();
        int length = 0;
        while (true) {
            final int c = this.input.read();
            if (c == -1) {
                // End of input: whatever has been collected is the last token.
                break;
            }
            if (Character.isWhitespace(c)) {
                if (length > 0) {
                    // Whitespace terminates the token collected so far.
                    break;
                }
                // Nothing collected yet: skip the separator instead of storing it.
                continue;
            }
            buffer[length++] = (char) c;
            if (length == buffer.length) {
                // Buffer is full: emit now, the rest of the word becomes the next token.
                break;
            }
        }
        if (length == 0) {
            return false;
        }
        this.charAttr.setChars(buffer, length);
        return true;
    }
}
/**
 * Token filter that converts every token of the wrapped stream to lower case.
 *
 * <p>The conversion is done in place on the character array exposed by
 * {@link CharAttribute#getChars()}, one {@code char} at a time.
 */
public static class EnglishCaseTokenFilter extends TokenFilter {

    /** Shared with the upstream tokenizer through the common attribute source. */
    private final CharAttribute charAttr = this.addAttribute(CharAttribute.class);

    /**
     * Creates a filter on top of the given stream.
     *
     * @param input the stream whose tokens are lower-cased
     */
    protected EnglishCaseTokenFilter(TokenStream input) {
        super(input);
    }

    /**
     * Advances the wrapped stream and lower-cases the token it produced.
     *
     * <p>Declared {@code final} because Lucene requires every concrete
     * TokenStream to be final or to have a final {@code incrementToken()};
     * this is checked when assertions are enabled.
     *
     * @return {@code true} if a token is available, {@code false} at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!this.input.incrementToken()) {
            return false;
        }
        final char[] chars = charAttr.getChars();
        final int length = charAttr.getLength();
        // Only the first `length` entries belong to the current token.
        for (int i = 0; i < length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return true;
    }
}
/**
* Attribute holding the characters of the current token.
*
* Lucene looks up the implementation class named CharAttributeImpl
* (the interface name plus "Impl") and instantiates it, so that class must exist.
*/
public interface CharAttribute extends Attribute {
/**
* Stores the first {@code length} characters of {@code buffer} as the current token.
*
* @param buffer source characters
* @param length number of characters to take from the start of {@code buffer}
*/
void setChars(char[] buffer, int length);
/**
* Returns the character array holding the token. The implementation in this
* file returns its internal array, so changes made by the caller are visible
* in the attribute; only the first {@link #getLength()} entries are meaningful.
*
* @return the backing character array
*/
char[] getChars();
/**
* @return the number of valid characters in {@link #getChars()}
*/
int getLength();
/**
* @return the current token as a string; the implementation in this file
* returns {@code null} when the length is zero
*/
String getString();
}
/**
 * Implementation of {@link CharAttribute}.
 *
 * <p>Calling {@code addAttribute(CharAttribute.class)} makes Lucene create an
 * instance of this class: the implementation is located by name, as the
 * attribute interface name plus {@code "Impl"}. The Lucene lookup quoted by
 * the original author is essentially
 * {@code Class.forName(attClass.getName() + "Impl", true, attClass.getClassLoader())},
 * and an {@code IllegalArgumentException} is thrown when no such class exists.
 */
public static class CharAttributeImpl extends AttributeImpl implements CharAttribute {

    /** Backing storage; only the first {@link #length} entries are valid. */
    private char[] termBuffer = new char[255];

    /** Number of valid characters in {@link #termBuffer}. */
    private int length = 0;

    /**
     * Copies the first {@code length} characters of {@code buffer} into this
     * attribute, enlarging the backing array when it is too small.
     *
     * <p>When the array is enlarged, a reference previously obtained from
     * {@link #getChars()} no longer reflects this attribute.
     *
     * @param buffer source characters
     * @param length number of characters to copy from the start of {@code buffer}
     */
    @Override
    public void setChars(char[] buffer, int length) {
        if (length > this.termBuffer.length) {
            // Old content is about to be overwritten, so it need not be preserved.
            this.termBuffer = new char[length];
        }
        this.length = length;
        if (length > 0) {
            System.arraycopy(buffer, 0, this.termBuffer, 0, length);
        }
    }

    /**
     * @return the internal array itself (not a copy), so callers may modify
     *         the token in place
     */
    @Override
    public char[] getChars() {
        return this.termBuffer;
    }

    /** @return the number of valid characters in {@link #getChars()} */
    @Override
    public int getLength() {
        return this.length;
    }

    /**
     * @return the token text, or {@code null} when the attribute is empty
     */
    @Override
    public String getString() {
        if (this.length > 0) {
            return new String(this.termBuffer, 0, this.length);
        }
        return null;
    }

    /** Resets the attribute to the empty state; the array is kept for reuse. */
    @Override
    public void clear() {
        this.length = 0;
    }

    /**
     * Reports the current token text to the reflector under the key
     * {@code "chars"}.
     *
     * @param reflector receiver of the attribute's key/value pairs
     */
    @Override
    public void reflectWith(AttributeReflector reflector) {
        reflector.reflect(CharAttribute.class, "chars", getString());
    }

    /**
     * Copies the current token into {@code target}.
     *
     * @param target the attribute to overwrite; must implement {@link CharAttribute}
     */
    @Override
    public void copyTo(AttributeImpl target) {
        ((CharAttribute) target).setChars(this.termBuffer, this.length);
    }

    /**
     * Returns a copy with its own backing array, so that later changes to
     * this attribute do not affect the copy.
     *
     * @return an independent copy of this attribute
     */
    @Override
    public CharAttributeImpl clone() {
        final CharAttributeImpl copy = (CharAttributeImpl) super.clone();
        // The inherited clone is shallow and would share the array.
        copy.termBuffer = this.termBuffer.clone();
        return copy;
    }
}
}
本文介绍了一个基于Lucene的英语分词器实现,该分词器能够按空白字符进行分词,并通过过滤器将所有词汇转换为小写形式。
573

被折叠的 条评论
为什么被折叠?



