1. Class relationships
AttributeSource → TokenStream → Tokenizer
                      ↓
                 TokenFilter
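In code, this hierarchy corresponds to the class declarations (quoted from the Lucene 4.x sources; bodies elided):
public abstract class TokenStream extends AttributeSource implements Closeable { ... }
public abstract class Tokenizer extends TokenStream { ... }
public abstract class TokenFilter extends TokenStream { ... }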
=============================================
The key members of Analyzer are one field and one abstract method:
// the field
private final ReuseStrategy reuseStrategy;
========================================
// TokenStreamComponents
// holds the Tokenizer and the resulting TokenStream,
// and its Reader can also be replaced via setReader()
protected abstract TokenStreamComponents createComponents(String fieldName, Reader reader);
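For illustration, a minimal Analyzer might wire a WhitespaceTokenizer into TokenStreamComponents like this (a sketch against the Lucene 4.3 API; the anonymous class and variable names are arbitrary):
Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_43, reader);
        // a filter chain could wrap 'source' here, e.g. a LowerCaseFilter
        return new TokenStreamComponents(source);
    }
};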
========================================
// obtaining a TokenStream
public final TokenStream tokenStream(final String fieldName,
    final Reader reader) throws IOException {
  // What is the inner class ReuseStrategy for? It caches components in a thread-local:
  //   private CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<Object>();
  // The inner class GlobalReuseStrategy stores a single TokenStreamComponents,
  // while PerFieldReuseStrategy stores a Map<String, TokenStreamComponents> (one entry per field).
  TokenStreamComponents components = reuseStrategy.getReusableComponents(fieldName);
  final Reader r = initReader(fieldName, reader);
  if (components == null) {
    components = createComponents(fieldName, r);
    reuseStrategy.setReusableComponents(fieldName, components);
  } else {
    components.setReader(r);
  }
  return components.getTokenStream();
}
Example of tokenizer output:
Analyzer a = new WhitespaceAnalyzer(Version.LUCENE_43);
TokenStream tokenStream = a.tokenStream("CESHI", new StringReader("I LOVE YOU!"));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset(); // skipping reset() causes java.lang.ArrayIndexOutOfBoundsException
while (tokenStream.incrementToken()) {
    System.out.print("[" + termAttribute.toString() + "]");
}
// prints: [I][LOVE][YOU!]
2. Some of TokenStream's methods and fields
// drives parsing of the Reader, emitting one token per call
public abstract boolean incrementToken() throws IOException;
public void reset() throws IOException {}
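Putting these together, the standard consumption loop looks like the following (the same pattern as the example above, plus the end()/close() calls a well-behaved consumer should make; 'analyzer' and the field name are placeholders):
TokenStream ts = analyzer.tokenStream("body", new StringReader("some text"));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset(); // must precede the first incrementToken()
while (ts.incrementToken()) {
    System.out.println(term); // CharTermAttribute.toString() yields the term text
}
ts.end(); // records the final offset
ts.close(); // releases the underlying Reader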
3. Tokenizer's fields and methods
// a Tokenizer must always be constructed with a Reader
protected Reader input;
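A do-nothing subclass makes the constructor contract visible (NoOpTokenizer is a hypothetical name, purely a sketch):
final class NoOpTokenizer extends Tokenizer {
    NoOpTokenizer(Reader input) {
        super(input); // Tokenizer stores the Reader in its protected 'input' field
    }
    @Override
    public boolean incrementToken() {
        return false; // emits no tokens; real subclasses read from 'input'
    }
}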
4. CharTokenizer
public abstract class CharTokenizer extends Tokenizer {
// the Reader is passed up to the Tokenizer constructor
public CharTokenizer(Version matchVersion, Reader input) {
super(input);
charUtils = CharacterUtils.getInstance(matchVersion);
}
public CharTokenizer(Version matchVersion, AttributeFactory factory,
Reader input) {
super(factory, input);
charUtils = CharacterUtils.getInstance(matchVersion);
}
// note: bufferIndex is -1 here to best-effort AIOOBE consumers that don't call reset()
// i.e. reset() must be called before use, which sets bufferIndex = 0;
// otherwise the first 'if (bufferIndex >= dataLen)' check (-1 >= 0) is false,
// the Reader is never filled into the buffer, and codePointAt(..., -1) throws the AIOOBE
private int offset = 0, bufferIndex = -1, dataLen = 0, finalOffset = 0;
private static final int MAX_WORD_LEN = 255; // maximum allowed token length
private static final int IO_BUFFER_SIZE = 4096; // maximum number of chars read from the Reader at a time
// register the attributes this tokenizer fills in
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
// CharacterUtils provides codePointAt() and fill();
// like java.lang.Character, it offers methods for determining a character's
// category (lowercase letter, digit, etc.) and for converting between upper and lower case
private final CharacterUtils charUtils;
// CharacterBuffer's fields: char[] buffer; int offset; int length;
private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
// decides whether the code point c belongs to a token (implemented by subclasses)
protected abstract boolean isTokenChar(int c);
// normalization hook; the default is a no-op, and lowercasing tokenizers override it to convert each code point to lower case
protected int normalize(int c) {
return c;
}
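// Concretely, the shipped subclasses implement these two hooks as follows
// (quoted from WhitespaceTokenizer, LetterTokenizer and LowerCaseTokenizer):
//   WhitespaceTokenizer:
//     protected boolean isTokenChar(int c) { return !Character.isWhitespace(c); }
//   LetterTokenizer:
//     protected boolean isTokenChar(int c) { return Character.isLetter(c); }
//   LowerCaseTokenizer (extends LetterTokenizer):
//     protected int normalize(int c) { return Character.toLowerCase(c); }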
@Override
public final boolean incrementToken() throws IOException {
// handled by AttributeSource: clears the state of every attribute before producing the next token
clearAttributes();
int length = 0;
int start = -1; // this variable is always initialized
int end = -1;
char[] buffer = termAtt.buffer();
// main loop: 'offset' tracks the absolute position in the input;
// termAtt gets its chars from Character.toChars(normalize(c), buffer, length) below,
// which is also where lowercasing (via normalize()) happens
while (true) {
// refill ioBuffer from the Tokenizer's Reader once it has been consumed
if (bufferIndex >= dataLen) {
offset += dataLen;
// 'input' is the Reader this Tokenizer was constructed with,
// wired up by the TokenStreamComponents that the Analyzer's createComponents() returns
// fill ioBuffer from the input stream
if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0) {
break;
} else {
finalOffset = correctOffset(offset);
return false;
}
}
// on a successful fill, dataLen holds the number of chars read
dataLen = ioBuffer.getLength();
bufferIndex = 0;
}
// after a successful fill, read the Unicode code point at the current buffer index
final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
// the number of char values needed to represent this code point (2 for a supplementary character)
final int charCount = Character.charCount(c);
bufferIndex += charCount;
// WhitespaceTokenizer, for example, treats any non-whitespace char as a token char;
// hitting a non-token char while length > 0 also ends the loop (the 'else if' below)
if (isTokenChar(c)) { // if it's a token char
// length == 0 means this is the first char of a new token: record its start
if (length == 0) { // start of token
assert start == -1;
start = offset + bufferIndex - charCount;
end = start;
} else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
}
end += charCount;
length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
break;
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
}
termAtt.setLength(length);
assert start != -1;
offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
return true;
}
@Override
public final void end() {
offsetAtt.setOffset(finalOffset, finalOffset);
}
// reset the state so that a fresh Reader can be consumed
@Override
public void reset() throws IOException {
bufferIndex = 0;
offset = 0;
dataLen = 0;
finalOffset = 0;
ioBuffer.reset(); // make sure to reset the IO buffer!!
}
}
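To write your own CharTokenizer subclass, only isTokenChar() must be implemented. Here is a hypothetical CommaTokenizer that splits on commas (the class name and behavior are invented for illustration):
import java.io.Reader;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;

final class CommaTokenizer extends CharTokenizer {
    CommaTokenizer(Version matchVersion, Reader in) {
        super(matchVersion, in);
    }
    @Override
    protected boolean isTokenChar(int c) {
        return c != ','; // every non-comma code point is part of a token
    }
}
// e.g. "a,b c" is tokenized into [a][b c]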