lucene 入门小结二 -- 索引的建立(1)

1 lucene 的 document 是一种逻辑文件。lucene 本身无法对物理文件建立索引,只能识别并处理 document 类型的文件;document 只负责收集数据源。


2 document 可以增加多种 field ,field 可以理解为字段,field 属性有以下几种:
2.1 是否存储:指数据是否要完整地存储在索引中。比较适合文本较为简短的数据源;如果把大段文本存储在索引中,会使索引容量过大。
2.2 是否索引:这个很简单,意思是,数据是否要在用户检索时被检索。
2.3 是否分词:要不要对文本进行规则性的切分,以便进行倒排。

3 看看document 中的内部实现:


package org.apache.lucene.document;

import java.util.*; // for javadoc
import org.apache.lucene.search.ScoreDoc; // for javadoc
import org.apache.lucene.search.Searcher; // for javadoc
import org.apache.lucene.index.IndexReader; // for javadoc

/**
 * An ordered list of {@link Fieldable} entries together with a boost factor.
 *
 * <p>Several entries may share the same name. The single-result lookups return the first match
 * in insertion order; the array-returning lookups return every match.
 */
public final class Document implements java.io.Serializable {
  // Shared empty results, handed out whenever a name lookup matches nothing.
  private static final Field[] NO_FIELDS = new Field[0];
  private static final Fieldable[] NO_FIELDABLES = new Fieldable[0];
  private static final String[] NO_STRINGS = new String[0];
  private static final byte[][] NO_BYTES = new byte[0][];

  /** Backing list, kept in insertion order; {@link #getFields()} returns it directly. */
  List<Fieldable> fields = new ArrayList<Fieldable>();
  private float boost = 1.0f;

  /** Creates a document that holds no fields and has a boost of 1.0. */
  public Document() {
  }

  /** Replaces the boost factor of this document. */
  public void setBoost(float boost) {
    this.boost = boost;
  }

  /** Returns the boost factor last set, or 1.0 if none was set. */
  public float getBoost() {
    return boost;
  }

  /** Appends {@code field} to the end of this document's field list. */
  public final void add(Fieldable field) {
    fields.add(field);
  }

  /** Removes the first field called {@code name}; does nothing when there is no such field. */
  public final void removeField(String name) {
    for (Iterator<Fieldable> cursor = fields.iterator(); cursor.hasNext();) {
      if (cursor.next().name().equals(name)) {
        cursor.remove();
        return; // only the first match is removed
      }
    }
  }

  /** Removes every field called {@code name}. */
  public final void removeFields(String name) {
    for (Iterator<Fieldable> cursor = fields.iterator(); cursor.hasNext();) {
      if (cursor.next().name().equals(name)) {
        cursor.remove();
      }
    }
  }

  /**
   * Returns the first field called {@code name} cast to {@link Field}, or {@code null} if there
   * is none. The cast fails with a {@link ClassCastException} when that entry is not a
   * {@link Field}.
   */
  public final Field getField(String name) {
    Fieldable match = getFieldable(name);
    return (Field) match;
  }

  /** Returns the first entry called {@code name}, or {@code null} if there is none. */
  public Fieldable getFieldable(String name) {
    for (Fieldable candidate : fields) {
      if (candidate.name().equals(name)) {
        return candidate;
      }
    }
    return null;
  }

  /**
   * Returns the string value of the first non-binary field called {@code name}, or {@code null}
   * if no such field exists.
   */
  public final String get(String name) {
    for (Fieldable candidate : fields) {
      if (!candidate.name().equals(name)) {
        continue;
      }
      if (candidate.isBinary()) {
        continue;
      }
      return candidate.stringValue();
    }
    return null;
  }

  /** Returns the list that backs this document (not a copy). */
  public final List<Fieldable> getFields() {
    return fields;
  }

  /**
   * Returns every entry called {@code name}, each cast to {@link Field}, in insertion order.
   * An empty array is returned when nothing matches.
   */
  public final Field[] getFields(String name) {
    List<Field> matches = new ArrayList<Field>();
    for (Fieldable candidate : fields) {
      if (candidate.name().equals(name)) {
        matches.add((Field) candidate);
      }
    }
    return matches.isEmpty() ? NO_FIELDS : matches.toArray(new Field[matches.size()]);
  }

  /**
   * Returns every entry called {@code name} in insertion order. An empty array is returned when
   * nothing matches.
   */
  public Fieldable[] getFieldables(String name) {
    List<Fieldable> matches = new ArrayList<Fieldable>();
    for (Fieldable candidate : fields) {
      if (candidate.name().equals(name)) {
        matches.add(candidate);
      }
    }
    return matches.isEmpty() ? NO_FIELDABLES : matches.toArray(new Fieldable[matches.size()]);
  }

  /**
   * Returns the string values of all non-binary fields called {@code name}, in insertion order.
   * An empty array is returned when nothing matches.
   */
  public final String[] getValues(String name) {
    List<String> values = new ArrayList<String>();
    for (Fieldable candidate : fields) {
      if (!candidate.name().equals(name)) {
        continue;
      }
      if (candidate.isBinary()) {
        continue;
      }
      values.add(candidate.stringValue());
    }
    return values.isEmpty() ? NO_STRINGS : values.toArray(new String[values.size()]);
  }

  /**
   * Returns the binary values of all binary fields called {@code name}, in insertion order.
   * An empty array is returned when nothing matches.
   */
  public final byte[][] getBinaryValues(String name) {
    List<byte[]> values = new ArrayList<byte[]>();
    for (Fieldable candidate : fields) {
      if (!candidate.name().equals(name)) {
        continue;
      }
      if (!candidate.isBinary()) {
        continue;
      }
      values.add(candidate.getBinaryValue());
    }
    return values.isEmpty() ? NO_BYTES : values.toArray(new byte[values.size()][]);
  }

  /**
   * Returns the binary value of the first binary field called {@code name}, or {@code null} if
   * no such field exists.
   */
  public final byte[] getBinaryValue(String name) {
    for (Fieldable candidate : fields) {
      if (!candidate.name().equals(name)) {
        continue;
      }
      if (!candidate.isBinary()) {
        continue;
      }
      return candidate.getBinaryValue();
    }
    return null;
  }

  /** Renders the document as {@code Document<f1 f2 ...>}, one space between fields. */
  @Override
  public final String toString() {
    StringBuilder text = new StringBuilder("Document<");
    for (int index = 0; index < fields.size(); index++) {
      if (index > 0) {
        text.append(" ");
      }
      text.append(fields.get(index).toString());
    }
    return text.append(">").toString();
  }
}




主要是对 field 信息进行记录和管理,以便操作所有的 field 信息。主要包括:增加、删除、查找等。

field 的内部实现:

package org.apache.lucene.document;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexWriter; // for javadoc
import org.apache.lucene.util.StringHelper;

import java.io.Reader;
import java.io.Serializable;

public final class Field extends AbstractField implements Fieldable, Serializable {

/** Specifies whether and how a field should be stored. */
public static enum Store {

YES {
@Override
public boolean isStored() {
return true;
}
},

/** Do not store the field value in the index. */
NO {
@Override
public boolean isStored() {
return false;
}
};

public abstract boolean isStored();
}

/** Specifies whether and how a field should be indexed. */
public static enum Index {

NO {
@Override
public boolean isIndexed() {
return false;
}

@Override
public boolean isAnalyzed() {
return false;
}

@Override
public boolean omitNorms() {
return true;
}
},

ANALYZED {
@Override
public boolean isIndexed() {
return true;
}

@Override
public boolean isAnalyzed() {
return true;
}

@Override
public boolean omitNorms() {
return false;
}
},

NOT_ANALYZED {
@Override
public boolean isIndexed() {
return true;
}

@Override
public boolean isAnalyzed() {
return false;
}

@Override
public boolean omitNorms() {
return false;
}
},

NOT_ANALYZED_NO_NORMS {
@Override
public boolean isIndexed() {
return true;
}

@Override
public boolean isAnalyzed() {
return false;
}

@Override
public boolean omitNorms() {
return true;
}
},

ANALYZED_NO_NORMS {
@Override
public boolean isIndexed() {
return true;
}

@Override
public boolean isAnalyzed() {
return true;
}

@Override
public boolean omitNorms() {
return true;
}
};

/** Get the best representation of the index given the flags. */
public static Index toIndex(boolean indexed, boolean analyzed) {
return toIndex(indexed, analyzed, false);
}

/** Expert: Get the best representation of the index given the flags. */
public static Index toIndex(boolean indexed, boolean analyzed, boolean omitNorms) {

// If it is not indexed nothing else matters
if (!indexed) {
return Index.NO;
}

// typical, non-expert
if (!omitNorms) {
if (analyzed) {
return Index.ANALYZED;
}
return Index.NOT_ANALYZED;
}

// Expert: Norms omitted
if (analyzed) {
return Index.ANALYZED_NO_NORMS;
}
return Index.NOT_ANALYZED_NO_NORMS;
}

public abstract boolean isIndexed();

public abstract boolean isAnalyzed();

public abstract boolean omitNorms();
}

/** Specifies whether and how a field should have term vectors. */
public static enum TermVector {

/**
* Do not store term vectors.
*/
NO {
@Override
public boolean isStored() {
return false;
}

@Override
public boolean withPositions() {
return false;
}

@Override
public boolean withOffsets() {
return false;
}
},

YES {
@Override
public boolean isStored() {
return true;
}

@Override
public boolean withPositions() {
return false;
}

@Override
public boolean withOffsets() {
return false;
}
},

WITH_POSITIONS {
@Override
public boolean isStored() {
return true;
}

@Override
public boolean withPositions() {
return true;
}

@Override
public boolean withOffsets() {
return false;
}
},

WITH_OFFSETS {
@Override
public boolean isStored() {
return true;
}

@Override
public boolean withPositions() {
return false;
}

@Override
public boolean withOffsets() {
return true;
}
},

WITH_POSITIONS_OFFSETS {
@Override
public boolean isStored() {
return true;
}

@Override
public boolean withPositions() {
return true;
}

@Override
public boolean withOffsets() {
return true;
}
};

/** Get the best representation of a TermVector given the flags. */
public static TermVector toTermVector(boolean stored, boolean withOffsets, boolean withPositions) {

// If it is not stored, nothing else matters.
if (!stored) {
return TermVector.NO;
}

if (withOffsets) {
if (withPositions) {
return Field.TermVector.WITH_POSITIONS_OFFSETS;
}
return Field.TermVector.WITH_OFFSETS;
}

if (withPositions) {
return Field.TermVector.WITH_POSITIONS;
}
return Field.TermVector.YES;
}

public abstract boolean isStored();

public abstract boolean withPositions();

public abstract boolean withOffsets();
}

public String stringValue() {
return fieldsData instanceof String ? (String) fieldsData : null;
}

public Reader readerValue() {
return fieldsData instanceof Reader ? (Reader) fieldsData : null;
}

public TokenStream tokenStreamValue() {
return tokenStream;
}

public void setValue(String value) {
if (isBinary) {
throw new IllegalArgumentException("cannot set a String value on a binary field");
}
fieldsData = value;
}

/**
* Expert: change the value of this field. See <a
* href="#setValue(java.lang.String)">setValue(String)</a>.
*/
public void setValue(Reader value) {
if (isBinary) {
throw new IllegalArgumentException("cannot set a Reader value on a binary field");
}
if (isStored) {
throw new IllegalArgumentException("cannot set a Reader value on a stored field");
}
fieldsData = value;
}

/**
* Expert: change the value of this field. See <a
* href="#setValue(java.lang.String)">setValue(String)</a>.
*/
public void setValue(byte[] value) {
if (!isBinary) {
throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
}
fieldsData = value;
binaryLength = value.length;
binaryOffset = 0;
}

/**
* Expert: change the value of this field. See <a
* href="#setValue(java.lang.String)">setValue(String)</a>.
*/
public void setValue(byte[] value, int offset, int length) {
if (!isBinary) {
throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
}
fieldsData = value;
binaryLength = length;
binaryOffset = offset;
}

/**
* Expert: sets the token stream to be used for indexing and causes
* isIndexed() and isTokenized() to return true. May be combined with stored
* values from stringValue() or getBinaryValue()
*/
public void setTokenStream(TokenStream tokenStream) {
this.isIndexed = true;
this.isTokenized = true;
this.tokenStream = tokenStream;
}

public Field(String name, String value, Store store, Index index) {
this(name, value, store, index, TermVector.NO);
}

public Field(String name, String value, Store store, Index index, TermVector termVector) {
this(name, true, value, store, index, termVector);
}

public Field(String name, boolean internName, String value, Store store, Index index, TermVector termVector) {
if (name == null)
throw new NullPointerException("name cannot be null");
if (value == null)
throw new NullPointerException("value cannot be null");
if (name.length() == 0 && value.length() == 0)
throw new IllegalArgumentException("name and value cannot both be empty");
if (index == Index.NO && store == Store.NO)
throw new IllegalArgumentException("it doesn't make sense to have a field that " + "is neither indexed nor stored");
if (index == Index.NO && termVector != TermVector.NO)
throw new IllegalArgumentException("cannot store term vector information " + "for a field that is not indexed");

if (internName) // field names are optionally interned
name = StringHelper.intern(name);

this.name = name;

this.fieldsData = value;

this.isStored = store.isStored();

this.isIndexed = index.isIndexed();
this.isTokenized = index.isAnalyzed();
this.omitNorms = index.omitNorms();
if (index == Index.NO) {
this.omitTermFreqAndPositions = false;
}

this.isBinary = false;

setStoreTermVector(termVector);
}

public Field(String name, Reader reader) {
this(name, reader, TermVector.NO);
}

public Field(String name, Reader reader, TermVector termVector) {
if (name == null)
throw new NullPointerException("name cannot be null");
if (reader == null)
throw new NullPointerException("reader cannot be null");

this.name = StringHelper.intern(name); // field names are interned
this.fieldsData = reader;

this.isStored = false;

this.isIndexed = true;
this.isTokenized = true;

this.isBinary = false;

setStoreTermVector(termVector);
}

public Field(String name, TokenStream tokenStream) {
this(name, tokenStream, TermVector.NO);
}

public Field(String name, TokenStream tokenStream, TermVector termVector) {
if (name == null)
throw new NullPointerException("name cannot be null");
if (tokenStream == null)
throw new NullPointerException("tokenStream cannot be null");

this.name = StringHelper.intern(name); // field names are interned
this.fieldsData = null;
this.tokenStream = tokenStream;

this.isStored = false;

this.isIndexed = true;
this.isTokenized = true;

this.isBinary = false;

setStoreTermVector(termVector);
}

public Field(String name, byte[] value, Store store) {
this(name, value, 0, value.length, store);
}

public Field(String name, byte[] value, int offset, int length, Store store) {

if (name == null)
throw new IllegalArgumentException("name cannot be null");
if (value == null)
throw new IllegalArgumentException("value cannot be null");

this.name = StringHelper.intern(name); // field names are interned
fieldsData = value;

if (store == Store.NO)
throw new IllegalArgumentException("binary values can't be unstored");

isStored = store.isStored();
isIndexed = false;
isTokenized = false;
omitTermFreqAndPositions = false;
omitNorms = true;

isBinary = true;
binaryLength = length;
binaryOffset = offset;

setStoreTermVector(TermVector.NO);
}
}


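Field 通过 Store、Index、TermVector 三个枚举类来描述 field 的属性:
Store 枚举类:是否需要存储,取值为 Store.YES、Store.NO。
Index 枚举类:
NO:不索引。
ANALYZED:索引并分词,保留 norms。
NOT_ANALYZED:索引但不分词,保留 norms。
NOT_ANALYZED_NO_NORMS:索引但不分词,省略 norms。
ANALYZED_NO_NORMS:索引并分词,省略 norms。
TermVector 枚举类:是否保存词向量,以及是否同时记录位置(positions)和偏移(offsets)。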

Field 主要的构造方法如下:

  public Field(String name, String value, Store store, Index index) 
public Field(String name, String value, Store store, Index index, TermVector termVector)
public Field(String name, boolean internName, String value, Store store, Index index, TermVector termVector)
public Field(String name, Reader reader)
public Field(String name, Reader reader, TermVector termVector)
public Field(String name, TokenStream tokenStream)
public Field(String name, TokenStream tokenStream, TermVector termVector)
public Field(String name, byte[] value, Store store)
public Field(String name, byte[] value, int offset, int length, Store store)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值