lucene学习（一）——DocFieldProcessorPerThread中的processDocument方法

最新推荐文章于 2025-11-20 11:01:01 发布

原创 · 最新推荐文章于 2025-11-20 11:01:01 发布 · 365 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#lucene #FP

lucene 专栏收录该内容

2 篇文章

订阅专栏

本文详细解析了Lucene中DocFieldProcessorPerThread类的processDocument方法实现细节，介绍了如何处理文档字段并将其写入索引的过程。文章深入探讨了字段处理、存储方式以及字段信息的更新机制。

DocFieldProcessorPerThread中的processDocument

/**
 * Processes the document currently held in {@code docState.doc}.
 *
 * <p>Every field of the document is matched (by name) to a
 * {@code DocFieldProcessorPerField} entry kept in the {@code fieldHash}
 * table; a new entry is created when the name has not been seen before,
 * otherwise the existing entry's {@code FieldInfo} is updated with the
 * field's current flags. Fields are then grouped per entry, stored fields
 * are passed to {@code fieldsWriter}, the per-document entries are sorted
 * and each entry's consumer is asked to process its fields.
 *
 * <p>If there is something for this document that must be done in docID
 * order, it is encapsulated as a {@code DocumentsWriter.DocWriter} and
 * returned; {@code DocumentsWriter} then calls {@code finish()} on that
 * object when it is its turn.
 *
 * @return the non-null result of {@code fieldsWriter.finishDocument()} or
 *         {@code consumer.finishDocument()} when only one of them is
 *         non-null, a {@code PerDoc} wrapping both when both are non-null,
 *         or {@code null} when both are {@code null}
 * @throws IOException declared by this method; presumably raised by the
 *         writers/consumers it calls (their bodies are not visible here)
 */
public DocumentsWriter.DocWriter processDocument() throws IOException {

	consumer.startDocument();
	fieldsWriter.startDocument();

	// The document being processed.
	final Document document = docState.doc;

	assert docFieldProcessor.docWriter.writer.testPoint("DocumentsWriter.ThreadState.init start");

	// No per-field entries collected for this document yet.
	fieldCount = 0;

	// Stamp used below to tell whether an entry was already touched
	// while processing this document.
	final int currentGen = fieldGen++;

	// All fields of the document.
	final List<Fieldable> allFields = document.getFields();
	final int total = allFields.size();

	// Absorb any new fields first seen in this document, and also any
	// changes to fields already seen before (e.g. suddenly turning on
	// norms or vectors).
	for (int n = 0; n < total; n++) {
		final Fieldable current = allFields.get(n);
		final String name = current.name();

		// Bucket index: hash of the field name masked by hashMask.
		final int bucket = name.hashCode() & hashMask;

		// Each fieldHash slot heads a chain linked through 'next'
		// (similar to HashMap buckets); walk it looking for this name.
		DocFieldProcessorPerField perField = fieldHash[bucket];
		while (perField != null) {
			if (perField.fieldInfo.name.equals(name)) {
				break;
			}
			perField = perField.next;
		}

		// Read the field's flags once, in the order they are passed on.
		final boolean indexed = current.isIndexed();
		final boolean termVector = current.isTermVectorStored();
		final boolean positions = current.isStorePositionWithTermVector();
		final boolean offsets = current.isStoreOffsetWithTermVector();
		final boolean omitNorms = current.getOmitNorms();
		final boolean omitTf = current.getOmitTermFreqAndPositions();

		if (perField != null) {
			// Known field: merge this occurrence's flags into its FieldInfo.
			perField.fieldInfo.update(indexed, termVector, positions, offsets,
					omitNorms, false, omitTf);
		} else {
			// TODO FI: we need to genericize the "flags" that a
			// field holds, and, how these flags are merged; it
			// needs to be more "pluggable" such that if I want
			// to have a new "thing" my Fields can do, I can
			// easily add it
			final FieldInfo info = fieldInfos.add(name, indexed, termVector,
					positions, offsets, omitNorms, false, omitTf);

			// Unknown field: create its entry and push it onto the
			// front of the bucket's chain.
			perField = new DocFieldProcessorPerField(this, info);
			perField.next = fieldHash[bucket];
			fieldHash[bucket] = perField;

			// Counts entries created so far (one per new field name).
			totalFieldCount++;

			// Once the entry count reaches half the table length, call
			// rehash() -- presumably it enlarges the table to keep the
			// chains short; its body is not visible here.
			if (totalFieldCount >= fieldHash.length / 2) {
				rehash();
			}
		}

		// Register the entry in 'fields' the first time it shows up
		// for this document.
		if (perField.lastGen != currentGen) {

			// First occurrence in this doc: restart the entry's own list.
			perField.fieldCount = 0;

			// Double 'fields' when it is full.
			if (fields.length == fieldCount) {
				final DocFieldProcessorPerField[] grown =
						new DocFieldProcessorPerField[fields.length * 2];
				System.arraycopy(fields, 0, grown, 0, fieldCount);
				fields = grown;
			}

			// 'fields' collects the entries used by the current document.
			fields[fieldCount++] = perField;
			perField.lastGen = currentGen;
		}

		// Double the entry's own Fieldable array when it is full.
		if (perField.fields.length == perField.fieldCount) {
			final Fieldable[] grown = new Fieldable[perField.fields.length * 2];
			System.arraycopy(perField.fields, 0, grown, 0, perField.fieldCount);
			perField.fields = grown;
		}

		// Append this occurrence to the entry; stored fields also go
		// to the fields writer.
		perField.fields[perField.fieldCount++] = current;
		if (current.isStored()) {
			fieldsWriter.addField(current, perField.fieldInfo);
		}
	}

	// If we are writing vectors then we must visit fields in sorted
	// order so they are written in sorted order. TODO: we actually only
	// need to sort the subset of fields that have vectors enabled; we
	// could save [small amount of] CPU here.
	quickSort(fields, 0, fieldCount - 1);

	// Hand every entry's collected fields to its consumer.
	for (int n = 0; n < fieldCount; n++) {
		final DocFieldProcessorPerField entry = fields[n];
		entry.consumer.processFields(entry.fields, entry.fieldCount);
	}

	// Report skipped immense terms when a prefix was recorded and an
	// info stream is available.
	if (docState.maxTermPrefix != null) {
		if (docState.infoStream != null) {
			docState.infoStream
					.println("WARNING: document contains at least one immense term (longer than the max length "
							+ DocumentsWriter.MAX_TERM_LENGTH
							+ "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '"
							+ docState.maxTermPrefix + "...'");
		}
	}

	// Finish both sides, fields writer first, then the consumer.
	final DocumentsWriter.DocWriter storedPart = fieldsWriter.finishDocument();
	final DocumentsWriter.DocWriter consumerPart = consumer.finishDocument();

	if (storedPart == null) {
		return consumerPart;
	}
	if (consumerPart == null) {
		return storedPart;
	}

	// Both produced a DocWriter: wrap them in a single PerDoc.
	final PerDoc combined = getPerDoc();
	combined.docID = docState.docID;
	assert storedPart.docID == docState.docID;
	assert consumerPart.docID == docState.docID;
	combined.one = storedPart;
	combined.two = consumerPart;
	return combined;
}