Solr.IndexWriter源码分析.4

原创于 2021-12-09 20:38:50 发布 · 182 阅读
CC 4.0 BY-SA版权
文章标签：
2021SC@SDUSC
 /**
   * Closes all open resources and releases the write lock.
   *
   * <p>If {@link IndexWriterConfig#commitOnClose} is <code>true</code>, this attempts a
   * graceful shutdown: pending changes are written, running merges are waited for, and
   * the writer commits before closing. Otherwise the writer is rolled back.
   * In the graceful case, note that:
   * <ul>
   *   <li>If you called prepareCommit but failed to call commit, this method throws
   *       {@code IllegalStateException} and the {@code IndexWriter} is not closed.</li>
   *   <li>If this method throws any other exception, the {@code IndexWriter} is closed,
   *       but changes may have been lost.</li>
   * </ul>
   *
   * <p>
   * Note that this may be a costly operation, so try to re-use a single writer instead
   * of closing and opening a new one. See {@link #commit()} for caveats about write
   * caching done by some IO devices.
   *
   * <p><b>NOTE</b>: You must ensure no other threads are still making changes at the
   * same time that this method is invoked.</p>
   */
  @Override
  public void close() throws IOException {
    // Without commit-on-close, pending changes are discarded via rollback.
    if (config.getCommitOnClose() == false) {
      rollback();
      return;
    }
    shutdown();
  }

 // Returns true if the calling thread should attempt to close, or
  // false if the IndexWriter is already closed. When another thread is
  // in the middle of closing, either returns false right away
  // (waitForClose == false) or waits for that thread to finish.
  private synchronized boolean shouldClose(boolean waitForClose) {
    while (closed == false) {
      if (closing == false) {
        // Nobody else is closing: this thread claims the close.
        closing = true;
        return true;
      }
      if (waitForClose == false) {
        return false;
      }
      // Another thread is presently trying to close; wait until it
      // finishes one way (closes successfully) or another (fails to
      // close), then re-check the state.
      doWait();
    }
    return false;
  }

  /**
   * Returns the Directory used by this index.
   *
   * @return the original, unwrapped directory that the user supplied
   */
  public Directory getDirectory() {
    return this.directoryOrig;
  }

  /** Returns the {@link InfoStream} held by this writer. */
  @Override
  public InfoStream getInfoStream() {
    return this.infoStream;
  }

  /**
   * Returns the analyzer used by this index.
   *
   * <p>{@code ensureOpen()} is called first, so this is only expected to succeed
   * while the writer is still open.
   */
  public Analyzer getAnalyzer() {
    ensureOpen();
    return this.config.getAnalyzer();
  }

/**
   * If {@link SegmentInfos#getVersion} is below {@code newVersion} then update it to
   * this value.
   *
   * @param newVersion the minimum version the segment infos should carry afterwards
   *
   * @lucene.internal
   */
  public synchronized void advanceSegmentInfosVersion(long newVersion) {
    ensureOpen();
    final boolean versionIsBehind = segmentInfos.getVersion() < newVersion;
    if (versionIsBehind) {
      segmentInfos.setVersion(newVersion);
    }
    // changed() is invoked whether or not the version was actually advanced.
    changed();
  }
/**
   * Returns true if this index has deletions (including buffered deletions).
   * Note that this will return true if there are buffered Term/Query deletions,
   * even if it turns out those buffered deletions don't match any documents.
   */
  public synchronized boolean hasDeletions() {
    ensureOpen();
    // Check the in-memory sources first, in the same order as a short-circuit OR.
    if (bufferedUpdatesStream.any()) {
      return true;
    }
    if (docWriter.anyDeletions()) {
      return true;
    }
    if (readerPool.anyDeletions()) {
      return true;
    }
    // Finally look at every segment currently listed in segmentInfos.
    for (final SegmentCommitInfo segment : segmentInfos) {
      if (segment.hasDeletions()) {
        return true;
      }
    }
    return false;
  }

 /**
   * Adds a document to this index.
   *
   * <p>Note that if an Exception is hit (for example disk full) then the index will
   * be consistent, but this document may not have been added. Furthermore, it's
   * possible the index will have one segment in non-compound format even when using
   * compound files (when a merge has partially succeeded).</p>
   *
   * <p>This method periodically flushes pending documents to the Directory
   * (see <a href="#flush">above</a>), and also periodically triggers segment merges
   * in the index according to the {@link MergePolicy} in use.</p>
   *
   * <p>Merges temporarily consume space in the directory. The amount of space
   * required is up to 1X the size of all segments being merged, when no
   * readers/searchers are open against the index, and up to 2X the size of all
   * segments being merged when readers/searchers are open against the index (see
   * {@link #forceMerge(int)} for details). The sequence of primitive merge
   * operations performed is governed by the merge policy.
   *
   * <p>Note that each term in the document can be no longer than
   * {@link #MAX_TERM_LENGTH} in bytes, otherwise an IllegalArgumentException
   * will be thrown.</p>
   *
   * <p>Note that it's possible to create an invalid Unicode string in java if a
   * UTF16 surrogate pair is malformed. In this case, the invalid characters are
   * silently replaced with the Unicode replacement character U+FFFD.</p>
   *
   * @param doc the fields of the document to add
   *
   * @return The <a href="#sequence_number">sequence number</a>
   * for this operation
   *
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
  public long addDocument(Iterable<? extends IndexableField> doc) throws IOException {
    // A plain add is an update with no delete term.
    return updateDocument((Term) null, doc);
  }

/**
   * Atomically adds a block of documents with sequentially assigned document IDs,
   * such that an external reader will see all or none of the documents.
   *
   * <p><b>WARNING</b>: the index does not currently record which documents were
   * added as a block. Today this is fine, because merging will preserve a block.
   * The order of documents within a segment will be preserved, even when child
   * documents within a block are deleted. Most search features (like result
   * grouping and block joining) require you to mark documents; when these
   * documents are deleted these search features will not work as expected.
   * Obviously adding documents to an existing block will require you to reindex
   * the entire block.
   *
   * <p>However it's possible that in the future Lucene may merge more aggressively
   * and re-order documents (for example, perhaps to obtain better index
   * compression), in which case you may need to fully re-index your documents at
   * that time.
   *
   * <p>See {@link #addDocument(Iterable)} for details on index and IndexWriter
   * state after an Exception, and flushing/merging temporary free space
   * requirements.</p>
   *
   * <p><b>NOTE</b>: tools that do offline splitting of an index (for example,
   * IndexSplitter in contrib) or re-sorting of documents (for example, IndexSorter
   * in contrib) are not aware of these atomically added documents and will likely
   * break them up. Use such tools at your own risk!
   *
   * @param docs the block of documents to add
   *
   * @return The <a href="#sequence_number">sequence number</a>
   * for this operation
   *
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   *
   * @lucene.experimental
   */
  public long addDocuments(Iterable<? extends Iterable<? extends IndexableField>> docs) throws IOException {
    // No delete node: nothing is removed, the block is only appended.
    final DocumentsWriterDeleteQueue.Node<?> noDeleteNode = null;
    return updateDocuments(noDeleteNode, docs);
  }

 /**
   * Atomically deletes documents matching the provided delTerm and adds a block of
   * documents with sequentially assigned document IDs, such that an external reader
   * will see all or none of the documents.
   *
   * See {@link #addDocuments(Iterable)}.
   *
   * @param delTerm term identifying the documents to delete; when {@code null} no
   *        delete node is created and the block is only added
   * @param docs the block of documents to add
   *
   * @return The <a href="#sequence_number">sequence number</a>
   * for this operation
   *
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   *
   * @lucene.experimental
   */
  public long updateDocuments(Term delTerm, Iterable<? extends Iterable<? extends IndexableField>> docs) throws IOException {
    final DocumentsWriterDeleteQueue.Node<?> deleteNode;
    if (delTerm == null) {
      deleteNode = null;
    } else {
      deleteNode = DocumentsWriterDeleteQueue.newNode(delTerm);
    }
    return updateDocuments(deleteNode, docs);
  }

  /**
   * Shared implementation behind the public add/update entry points: hands the
   * document block and the optional delete node to {@code docWriter} and passes the
   * result through {@code maybeProcessEvents}.
   *
   * <p>A {@link VirtualMachineError} is reported through {@code tragicEvent} and
   * rethrown. Whenever the call does not complete normally, a message is logged
   * (if the "IW" info stream component is enabled) and
   * {@code maybeCloseOnTragicEvent()} is invoked.
   *
   * @param delNode delete node to apply together with the add, or {@code null}
   * @param docs the block of documents to add
   * @return the sequence number for this operation
   * @throws IOException if there is a low-level IO error
   */
  private long updateDocuments(final DocumentsWriterDeleteQueue.Node<?> delNode, Iterable<? extends Iterable<? extends IndexableField>> docs) throws IOException {
    ensureOpen();
    boolean completed = false;
    try {
      final long sequenceNumber = maybeProcessEvents(docWriter.updateDocuments(docs, delNode));
      completed = true;
      return sequenceNumber;
    } catch (VirtualMachineError vmError) {
      tragicEvent(vmError, "updateDocuments");
      throw vmError;
    } finally {
      if (!completed) {
        if (infoStream.isEnabled("IW")) {
          infoStream.message("IW", "hit exception updating document");
        }
        maybeCloseOnTragicEvent();
      }
    }
  }

/**
   * Expert:
   * Atomically updates documents matching the provided term with the given
   * doc-values fields and adds a block of documents with sequentially assigned
   * document IDs, such that an external reader will see all or none of the
   * documents.
   *
   * One use of this API is to retain older versions of documents instead of
   * replacing them. The existing documents can be updated to reflect they are no
   * longer current while atomically adding new documents at the same time.
   *
   * In contrast to {@link #updateDocuments(Term, Iterable)} this method will not
   * delete documents in the index matching the given term but instead update them
   * with the given doc-values fields which can be used as a soft-delete mechanism.
   *
   * See {@link #addDocuments(Iterable)}
   * and {@link #updateDocuments(Term, Iterable)}.
   *
   * @param term the term identifying the documents to update; must not be null
   * @param docs the block of documents to add
   * @param softDeletes the doc-values fields to apply; at least one is required
   *
   * @return The <a href="#sequence_number">sequence number</a>
   * for this operation
   *
   * @throws IllegalArgumentException if {@code term} is null or no soft-delete
   *         field is given
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   *
   * @lucene.experimental
   */
  public long softUpdateDocuments(Term term, Iterable<? extends Iterable<? extends IndexableField>> docs, Field... softDeletes) throws IOException {
    if (term == null) {
      throw new IllegalArgumentException("term must not be null");
    }
    if (softDeletes == null || softDeletes.length == 0) {
      throw new IllegalArgumentException("at least one soft delete must be present");
    }
    // Turn the soft-delete fields into a doc-values update node for the term.
    final DocumentsWriterDeleteQueue.Node<?> softDeleteNode =
        DocumentsWriterDeleteQueue.newNode(buildDocValuesUpdate(term, softDeletes));
    return updateDocuments(softDeleteNode, docs);
  }

/** Expert: attempts to delete by document ID, as long as
   *  the provided reader is a near-real-time reader (from {@link
   *  DirectoryReader#open(IndexWriter)}).  If the
   *  provided reader is an NRT reader obtained from this
   *  writer, and its segment has not been merged away, then
   *  the delete succeeds and this method returns a valid (&gt; 0) sequence
   *  number; else, it returns -1 and the caller must then
   *  separately delete by Term or Query.
   *
   *  <b>NOTE</b>: this method can only delete documents
   *  visible to the currently open NRT reader.  If you need
   *  to delete documents indexed after opening the NRT
   *  reader you must use {@link #deleteDocuments(Term...)}. */
  public synchronized long tryDeleteDocument(IndexReader readerIn, int docID) throws IOException {
    // NOTE: DON'T use docID inside the closure; the callback receives the
    // segment-relative ID as leafDocId.
    return tryModifyDocument(readerIn, docID, (leafDocId, rld) -> {
      if (rld.delete(leafDocId) == false) {
        // Nothing was deleted, so there is no change to record.
        return;
      }
      if (isFullyDeleted(rld)) {
        dropDeletedSegment(rld.info);
        checkpoint();
      }
      // Must bump changeCount so if no other changes
      // happened, we still commit this change:
      changed();
    });
  }

 /** Expert: attempts to update doc values by document ID, as long as
   *  the provided reader is a near-real-time reader (from {@link
   *  DirectoryReader#open(IndexWriter)}).  If the
   *  provided reader is an NRT reader obtained from this
   *  writer, and its segment has not been merged away, then
   *  the update succeeds and this method returns a valid (&gt; 0) sequence
   *  number; else, it returns -1 and the caller must then
   *  retry the update and resolve the document again.
   *  If a doc-values field's data is <code>null</code> the existing
   *  value is removed from all documents matching the term. This can be used
   *  to un-delete a soft-deleted document since this method will apply the
   *  field update even if the document is marked as deleted.
   *
   *  <b>NOTE</b>: this method can only update documents
   *  visible to the currently open NRT reader.  If you need
   *  to update documents indexed after opening the NRT
   *  reader you must use {@link #updateDocValues(Term, Field...)}. */
  public synchronized long tryUpdateDocValue(IndexReader readerIn, int docID, Field... fields) throws IOException {
    // NOTE: DON'T use docID inside the closure
    // The updates are built once, up front, with a null term.
    final DocValuesUpdate[] dvUpdates = buildDocValuesUpdate(null, fields);
    return tryModifyDocument(readerIn, docID, (leafDocId, rld) -> {
      long nextGen = bufferedUpdatesStream.getNextGen();
      try {
        // One DocValuesFieldUpdates per distinct field name, created lazily.
        Map<String, DocValuesFieldUpdates> fieldUpdatesMap = new HashMap<>();
        for (DocValuesUpdate update : dvUpdates) {
          DocValuesFieldUpdates docValuesFieldUpdates = fieldUpdatesMap.computeIfAbsent(update.field, k -> {
            switch (update.type) {
              case NUMERIC:
                return new NumericDocValuesFieldUpdates(nextGen, k, rld.info.info.maxDoc());
              case BINARY:
                return new BinaryDocValuesFieldUpdates(nextGen, k, rld.info.info.maxDoc());
              default:
                throw new AssertionError("type: " + update.type + " is not supported");
            }
          });
          if (update.hasValue()) {
            // Record the new value for this one document.
            switch (update.type) {
              case NUMERIC:
                docValuesFieldUpdates.add(leafDocId, ((NumericDocValuesUpdate) update).getValue());
                break;
              case BINARY:
                docValuesFieldUpdates.add(leafDocId, ((BinaryDocValuesUpdate) update).getValue());
                break;
              default:
                throw new AssertionError("type: " + update.type + " is not supported");
            }
          } else {
            // No value supplied: reset the field for this document instead.
            docValuesFieldUpdates.reset(leafDocId);
          }
        }
        // Finish each per-field update set and hand it to the segment's ReadersAndUpdates.
        for (DocValuesFieldUpdates updates : fieldUpdatesMap.values()) {
          updates.finish();
          rld.addDVUpdate(updates);
        }
      } finally {
        // Always report the generation obtained above as finished, even on failure.
        bufferedUpdatesStream.finishedSegment(nextGen);
      }
      // Must bump changeCount so if no other changes
      // happened, we still commit this change:
      changed();
    });
  }

  /**
   * Callback that {@code tryModifyDocument} runs against a single document once it
   * has resolved the segment the document lives in.
   */
  @FunctionalInterface
  private interface DocModifier {
    /**
     * Applies the modification.
     *
     * @param docId the document ID as passed by {@code tryModifyDocument}, i.e.
     *        already re-based to the resolved leaf reader
     * @param readersAndUpdates the pooled instance for the document's segment
     * @throws IOException if the modification fails with an IO error
     */
    void run(int docId, ReadersAndUpdates readersAndUpdates) throws IOException;
  }

  /**
   * Resolves {@code docID} to its leaf reader and segment, then runs
   * {@code toApply} on it if that segment is still listed in {@code segmentInfos}
   * and a pooled {@code ReadersAndUpdates} instance is available for it.
   *
   * @param readerIn a leaf reader, or a composite reader whose leaves are searched
   *        for {@code docID}
   * @param docID the document ID relative to {@code readerIn}
   * @param toApply the modification to run with the leaf-relative document ID
   * @return the next sequence number from {@code docWriter} when the modification
   *         was applied, otherwise -1
   * @throws IllegalArgumentException if the resolved leaf is not a SegmentReader
   * @throws IOException if there is a low-level IO error
   */
  private synchronized long tryModifyDocument(IndexReader readerIn, int docID, DocModifier toApply) throws IOException {
    final LeafReader leaf;
    final int leafDocId;
    if (readerIn instanceof LeafReader) {
      // Reader is already atomic: use the incoming docID:
      leaf = (LeafReader) readerIn;
      leafDocId = docID;
    } else {
      // Composite reader: lookup sub-reader and re-base docID:
      final List<LeafReaderContext> leaves = readerIn.leaves();
      final LeafReaderContext leafContext = leaves.get(ReaderUtil.subIndex(docID, leaves));
      leaf = leafContext.reader();
      leafDocId = docID - leafContext.docBase;
      assert leafDocId >= 0;
      assert leafDocId < leaf.maxDoc();
    }

    if ((leaf instanceof SegmentReader) == false) {
      throw new IllegalArgumentException("the reader must be a SegmentReader or composite reader containing only SegmentReaders");
    }

    final SegmentCommitInfo info = ((SegmentReader) leaf).getOriginalSegmentInfo();

    // TODO: this is a slow linear search, but, number of
    // segments should be contained unless something is
    // seriously wrong w/ the index, so it should be a minor
    // cost:
    if (segmentInfos.indexOf(info) == -1) {
      return -1;
    }
    final ReadersAndUpdates rld = getPooledInstance(info, false);
    if (rld == null) {
      return -1;
    }
    synchronized (bufferedUpdatesStream) {
      toApply.run(leafDocId, rld);
      return docWriter.getNextSequenceNumber();
    }
  }

/** Drops a segment that has 100% deleted documents. */
  private synchronized void dropDeletedSegment(SegmentCommitInfo info) throws IOException {
    // If a merge has already registered for this
    // segment, we leave it in the readerPool; the
    // merge will skip merging it and will then drop
    // it once it's done:
    if (mergingSegments.contains(info)) {
      return;
    }
    // it's possible that we invoke this method more than once for the same SCI
    // we must only remove the docs once!
    boolean adjustPending = segmentInfos.remove(info);
    try {
      // This is sneaky: we might hit an exception while dropping the reader, but by
      // then the segment has already been removed from segmentInfos, and the
      // pendingDocs update would be lost. Therefore adjustPendingNumDocs runs in
      // the finally block to account for that.
      adjustPending |= readerPool.drop(info);
    } finally {
      if (adjustPending) {
        adjustPendingNumDocs(-info.info.maxDoc());
      }
    }
  }

 /**
   * Deletes the document(s) containing any of the terms. All given deletes are
   * applied and flushed atomically at the same time.
   *
   * @return The <a href="#sequence_number">sequence number</a>
   * for this operation
   *
   * @param terms array of terms to identify the documents
   * to be deleted
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
  public long deleteDocuments(Term... terms) throws IOException {
    ensureOpen();
    try {
      return maybeProcessEvents(docWriter.deleteTerms(terms));
    } catch (VirtualMachineError vmError) {
      // Report the VM-level failure as a tragic event, then rethrow it.
      tragicEvent(vmError, "deleteDocuments(Term..)");
      throw vmError;
    }
  }

 /**
   * Deletes the document(s) matching any of the provided queries.
   * All given deletes are applied and flushed atomically at the same time.
   *
   * @return The <a href="#sequence_number">sequence number</a>
   * for this operation
   *
   * @param queries array of queries to identify the documents
   * to be deleted
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
  public long deleteDocuments(Query... queries) throws IOException {
    ensureOpen();

    // LUCENE-6379: Specialize MatchAllDocsQuery
    // (an exact MatchAllDocsQuery anywhere in the array delegates to deleteAll()).
    for (int i = 0; i < queries.length; i++) {
      if (queries[i].getClass() == MatchAllDocsQuery.class) {
        return deleteAll();
      }
    }

    try {
      return maybeProcessEvents(docWriter.deleteQueries(queries));
    } catch (VirtualMachineError vmError) {
      // Report the VM-level failure as a tragic event, then rethrow it.
      tragicEvent(vmError, "deleteDocuments(Query..)");
      throw vmError;
    }
  }

 /**
   * Updates a document by first deleting the document(s) containing
   * <code>term</code> and then adding the new document. The delete and then add
   * are atomic as seen by a reader on the same index (flush may happen only after
   * the add).
   *
   * @return The <a href="#sequence_number">sequence number</a>
   * for this operation
   *
   * @param term the term to identify the document(s) to be
   * deleted; when {@code null} no delete node is created and the document is only added
   * @param doc the document to be added
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
  public long updateDocument(Term term, Iterable<? extends IndexableField> doc) throws IOException {
    final DocumentsWriterDeleteQueue.Node<?> deleteNode;
    if (term == null) {
      deleteNode = null;
    } else {
      deleteNode = DocumentsWriterDeleteQueue.newNode(term);
    }
    // A single document is handled as a one-element block.
    return updateDocuments(deleteNode, Collections.singletonList(doc));
  }

/**
   * Expert:
   * Updates a document by first updating the document(s) containing
   * <code>term</code> with the given doc-values fields and then adding the new
   * document. The doc-values update and then add are atomic as seen by a reader on
   * the same index (flush may happen only after the add).
   *
   * One use of this API is to retain older versions of documents instead of
   * replacing them. The existing documents can be updated to reflect they are no
   * longer current while atomically adding new documents at the same time.
   *
   * In contrast to {@link #updateDocument(Term, Iterable)} this method will not
   * delete documents in the index matching the given term but instead update them
   * with the given doc-values fields which can be used as a soft-delete mechanism.
   *
   * See {@link #addDocuments(Iterable)}
   * and {@link #updateDocuments(Term, Iterable)}.
   *
   * @param term the term identifying the documents to update; must not be null
   * @param doc the document to be added
   * @param softDeletes the doc-values fields to apply; at least one is required
   *
   * @return The <a href="#sequence_number">sequence number</a>
   * for this operation
   *
   * @throws IllegalArgumentException if {@code term} is null or no soft-delete
   *         field is given
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   *
   * @lucene.experimental
   */
  public long softUpdateDocument(Term term, Iterable<? extends IndexableField> doc, Field... softDeletes) throws IOException {
    if (term == null) {
      throw new IllegalArgumentException("term must not be null");
    }
    if (softDeletes == null || softDeletes.length == 0) {
      throw new IllegalArgumentException("at least one soft delete must be present");
    }
    // Turn the soft-delete fields into a doc-values update node for the term.
    final DocumentsWriterDeleteQueue.Node<?> softDeleteNode =
        DocumentsWriterDeleteQueue.newNode(buildDocValuesUpdate(term, softDeletes));
    return updateDocuments(softDeleteNode, Collections.singletonList(doc));
  }


 /**
   * Updates a document's {@link NumericDocValues} for <code>field</code> to the
   * given <code>value</code>. You can only update fields that already exist in the
   * index, not add new fields through this method.
   *
   * @param term
   *          the term to identify the document(s) to be updated
   * @param field
   *          field name of the {@link NumericDocValues} field
   * @param value
   *          new value for the field
   *
   * @return The <a href="#sequence_number">sequence number</a>
   * for this operation
   *
   * @throws IllegalArgumentException
   *           if the field is not an existing numeric doc-values field, or if it
   *           takes part in the index sort
   * @throws CorruptIndexException
   *           if the index is corrupt
   * @throws IOException
   *           if there is a low-level IO error
   */
  public long updateNumericDocValue(Term term, String field, long value) throws IOException {
    ensureOpen();
    if (globalFieldNumberMap.contains(field, DocValuesType.NUMERIC) == false) {
      throw new IllegalArgumentException("can only update existing numeric-docvalues fields!");
    }
    final boolean usedByIndexSort = config.getIndexSortFields().contains(field);
    if (usedByIndexSort) {
      throw new IllegalArgumentException("cannot update docvalues field involved in the index sort, field=" + field + ", sort=" + config.getIndexSort());
    }
    try {
      // Built inside the try so a VirtualMachineError raised here is reported too.
      final NumericDocValuesUpdate update = new NumericDocValuesUpdate(term, field, value);
      return maybeProcessEvents(docWriter.updateDocValues(update));
    } catch (VirtualMachineError vmError) {
      tragicEvent(vmError, "updateNumericDocValue");
      throw vmError;
    }
  }

/**
   * Updates a document's {@link BinaryDocValues} for <code>field</code> to the
   * given <code>value</code>. You can only update fields that already exist in the
   * index, not add new fields through this method.
   *
   * <p>
   * <b>NOTE:</b> this method currently replaces the existing value of all
   * affected documents with the new value.
   *
   * @param term
   *          the term to identify the document(s) to be updated
   * @param field
   *          field name of the {@link BinaryDocValues} field
   * @param value
   *          new value for the field; must not be null
   *
   * @return The <a href="#sequence_number">sequence number</a>
   * for this operation
   *
   * @throws IllegalArgumentException
   *           if {@code value} is null or the field is not an existing binary
   *           doc-values field
   * @throws CorruptIndexException
   *           if the index is corrupt
   * @throws IOException
   *           if there is a low-level IO error
   */
  public long updateBinaryDocValue(Term term, String field, BytesRef value) throws IOException {
    ensureOpen();
    if (value == null) {
      throw new IllegalArgumentException("cannot update a field to a null value: " + field);
    }
    if (globalFieldNumberMap.contains(field, DocValuesType.BINARY) == false) {
      throw new IllegalArgumentException("can only update existing binary-docvalues fields!");
    }
    try {
      // Built inside the try so a VirtualMachineError raised here is reported too.
      final BinaryDocValuesUpdate update = new BinaryDocValuesUpdate(term, field, value);
      return maybeProcessEvents(docWriter.updateDocValues(update));
    } catch (VirtualMachineError vmError) {
      tragicEvent(vmError, "updateBinaryDocValue");
      throw vmError;
    }
  }