When the broker hosting the leader replica of a __consumer_offsets partition fails, the GroupCoordinator migrates to the new leader. This migration is driven by the handleLeaderAndIsrRequest method:
def handleLeaderAndIsrRequest(request: RequestChannel.Request) {
  // ensureTopicExists is only for client facing requests
  // We can't have the ensureTopicExists check here since the controller sends it as an advisory to all brokers so they
  // stop serving data to clients for the topic being deleted
  val correlationId = request.header.correlationId
  val leaderAndIsrRequest = request.body.asInstanceOf[LeaderAndIsrRequest]

  try {
    def onLeadershipChange(updatedLeaders: Iterable[Partition], updatedFollowers: Iterable[Partition]) {
      // for each new leader or follower, call coordinator to handle consumer group migration.
      // this callback is invoked under the replica state change lock to ensure proper order of
      // leadership changes
      // offsets-topic partitions this broker became leader of: call handleGroupImmigration
      updatedLeaders.foreach { partition =>
        if (partition.topic == TopicConstants.GROUP_METADATA_TOPIC_NAME)
          coordinator.handleGroupImmigration(partition.partitionId)
      }
      // offsets-topic partitions this broker became follower of: call handleGroupEmigration
      updatedFollowers.foreach { partition =>
        if (partition.topic == TopicConstants.GROUP_METADATA_TOPIC_NAME)
          coordinator.handleGroupEmigration(partition.partitionId)
      }
    }

    val responseHeader = new ResponseHeader(correlationId)
    val leaderAndIsrResponse =
      if (authorize(request.session, ClusterAction, Resource.ClusterResource)) {
        val result = replicaManager.becomeLeaderOrFollower(correlationId, leaderAndIsrRequest, metadataCache, onLeadershipChange)
        new LeaderAndIsrResponse(result.errorCode, result.responseMap.mapValues(new JShort(_)).asJava)
      } else {
        val result = leaderAndIsrRequest.partitionStates.asScala.keys.map((_, new JShort(Errors.CLUSTER_AUTHORIZATION_FAILED.code))).toMap
        new LeaderAndIsrResponse(Errors.CLUSTER_AUTHORIZATION_FAILED.code, result.asJava)
      }

    requestChannel.sendResponse(new Response(request, new ResponseSend(request.connectionId, responseHeader, leaderAndIsrResponse)))
  } catch {
    case e: KafkaStorageException =>
      fatal("Disk error during leadership change.", e)
      Runtime.getRuntime.halt(1)
  }
}
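The onLeadershipChange callback above simply hands the affected __consumer_offsets partitions to the GroupCoordinator. Roughly, the coordinator delegates straight to GroupMetadataManager, passing a per-group callback to run once a group has been loaded or unloaded. The following is a simplified sketch of that delegation, not the full implementation:

// GroupCoordinator (sketch): delegate offsets-partition immigration/emigration to GroupMetadataManager
def handleGroupImmigration(offsetTopicPartitionId: Int) {
  // this broker became leader of the partition: load its offsets and group metadata
  groupManager.loadGroupsForPartition(offsetTopicPartitionId, onGroupLoaded)
}

def handleGroupEmigration(offsetTopicPartitionId: Int) {
  // this broker became follower of the partition: drop its cached offsets and group metadata
  groupManager.removeGroupsForPartition(offsetTopicPartitionId, onGroupUnloaded)
}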
When a broker becomes the leader of a __consumer_offsets partition, the handleGroupImmigration callback is invoked to load that partition's data. It delegates to loadGroupsForPartition, whose loadGroupsAndOffsets function does the actual work:
def loadGroupsAndOffsets() {
  info("Loading offsets and group metadata from " + topicPartition)

  loadingPartitions synchronized {
    // check whether this partition is already being loaded
    if (loadingPartitions.contains(offsetsPartition)) {
      info("Offset load from %s already in progress.".format(topicPartition))
      return
    } else {
      loadingPartitions.add(offsetsPartition)
    }
  }

  val startMs = time.milliseconds()
  try {
    // get the Log instance for this partition
    replicaManager.logManager.getLog(topicPartition) match {
      case Some(log) =>
        // start loading from the base offset of the first LogSegment
        var currOffset = log.logSegments.head.baseOffset
        // allocate a read buffer
        val buffer = ByteBuffer.allocate(config.loadBufferSize)
        // loop breaks if leader changes at any time during the load, since getHighWatermark is -1
        // take the write lock to avoid racing with deleteExpiredOffsets()
        inWriteLock(offsetExpireLock) {
          val loadedGroups = mutable.Map[String, GroupMetadata]()
          val removedGroups = mutable.Set[String]()

          // read the log up to the high watermark
          while (currOffset < getHighWatermark(offsetsPartition) && !shuttingDown.get()) {
            buffer.clear()
            // read from the log; this returns a sliced FileMessageSet
            val messages = log.read(currOffset, config.loadBufferSize).messageSet.asInstanceOf[FileMessageSet]
            messages.readInto(buffer, 0)
            // copy the messages into memory as a ByteBufferMessageSet
            val messageSet = new ByteBufferMessageSet(buffer)
            messageSet.foreach { msgAndOffset => // iterate over the message set
              require(msgAndOffset.message.key != null, "Offset entry key should not be null")
              val baseKey = GroupMetadataManager.readMessageKey(msgAndOffset.message.key)

              if (baseKey.isInstanceOf[OffsetKey]) {
                // load offset: this message records a committed offset
                val key = baseKey.key.asInstanceOf[GroupTopicPartition]
                if (msgAndOffset.message.payload == null) {
                  // a tombstone: remove the corresponding OffsetAndMetadata from offsetsCache
                  if (offsetsCache.remove(key) != null)
                    trace("Removed offset for %s due to tombstone entry.".format(key))
                  else
                    trace("Ignoring redundant tombstone for %s.".format(key))
                } else {
                  // special handling for version 0:
                  // set the expiration time stamp as commit time stamp + server default retention time
                  // not a tombstone: parse the value
                  val value = GroupMetadataManager.readOffsetMessageValue(msgAndOffset.message.payload)
                  // put the corresponding OffsetAndMetadata into offsetsCache
                  putOffset(key, value.copy (
                    expireTimestamp = {
                      if (value.expireTimestamp == org.apache.kafka.common.requests.OffsetCommitRequest.DEFAULT_TIMESTAMP)
                        value.commitTimestamp + config.offsetsRetentionMs
                      else
                        value.expireTimestamp
                    }
                  ))
                  trace("Loaded offset %s for %s.".format(value, key))
                }
              } else {
                // load group metadata: this message records GroupMetadata
                val groupId = baseKey.key.asInstanceOf[String]
                val groupMetadata = GroupMetadataManager.readGroupMessageValue(groupId, msgAndOffset.message.payload)
                // track loaded vs. removed groups depending on whether this is a tombstone
                if (groupMetadata != null) {
                  trace(s"Loaded group metadata for group ${groupMetadata.groupId} with generation ${groupMetadata.generationId}")
                  removedGroups.remove(groupId)
                  loadedGroups.put(groupId, groupMetadata)
                } else {
                  loadedGroups.remove(groupId)
                  removedGroups.add(groupId)
                }
              }

              currOffset = msgAndOffset.nextOffset
            }
          }

          // add the loaded GroupMetadata entries to the group cache
          loadedGroups.values.foreach { group =>
            val currentGroup = addGroup(group)
            if (group != currentGroup)
              debug(s"Attempt to load group ${group.groupId} from log with generation ${group.generationId} failed " +
                s"because there is already a cached group with generation ${currentGroup.generationId}")
            else
              onGroupLoaded(group)
          }

          // make sure none of the removed groups is still present in the group cache
          removedGroups.foreach { groupId =>
            val group = groupsCache.get(groupId)
            if (group != null)
              throw new IllegalStateException(s"Unexpected unload of active group ${group.groupId} while " +
                s"loading partition ${topicPartition}")
          }
        }

        if (!shuttingDown.get())
          info("Finished loading offsets from %s in %d milliseconds."
            .format(topicPartition, time.milliseconds() - startMs))
      case None =>
        warn("No log found for " + topicPartition)
    }
  }
  catch {
    case t: Throwable =>
      error("Error in loading offsets from " + topicPartition, t)
  }
  finally {
    // move this __consumer_offsets partition from loadingPartitions to ownedPartitions
    loadingPartitions synchronized {
      ownedPartitions.add(offsetsPartition)
      loadingPartitions.remove(offsetsPartition)
    }
  }
}
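Note that loadGroupsAndOffsets is a nested function of loadGroupsForPartition and is executed asynchronously on the scheduler thread, so the LeaderAndIsrRequest handler does not block while the partition log is replayed. A minimal sketch of the wrapper, assuming the 0.10.x layout of GroupMetadataManager:

// GroupMetadataManager (sketch): schedule the load on the background scheduler thread
def loadGroupsForPartition(offsetsPartition: Int, onGroupLoaded: GroupMetadata => Unit) {
  val topicPartition = TopicAndPartition(TopicConstants.GROUP_METADATA_TOPIC_NAME, offsetsPartition)
  // run loadGroupsAndOffsets asynchronously; the caller returns immediately
  scheduler.schedule(topicPartition.toString, loadGroupsAndOffsets)

  def loadGroupsAndOffsets() {
    // ... the method shown above ...
  }
}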
When a broker becomes a follower replica of a __consumer_offsets partition, the handleGroupEmigration callback is invoked to clean up, which ultimately runs removeGroupsAndOffsets:
def removeGroupsAndOffsets() {
  var numOffsetsRemoved = 0
  var numGroupsRemoved = 0

  loadingPartitions synchronized {
    // we need to guard the group removal in cache in the loading partition lock
    // to prevent coordinator's check-and-get-group race condition
    // remove the __consumer_offsets partition from ownedPartitions,
    // marking that this GroupCoordinator no longer manages the consumer groups mapped to it
    ownedPartitions.remove(offsetsPartition)

    // clear the offsets for this partition in the cache
    /**
     * NOTE: we need to put this in the loading partition lock as well to prevent race condition of the leader-is-local check
     * in getOffsets to protects against fetching from an empty/cleared offset cache (i.e., cleared due to a leader->follower
     * transition right after the check and clear the cache), causing offset fetch return empty offsets with NONE error code
     */
    // walk offsetsCache and drop every OffsetAndMetadata that belongs to this partition
    offsetsCache.keys.foreach { key =>
      if (partitionFor(key.group) == offsetsPartition) {
        offsetsCache.remove(key)
        numOffsetsRemoved += 1
      }
    }

    // clear the groups for this partition in the cache
    // walk groupsCache and drop every GroupMetadata that belongs to this partition
    for (group <- groupsCache.values) {
      if (partitionFor(group.groupId) == offsetsPartition) {
        onGroupUnloaded(group)
        groupsCache.remove(group.groupId, group)
        numGroupsRemoved += 1
      }
    }
  }

  if (numOffsetsRemoved > 0) info("Removed %d cached offsets for %s on follower transition."
    .format(numOffsetsRemoved, TopicAndPartition(TopicConstants.GROUP_METADATA_TOPIC_NAME, offsetsPartition)))

  if (numGroupsRemoved > 0) info("Removed %d cached groups for %s on follower transition."
    .format(numGroupsRemoved, TopicAndPartition(TopicConstants.GROUP_METADATA_TOPIC_NAME, offsetsPartition)))
}