When the broker hosting the leader replica of a __consumer_offsets partition fails, the GroupCoordinator migrates to the new leader. This migration is driven by the handleLeaderAndIsrRequest method:
def handleLeaderAndIsrRequest(request: RequestChannel.Request) {
  // ensureTopicExists is only for client facing requests
  // We can't have the ensureTopicExists check here since the controller sends it as an advisory to all brokers so they
  // stop serving data to clients for the topic being deleted
  val correlationId = request.header.correlationId
  val leaderAndIsrRequest = request.body.asInstanceOf[LeaderAndIsrRequest]

  try {
    def onLeadershipChange(updatedLeaders: Iterable[Partition], updatedFollowers: Iterable[Partition]) {
      // for each new leader or follower, call coordinator to handle consumer group migration.
      // this callback is invoked under the replica state change lock to ensure proper order of
      // leadership changes
      // offsets-topic partitions this broker became leader of: call handleGroupImmigration
      updatedLeaders.foreach { partition =>
        if (partition.topic == TopicConstants.GROUP_METADATA_TOPIC_NAME)
          coordinator.handleGroupImmigration(partition.partitionId)
      }
      // offsets-topic partitions this broker became follower of: call handleGroupEmigration
      updatedFollowers.foreach { partition =>
        if (partition.topic == TopicConstants.GROUP_METADATA_TOPIC_NAME)
          coordinator.handleGroupEmigration(partition.partitionId)
      }
    }

    val responseHeader = new ResponseHeader(correlationId)
    val leaderAndIsrResponse =
      if (authorize(request.session, ClusterAction, Resource.ClusterResource)) {
        val result = replicaManager.becomeLeaderOrFollower(correlationId, leaderAndIsrRequest, metadataCache, onLeadershipChange)
        new LeaderAndIsrResponse(result.errorCode, result.responseMap.mapValues(new JShort(_)).asJava)
      } else {
        val result = leaderAndIsrRequest.partitionStates.asScala.keys.map((_, new JShort(Errors.CLUSTER_AUTHORIZATION_FAILED.code))).toMap
        new LeaderAndIsrResponse(Errors.CLUSTER_AUTHORIZATION_FAILED.code, result.asJava)
      }

    requestChannel.sendResponse(new Response(request, new ResponseSend(request.connectionId, responseHeader, leaderAndIsrResponse)))
  } catch {
    case e: KafkaStorageException =>
      fatal("Disk error during leadership change.", e)
      Runtime.getRuntime.halt(1)
  }
}
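The onLeadershipChange callback above simply hands the affected __consumer_offsets partitions to the GroupCoordinator. Roughly, the coordinator delegates straight to GroupMetadataManager, passing a per-group callback to run once a group has been loaded or unloaded. The following is a simplified sketch of that delegation, not the full implementation:

// GroupCoordinator (sketch): delegate offsets-partition immigration/emigration to GroupMetadataManager
def handleGroupImmigration(offsetTopicPartitionId: Int) {
  // this broker became leader of the partition: load its offsets and group metadata
  groupManager.loadGroupsForPartition(offsetTopicPartitionId, onGroupLoaded)
}

def handleGroupEmigration(offsetTopicPartitionId: Int) {
  // this broker became follower of the partition: drop its cached offsets and group metadata
  groupManager.removeGroupsForPartition(offsetTopicPartitionId, onGroupUnloaded)
}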
When a broker becomes the leader of a __consumer_offsets partition, the handleGroupImmigration callback is invoked to load that partition's data. It delegates to loadGroupsForPartition, whose loadGroupsAndOffsets function does the actual work:
def loadGroupsAndOffsets() {
  info("Loading offsets and group metadata from " + topicPartition)

  loadingPartitions synchronized {
    // check whether this partition is already being loaded
    if (loadingPartitions.contains(offsetsPartition)) {
      info("Offset load from %s already in progress.".format(topicPartition))
      return
    } else {
      loadingPartitions.add(offsetsPartition)
    }
  }

  val startMs = time.milliseconds()
  try {
    // get the Log instance for this partition
    replicaManager.logManager.getLog(topicPartition) match {
      case Some(log) =>
        // start loading from the base offset of the first LogSegment
        var currOffset = log.logSegments.head.baseOffset
        // allocate a read buffer
        val buffer = ByteBuffer.allocate(config.loadBufferSize)
        // loop breaks if leader changes at any time during the load, since getHighWatermark is -1
        // take the write lock to avoid racing with deleteExpiredOffsets()
        inWriteLock(offsetExpireLock) {
          val loadedGroups = mutable.Map[String, GroupMetadata]()
          val removedGroups = mutable.Set[String]()

          // read the log up to the high watermark
          while (currOffset < getHighWatermark(offsetsPartition) && !shuttingDown.get()) {
            buffer.clear()
            // read from the log; this returns a sliced FileMessageSet
            val messages = log.read(currOffset, config.loadBufferSize).messageSet.asInstanceOf[FileMessageSet]
            messages.readInto(buffer, 0)
            // copy the messages into memory as a ByteBufferMessageSet
            val messageSet = new ByteBufferMessageSet(buffer)
            messageSet.foreach { msgAndOffset => // iterate over the message set
              require(msgAndOffset.message.key != null, "Offset entry key should not be null")
              val baseKey = GroupMetadataManager.readMessageKey(msgAndOffset.message.key)

              if (baseKey.isInstanceOf[OffsetKey]) {
                // load offset: this message records a committed offset
                val key = baseKey.key.asInstanceOf[GroupTopicPartition]
                if (msgAndOffset.message.payload == null) {
                  // a tombstone: remove the corresponding OffsetAndMetadata from offsetsCache
                  if (offsetsCache.remove(key) != null)
                    trace("Removed offset for %s due to tombstone entry.".format(key))
                  else
                    trace("Ignoring redundant tombstone for %s.".format(key))
                } else {
                  // special handling for version 0:
                  // set the expiration time stamp as commit time stamp + server default retention time
                  // not a tombstone: parse the value
                  val value = GroupMetadataManager.readOffsetMessageValue(msgAndOffset.message.payload)
                  // put the corresponding OffsetAndMetadata into offsetsCache
                  putOffset(key, value.copy (
                    expireTimestamp = {
                      if (value.expireTimestamp == org.apache.kafka.common.requests.OffsetCommitRequest.DEFAULT_TIMESTAMP)
                        value.commitTimestamp + config.offsetsRetentionMs
                      else
                        value.expireTimestamp
                    }
                  ))
                  trace("Loaded offset %s for %s.".format(value, key))
                }
              } else {
                // load group metadata: this message records GroupMetadata
                val groupId = baseKey.key.asInstanceOf[String]
                val groupMetadata = GroupMetadataManager.readGroupMessageValue(groupId, msgAndOffset.message.payload)
                // track loaded vs. removed groups depending on whether this is a tombstone
                if (groupMetadata != null) {
                  trace(s"Loaded group metadata for group ${groupMetadata.groupId} with generation ${groupMetadata.generationId}")
                  removedGroups.remove(groupId)
                  loadedGroups.put(groupId, groupMetadata)
                } else {
                  loadedGroups.remove(groupId)
                  removedGroups.add(groupId)
                }
              }

              currOffset = msgAndOffset.nextOffset
            }
          }

          // add the loaded GroupMetadata entries to the group cache
          loadedGroups.values.foreach { group =>
            val currentGroup = addGroup(group)
            if (group != currentGroup)
              debug(s"Attempt to load group ${group.groupId} from log with generation ${group.generationId} failed " +
                s"because there is already a cached group with generation ${currentGroup.generationId}")
            else
              onGroupLoaded(group)
          }

          // make sure none of the removed groups is still present in the group cache
          removedGroups.foreach { groupId =>
            val group = groupsCache.get(groupId)
            if (group != null)
              throw new IllegalStateException(s"Unexpected unload of active group ${group.groupId} while " +
                s"loading partition ${topicPartition}")
          }
        }

        if (!shuttingDown.get())
          info("Finished loading offsets from %s in %d milliseconds."
            .format(topicPartition, time.milliseconds() - startMs))
      case None =>
        warn("No log found for " + topicPartition)
    }
  }
  catch {
    case t: Throwable =>
      error("Error in loading offsets from " + topicPartition, t)
  }
  finally {
    // move this __consumer_offsets partition from loadingPartitions to ownedPartitions
    loadingPartitions synchronized {
      ownedPartitions.add(offsetsPartition)
      loadingPartitions.remove(offsetsPartition)
    }
  }
}
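Note that loadGroupsAndOffsets is a nested function of loadGroupsForPartition and is executed asynchronously on the scheduler thread, so the LeaderAndIsrRequest handler does not block while the partition log is replayed. A minimal sketch of the wrapper, assuming the 0.10.x layout of GroupMetadataManager:

// GroupMetadataManager (sketch): schedule the load on the background scheduler thread
def loadGroupsForPartition(offsetsPartition: Int, onGroupLoaded: GroupMetadata => Unit) {
  val topicPartition = TopicAndPartition(TopicConstants.GROUP_METADATA_TOPIC_NAME, offsetsPartition)
  // run loadGroupsAndOffsets asynchronously; the caller returns immediately
  scheduler.schedule(topicPartition.toString, loadGroupsAndOffsets)

  def loadGroupsAndOffsets() {
    // ... the method shown above ...
  }
}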
When a broker becomes a follower replica of a __consumer_offsets partition, the handleGroupEmigration callback is invoked to clean up, which ultimately runs removeGroupsAndOffsets:
def removeGroupsAndOffsets() {
  var numOffsetsRemoved = 0
  var numGroupsRemoved = 0

  loadingPartitions synchronized {
    // we need to guard the group removal in cache in the loading partition lock
    // to prevent coordinator's check-and-get-group race condition
    // remove the __consumer_offsets partition from ownedPartitions,
    // marking that this GroupCoordinator no longer manages the consumer groups mapped to it
    ownedPartitions.remove(offsetsPartition)

    // clear the offsets for this partition in the cache
    /**
     * NOTE: we need to put this in the loading partition lock as well to prevent race condition of the leader-is-local check
     * in getOffsets to protects against fetching from an empty/cleared offset cache (i.e., cleared due to a leader->follower
     * transition right after the check and clear the cache), causing offset fetch return empty offsets with NONE error code
     */
    // walk offsetsCache and drop every OffsetAndMetadata that belongs to this partition
    offsetsCache.keys.foreach { key =>
      if (partitionFor(key.group) == offsetsPartition) {
        offsetsCache.remove(key)
        numOffsetsRemoved += 1
      }
    }

    // clear the groups for this partition in the cache
    // walk groupsCache and drop every GroupMetadata that belongs to this partition
    for (group <- groupsCache.values) {
      if (partitionFor(group.groupId) == offsetsPartition) {
        onGroupUnloaded(group)
        groupsCache.remove(group.groupId, group)
        numGroupsRemoved += 1
      }
    }
  }

  if (numOffsetsRemoved > 0) info("Removed %d cached offsets for %s on follower transition."
    .format(numOffsetsRemoved, TopicAndPartition(TopicConstants.GROUP_METADATA_TOPIC_NAME, offsetsPartition)))

  if (numGroupsRemoved > 0) info("Removed %d cached groups for %s on follower transition."
    .format(numGroupsRemoved, TopicAndPartition(TopicConstants.GROUP_METADATA_TOPIC_NAME, offsetsPartition)))
}