ReplicaStateMachine 定义了 Kafka Controller 的副本状态机,用于管理集群中副本的状态信息,每个 Kafka Controller 都定义了自己的副本状态机,但是只有在当前 Controller 实例成为 leader 角色时才会启动运行名下的状态机。
副本状态及状态管理流程:副本状态机一旦被启动,就意味着它要行使它最重要的职责了:管理副本状态的转换。
它的启动逻辑如下:
def startup(): Unit = {
  info("Initializing replica state")
  // Seed the state machine: record an initial state for every replica in each partition's AR set.
  initializeReplicaState()
  info("Triggering online replica state changes")
  // Split all known replicas into those hosted on live brokers and the rest.
  val (availableReplicas, unavailableReplicas) = controllerContext.onlineAndOfflineReplicas
  // Transition every replica on a live broker to OnlineReplica.
  handleStateChanges(availableReplicas.toSeq, OnlineReplica)
  info("Triggering offline replica state changes")
  // Transition every replica on a dead broker to OfflineReplica.
  handleStateChanges(unavailableReplicas.toSeq, OfflineReplica)
  debug(s"Started replica state machine with initial state -> ${controllerContext.replicaStates}")
}
/**
 * Initializes the in-memory state of every replica in the cluster.
 *
 * Replicas hosted on live brokers start as OnlineReplica. Replicas on dead
 * brokers start as ReplicaDeletionIneligible: during controller failover a
 * broker may be down, and marking its replicas deletion-ineligible keeps
 * topic deletion on the safe side until the broker comes back.
 */
private def initializeReplicaState(): Unit = {
  controllerContext.allPartitions.foreach { partition =>
    // Walk the partition's full assigned-replica (AR) set.
    controllerContext.partitionReplicaAssignment(partition).foreach { replicaId =>
      val partitionAndReplica = PartitionAndReplica(partition, replicaId)
      // Pick the initial state from broker liveness, then record it once.
      val initialState =
        if (controllerContext.isReplicaOnline(replicaId, partition)) OnlineReplica
        else ReplicaDeletionIneligible
      controllerContext.putReplicaState(partitionAndReplica, initialState)
    }
  }
}
副本状态主要有以下 7 个:
- NewReplica:副本被创建之后所处的状态。
- OnlineReplica:副本正常提供服务时所处的状态。
- OfflineReplica:副本服务下线时所处的状态。
- ReplicaDeletionStarted:副本被删除时所处的状态。
- ReplicaDeletionSuccessful:副本被成功删除后所处的状态。
- ReplicaDeletionIneligible:开启副本删除,但副本暂时无法被删除时所处的状态。
- NonExistentReplica:副本从副本状态机被移除前所处的状态。

handleStateChanges 方法是对副本进行状态转移的入口接口。
/**
 * Public entry point for moving a batch of replicas to `targetState`.
 *
 * Opens a fresh controller request batch (verifying the previous batch was
 * fully sent), performs the per-broker state transitions, then ships the
 * accumulated LeaderAndIsr/StopReplica/UpdateMetadata requests to brokers.
 */
override def handleStateChanges(replicas: Seq[PartitionAndReplica], targetState: ReplicaState): Unit = {
  if (replicas.nonEmpty) {
    try {
      // Start a new request batch; throws if unsent requests remain from a prior batch.
      controllerBrokerRequestBatch.newBatch()
      // Group replicas by hosting broker and run the real transition logic per group.
      for ((replicaId, replicasOnBroker) <- replicas.groupBy(_.replica))
        doHandleStateChanges(replicaId, replicasOnBroker, targetState)
      // Flush all queued controller requests to the brokers.
      controllerBrokerRequestBatch.sendRequestsToBrokers(controllerContext.epoch)
    } catch {
      // Controller lost leadership mid-flight: log and rethrow so the caller can resign.
      case e: ControllerMovedException =>
        error(s"Controller moved to another broker when moving some replicas to $targetState state", e)
        throw e
      // Any other failure is logged and swallowed; the batch is abandoned.
      case e: Throwable => error(s"Error while moving some replicas to $targetState state", e)
    }
  }
}
doHandleStateChanges 方法执行真正的副本状态转换逻辑。
/**
 * Performs the actual state transition of `replicas` (all hosted on broker
 * `replicaId`) to `targetState`, queueing any broker requests in
 * `controllerBrokerRequestBatch` as a side effect.
 */
private def doHandleStateChanges(replicaId: Int, replicas: Seq[PartitionAndReplica], targetState: ReplicaState): Unit = {
  // Any replica not yet tracked by the state machine starts as NonExistentReplica.
  replicas.foreach(replica => controllerContext.putReplicaStateIfNotExists(replica, NonExistentReplica))
  // Split into replicas whose current state legally allows a move to targetState, and the rest.
  val (validReplicas, invalidReplicas) = controllerContext.checkValidReplicaStateChange(replicas, targetState)
  invalidReplicas.foreach(replica => logInvalidTransition(replica, targetState))
  targetState match {
    // Transition to NewReplica
    case NewReplica =>
      // Walk every replica eligible for this transition.
      validReplicas.foreach { replica =>
        // The <topic, partition> this replica belongs to.
        val partition = replica.topicPartition
        // Current state of this replica in the controller's cache.
        val currentState = controllerContext.replicaState(replica)
        // Look up the partition's leadership info (leader, ISR, controller epoch)
        // from the metadata cache.
        controllerContext.partitionLeadershipInfo.get(partition) match {
          // Leadership info exists for this partition.
          case Some(leaderIsrAndControllerEpoch) =>
            // A leader replica must not be moved to NewReplica.
            if (leaderIsrAndControllerEpoch.leaderAndIsr.leader == replicaId) {
              val exception = new StateChangeFailedException(s"Replica $replicaId for partition $partition cannot be moved to NewReplica state as it is being requested to become leader")
              // Log the failed transition. NOTE(review): OfflineReplica is passed as
              // the logged target here (mirrors upstream Kafka) even though the
              // requested target is NewReplica — confirm against the Kafka version in use.
              logFailedStateChange(replica, currentState, OfflineReplica, exception)
            } else {
              // Otherwise, send a LeaderAndIsrRequest to the hosting broker so it
              // syncs this partition's data; the batch will also send
              // UpdateMetadataRequest to all brokers announcing the change.
              controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(Seq(replicaId),
                replica.topicPartition,
                leaderIsrAndControllerEpoch,
                controllerContext.partitionFullReplicaAssignment(replica.topicPartition),
                isNew = true)
              logSuccessfulTransition(replicaId, partition, currentState, NewReplica)
              // Record the replica as NewReplica in the metadata cache.
              controllerContext.putReplicaState(replica, NewReplica)
            }
          // No leadership info yet (e.g. partition has no leader assigned):
          // only the cached state needs updating.
          case None =>
            logSuccessfulTransition(replicaId, partition, currentState, NewReplica)
            // Just record the replica as NewReplica in the metadata cache.
            controllerContext.putReplicaState(replica, NewReplica)
        }
      }
    case OnlineReplica =>
      // Walk every replica eligible for this transition.
      validReplicas.foreach { replica =>
        // The <topic, partition> this replica belongs to.
        val partition = replica.topicPartition
        // Current state of this replica in the controller's cache.
        val currentState = controllerContext.replicaState(replica)
        currentState match {
          // Coming from NewReplica: make sure the replica appears in the AR set.
          case NewReplica =>
            // Fetch the partition's assigned-replica list from the cache.
            val assignment = controllerContext.partitionReplicaAssignment(partition)
            // If the AR set is missing this replica (unexpected),
            if (!assignment.contains(replicaId)) {
              // append it and update the cached assignment.
              controllerContext.updatePartitionReplicaAssignment(partition, assignment :+ replicaId)
            }
          case _ =>
            // Coming from any other state: look up the partition's leadership info.
            controllerContext.partitionLeadershipInfo.get(partition) match {
              // Leadership info exists: ask the hosting broker to (re)sync the
              // partition via a LeaderAndIsrRequest.
              case Some(leaderIsrAndControllerEpoch) =>
                controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(Seq(replicaId),
                  replica.topicPartition,
                  leaderIsrAndControllerEpoch,
                  controllerContext.partitionFullReplicaAssignment(partition), isNew = false)
              case None =>
            }
        }
        logSuccessfulTransition(replicaId, partition, currentState, OnlineReplica)
        // Record the replica as OnlineReplica.
        controllerContext.putReplicaState(replica, OnlineReplica)
      }
    case OfflineReplica =>
      validReplicas.foreach { replica =>
        // Send StopReplicaRequest (without deletion) to the hosting broker to halt the replica.
        controllerBrokerRequestBatch.addStopReplicaRequestForBrokers(Seq(replicaId), replica.topicPartition, deletePartition = false)
      }
      // Split replicas into those whose partition has leadership info and those without.
      val (replicasWithLeadershipInfo, replicasWithoutLeadershipInfo) = validReplicas.partition { replica =>
        controllerContext.partitionLeadershipInfo.contains(replica.topicPartition)
      }
      // For partitions with leadership info: remove this replica from each
      // partition's ISR and persist the change to ZooKeeper.
      val updatedLeaderIsrAndControllerEpochs = removeReplicasFromIsr(replicaId, replicasWithLeadershipInfo.map(_.topicPartition))
      // Walk every partition whose LeaderAndIsr was updated.
      updatedLeaderIsrAndControllerEpochs.foreach { case (partition, leaderIsrAndControllerEpoch) =>
        // Only notify other replicas if the topic is not queued for deletion.
        if (!controllerContext.isTopicQueuedUpForDeletion(partition.topic)) {
          // All brokers hosting this partition except the one going offline.
          val recipients = controllerContext.partitionReplicaAssignment(partition).filterNot(_ == replicaId)
          // Push the updated LeaderAndIsr to those brokers.
          controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(recipients,
            partition,
            leaderIsrAndControllerEpoch,
            controllerContext.partitionFullReplicaAssignment(partition), isNew = false)
        }
        val replica = PartitionAndReplica(partition, replicaId)
        val currentState = controllerContext.replicaState(replica)
        logSuccessfulTransition(replicaId, partition, currentState, OfflineReplica)
        // Record the replica as OfflineReplica.
        controllerContext.putReplicaState(replica, OfflineReplica)
      }
      // Walk the replicas whose partition has no leadership info.
      replicasWithoutLeadershipInfo.foreach { replica =>
        val currentState = controllerContext.replicaState(replica)
        logSuccessfulTransition(replicaId, replica.topicPartition, currentState, OfflineReplica)
        // Broadcast UpdateMetadataRequest to all live (or shutting-down) brokers
        // so they refresh this partition's metadata.
        controllerBrokerRequestBatch.addUpdateMetadataRequestForBrokers(controllerContext.liveOrShuttingDownBrokerIds.toSeq, Set(replica.topicPartition))
        // Record the replica as OfflineReplica.
        controllerContext.putReplicaState(replica, OfflineReplica)
      }
    case ReplicaDeletionStarted =>
      validReplicas.foreach { replica =>
        val currentState = controllerContext.replicaState(replica)
        logSuccessfulTransition(replicaId, replica.topicPartition, currentState, ReplicaDeletionStarted)
        // Record the replica as ReplicaDeletionStarted.
        controllerContext.putReplicaState(replica, ReplicaDeletionStarted)
        // Send StopReplicaRequest with deletePartition = true so the broker
        // stops the replica and deletes its data.
        controllerBrokerRequestBatch.addStopReplicaRequestForBrokers(Seq(replicaId), replica.topicPartition, deletePartition = true)
      }
    case ReplicaDeletionIneligible =>
      validReplicas.foreach { replica =>
        val currentState = controllerContext.replicaState(replica)
        logSuccessfulTransition(replicaId, replica.topicPartition, currentState, ReplicaDeletionIneligible)
        // Record the replica as ReplicaDeletionIneligible.
        controllerContext.putReplicaState(replica, ReplicaDeletionIneligible)
      }
    case ReplicaDeletionSuccessful =>
      validReplicas.foreach { replica =>
        val currentState = controllerContext.replicaState(replica)
        logSuccessfulTransition(replicaId, replica.topicPartition, currentState, ReplicaDeletionSuccessful)
        // Record the replica as ReplicaDeletionSuccessful.
        controllerContext.putReplicaState(replica, ReplicaDeletionSuccessful)
      }
    case NonExistentReplica =>
      validReplicas.foreach { replica =>
        val currentState = controllerContext.replicaState(replica)
        // Fetch the partition's assigned-replica list from the cache.
        val currentAssignedReplicas = controllerContext.partitionReplicaAssignment(replica.topicPartition)
        // Update the cached AR set, excluding the replica being removed.
        controllerContext.updatePartitionReplicaAssignment(replica.topicPartition, currentAssignedReplicas.filterNot(_ == replica.replica))
        logSuccessfulTransition(replicaId, replica.topicPartition, currentState, NonExistentReplica)
        // Drop the replica from the controller's metadata entirely.
        controllerContext.removeReplicaState(replica)
      }
  }
}
本文详细解读了KafkaController的ReplicaStateMachine,介绍了其启动流程、状态管理关键步骤,包括NewReplica到Online/Offline/Deletion等状态的转换逻辑。重点展示了handleStateChanges方法如何驱动状态转移和Controller请求发送。
1810

被折叠的 条评论
为什么被折叠?



