1、流程图
2、源码详解
Leader端:
KafkaApis.handleProducerRequest -> replicaManager.appendMessages
-> appendToLocalLog
// 把数据写到leader partition里面
-> partition.appendMessagesToLeader
-> log.append
-> segment.append(把数据写入到segment里面去)
2. 更新自己的LEO
-> updateLogEndOffset(更新LEO = lastOffset+1)
Follower端:
KafkaServer.startup replicaManager = new ReplicaManager
//添加follower partition去leader partition那儿拉取数据的任务。
3、follower 同步数据,并带上自己的LEO
replicaFetcherManager.addFetcherForPartitions
Leader端:
KafkaApis.handleFetchRequest -> replicaManager.fetchMessages
//TODO leader partition这儿维护了这个partition的所有的replica的LEO值
updateFollowerLogReadResults
// 更新follower的LEO的值
partition.updateReplicaLogReadResult(replicaId, readResult){
4、更新所有的replica的LEO的值
replica.updateLogReadResult(logReadResult)
5、尝试修改ISR列表
maybeExpandIsr(replicaId)
}
def maybeExpandIsr(replicaId: Int) {
val leaderHWIncremented = inWriteLock(leaderIsrUpdateLock) {
// check if this replica needs to be added to the ISR
leaderReplicaIfLocal() match {
case Some(leaderReplica) =>
//根据replicaId获取对应的replica(不是所有的replica)
val replica = getReplica(replicaId).get
//获取到leader partition的HW的值
val leaderHW = leaderReplica.highWatermark
//判断一下是否要更新ISR列表
//!inSyncReplicas.contains(replica)如果这个replica 目前不在ISR列表之中
//TODO replica.logEndOffset.offsetDiff(leaderHW) >= 0
//这个replica的LEO的值大于或等于leader partition的HW值(条件是 >= 0,相等也算追上)
//说明这个replica已经跟leader partition数据保持同步了。
//所以把这个replica加入到ISR列表里面。
//ISR(p0 p1 p2)
if(!inSyncReplicas.contains(replica) &&
assignedReplicas.map(_.brokerId).contains(replicaId) &&
replica.logEndOffset.offsetDiff(leaderHW) >= 0) {
val newInSyncReplicas = inSyncReplicas + replica
info("Expanding ISR for partition [%s,%d] from %s to %s"
.format(topic, partitionId, inSyncReplicas.map(_.brokerId).mkString(","),
newInSyncReplicas.map(_.brokerId).mkString(",")))
// update ISR in ZK and cache
//就要更新ISR列表
updateIsr(newInSyncReplicas)
replicaManager.isrExpandRate.mark()
}
// check if the HW of the partition can now be incremented
// since the replica maybe now be in the ISR and its LEO has just incremented
// 6. 尝试更新HW的值
maybeIncrementLeaderHW(leaderReplica)
case None => false // nothing to do if no longer leader
}
}
handleFetchRequest.sendResponseCallback
7.返回数据的时候带上leader partition的HW
new FetchResponsePartitionData(data.error, data.hw, data.messages.asInstanceOf[FileMessageSet].toMessageFormat(Message.MagicValue_V0))
Follower 端:
AbstractFetcherThread.doWork.processFetchRequest
// 处理拉取到的数据
ReplicaFetcherThread.processPartitionData
/**
 * Follower side: handles the data fetched from the leader for one partition.
 *
 * Appends the fetched messages to the local replica log, then sets the local
 * high watermark to min(local log end offset, leader high watermark), and
 * finally records the fetched bytes against the quota if the partition is throttled.
 *
 * @param topicPartition the partition the fetched data belongs to
 * @param fetchOffset    the offset this fetch started from; must equal the local
 *                       replica's current log end offset
 * @param partitionData  the fetched payload, including the leader's high watermark
 * @throws RuntimeException if fetchOffset does not match the local log end offset
 */
def processPartitionData(topicPartition: TopicPartition, fetchOffset: Long, partitionData: PartitionData) {
  try {
    val topic = topicPartition.topic
    val partitionId = topicPartition.partition
    val replica = replicaMgr.getReplica(topic, partitionId).get
    val messageSet = partitionData.toByteBufferMessageSet
    maybeWarnIfMessageOversized(messageSet, topicPartition)

    // The fetch must start exactly where the local log ends; anything else
    // means the follower and the fetch request are out of step.
    if (fetchOffset != replica.logEndOffset.messageOffset)
      throw new RuntimeException("Offset mismatch for partition %s: fetched offset = %d, log end offset = %d.".format(topicPartition, fetchOffset, replica.logEndOffset.messageOffset))
    if (logger.isTraceEnabled)
      trace("Follower %d has replica log end offset %d for partition %s. Received %d messages and leader hw %d"
        .format(replica.brokerId, replica.logEndOffset.messageOffset, topicPartition, messageSet.sizeInBytes, partitionData.highWatermark))

    // TODO: this goes through the same log.append code path the leader partition uses.
    // Step 8: write the fetched data to disk; per the walkthrough notes this append
    // also advances the follower's LEO (log end offset).
    // NOTE(review): assignOffsets = false presumably keeps the offsets already
    // assigned by the leader -- confirm against Log.append.
    replica.log.get.append(messageSet, assignOffsets = false)
    if (logger.isTraceEnabled)
      trace("Follower %d has replica log end offset %d after appending %d bytes of messages for partition %s"
        .format(replica.brokerId, replica.logEndOffset.messageOffset, messageSet.sizeInBytes, topicPartition))

    // partitionData.highWatermark is the leader partition's HW.
    // Step 9: the follower's own HW = min(own LEO, leader HW).
    val followerHighWatermark = replica.logEndOffset.messageOffset.min(partitionData.highWatermark)
    // for the follower replica, we do not need to keep
    // its segment base offset the physical position,
    // these values will be computed upon making the leader
    // Update the follower's own HW.
    replica.highWatermark = new LogOffsetMetadata(followerHighWatermark)
    if (logger.isTraceEnabled)
      trace("Follower %d set replica high watermark for partition [%s,%d] to %s"
        .format(replica.brokerId, topic, partitionId, followerHighWatermark))

    // Only throttled partitions count towards the replication quota.
    if (quota.isThrottled(new TopicAndPartition(topic, partitionId)))
      quota.record(messageSet.sizeInBytes)
  } catch {
    case e: KafkaStorageException =>
      // A storage failure during replication is treated as fatal: log it and halt the JVM.
      fatal(s"Disk error while replicating data for $topicPartition", e)
      Runtime.getRuntime.halt(1)
  }
}