Step 1: Validate the data (Producer -> Kafka)
Step 2: Assign offsets
Step 3: Take the validated data
Step 4: Get an available segment to write to
Step 5: Append the data to that segment
Step 6: Update the LEO (log end offset)
Step 7: Depending on configuration, flush the in-memory data to disk
Partition.appendMessagesToLeader -> Log.append()
def append(messages: ByteBufferMessageSet, assignOffsets: Boolean = true): LogAppendInfo = {
//Step 1: validate the data (Producer -> Kafka)
val appendInfo = analyzeAndValidateMessageSet(messages)
// if we have any valid messages, append them to the log
if (appendInfo.shallowCount == 0)
return appendInfo
// trim any invalid bytes or partial messages before appending it to the on-disk log
var validMessages = trimInvalidBytes(messages, appendInfo)
try {
// they are valid, insert them in the log
lock synchronized {
if (assignOffsets) {
//Step 2: assign offsets
val offset = new LongRef(nextOffsetMetadata.messageOffset)
appendInfo.firstOffset = offset.value
val now = time.milliseconds
val validateAndOffsetAssignResult = try {
validMessages.validateMessagesAndAssignOffsets(offset,
now,
appendInfo.sourceCodec,
appendInfo.targetCodec,
config.compact,
config.messageFormatVersion.messageFormatVersion,
config.messageTimestampType,
config.messageTimestampDifferenceMaxMs)
} catch {
case e: IOException => throw new KafkaException("Error in validating messages while appending to log '%s'".format(name), e)
}
//Step 3: take the validated data
validMessages = validateAndOffsetAssignResult.validatedMessages
appendInfo.maxTimestamp = validateAndOffsetAssignResult.maxTimestamp
appendInfo.offsetOfMaxTimestamp = validateAndOffsetAssignResult.offsetOfMaxTimestamp
appendInfo.lastOffset = offset.value - 1
if (config.messageTimestampType == TimestampType.LOG_APPEND_TIME)
appendInfo.logAppendTime = now
// re-validate message sizes if there's a possibility that they have changed (due to re-compression or message
// format conversion)
if (validateAndOffsetAssignResult.messageSizeMaybeChanged) {
for (messageAndOffset <- validMessages.shallowIterator) {
if (MessageSet.entrySize(messageAndOffset.message) > config.maxMessageSize) {
// we record the original message set size instead of the trimmed size
// to be consistent with pre-compression bytesRejectedRate recording
BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).bytesRejectedRate.mark(messages.sizeInBytes)
BrokerTopicStats.getBrokerAllTopicsStats.bytesRejectedRate.mark(messages.sizeInBytes)
throw new RecordTooLargeException("Message size is %d bytes which exceeds the maximum configured message size of %d."
.format(MessageSet.entrySize(messageAndOffset.message), config.maxMessageSize))
}
}
}
} else {
// we are taking the offsets we are given
if (!appendInfo.offsetsMonotonic || appendInfo.firstOffset < nextOffsetMetadata.messageOffset)
throw new IllegalArgumentException("Out of order offsets found in " + messages)
}
// check whether the message set size exceeds config.segmentSize
if (validMessages.sizeInBytes > config.segmentSize) {
throw new RecordBatchTooLargeException("Message set size is %d bytes which exceeds the maximum configured segment size of %d."
.format(validMessages.sizeInBytes, config.segmentSize))
}
//Step 4: get an available segment to write to
val segment = maybeRoll(messagesSize = validMessages.sizeInBytes,
maxTimestampInMessages = appendInfo.maxTimestamp)
//Step 5: append the data to that segment
segment.append(firstOffset = appendInfo.firstOffset, largestTimestamp = appendInfo.maxTimestamp,
offsetOfLargestTimestamp = appendInfo.offsetOfMaxTimestamp, messages = validMessages)
//Step 6: update the LEO (log end offset)
//LEO = lastOffset + 1
updateLogEndOffset(appendInfo.lastOffset + 1)
trace("Appended message set to log %s with first offset: %d, next offset: %d, and messages: %s"
.format(this.name, appendInfo.firstOffset, nextOffsetMetadata.messageOffset, validMessages))
//Step 7: depending on configuration, flush the in-memory data to disk
//suppose flushing were configured to happen every ten minutes
//LogManager -> startUp -> relies on the OS flush mechanism
if (unflushedMessages >= config.flushInterval)
//With the default configuration this branch never runs; flushing data from
//memory (the page cache) to disk is left to the operating system, which has
//its own mechanisms for periodically writing dirty pages out to disk.
//We can still configure a flush frequency ourselves; if it were set to the
//equivalent of ten minutes, this code could run roughly every ten minutes.
flush()
appendInfo
}
} catch {
case e: IOException => throw new KafkaStorageException("I/O exception in append to log '%s'".format(name), e)
}
}
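The offset bookkeeping in step 2 and step 6 is plain arithmetic. A minimal sketch (my own illustration, not Kafka code) of what the append path does when a batch of n messages arrives while the LEO currently stands at N:

object OffsetAssignmentSketch {
  // Given the current log end offset (LEO) and the number of messages in the batch,
  // compute the offsets assigned in step 2 and the new LEO set in step 6.
  def assign(currentLeo: Long, messageCount: Int): (Long, Long, Long) = {
    val firstOffset = currentLeo                    // appendInfo.firstOffset
    val lastOffset  = currentLeo + messageCount - 1 // appendInfo.lastOffset = offset.value - 1
    val newLeo      = lastOffset + 1                // updateLogEndOffset(appendInfo.lastOffset + 1)
    (firstOffset, lastOffset, newLeo)
  }

  def main(args: Array[String]): Unit = {
    // LEO is 100 and a batch of 3 messages arrives:
    // the messages get offsets 100, 101 and 102, and the LEO becomes 103.
    println(assign(currentLeo = 100L, messageCount = 3)) // (100,102,103)
  }
}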
Step 4: Get an available segment to write to
val segment = maybeRoll(...) -> roll()
//TODO add the newly rolled segment to the segments data structure
//segments
val prev = addSegment(segment)
The segments data structure
/**
 * A data structure from java.util.concurrent:
 * a thread-safe Map implemented on top of a skip list.
 * The key is the file name, i.e. the segment's base offset,
 * and the value is the segment itself.
 * The goal is to quickly locate the segment that holds a given offset.
 *
 * skip index   0                 2008              5678
 * key          0        1005     2008     4000     5678
 * segment      segment1 segment2 segment3 segment4 segment5
 */
new ConcurrentSkipListMap[java.lang.Long, LogSegment]
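A minimal sketch of why this structure is convenient (my own example, not Kafka's read path; findSegmentFor is a made-up helper): floorEntry on the skip-list map returns the segment whose base offset is the greatest one not larger than the offset we are looking for.

import java.util.concurrent.ConcurrentSkipListMap

object SegmentLookupSketch {
  // key = base offset (also the file name), value = a stand-in for LogSegment
  val segments = new ConcurrentSkipListMap[java.lang.Long, String]()
  segments.put(0L, "00000000000000000000.log")
  segments.put(2008L, "00000000000000002008.log")
  segments.put(5678L, "00000000000000005678.log")

  // floorEntry gives the entry with the greatest key <= offset,
  // i.e. the segment that must contain this offset if it exists at all.
  def findSegmentFor(offset: Long): Option[String] =
    Option(segments.floorEntry(offset)).map(_.getValue)

  def main(args: Array[String]): Unit = {
    println(findSegmentFor(3000L)) // Some(00000000000000002008.log)
  }
}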
Step 5: Append the data to the segment
segment.append
def append(firstOffset: Long, largestTimestamp: Long, offsetOfLargestTimestamp: Long, messages: ByteBufferMessageSet) {
if (messages.sizeInBytes > 0) {
trace("Inserting %d bytes at offset %d at position %d with largest timestamp %d at offset %d"
.format(messages.sizeInBytes, firstOffset, log.sizeInBytes(), largestTimestamp, offsetOfLargestTimestamp))
val physicalPosition = log.sizeInBytes()
if (physicalPosition == 0)
rollingBasedTimestamp = Some(largestTimestamp)
//TODO write the data to "disk" (really to memory, the OS page cache)
log.append(messages)
// Update the in memory max timestamp and corresponding offset.
if (largestTimestamp > maxTimestampSoFar) {
maxTimestampSoFar = largestTimestamp
offsetOfMaxTimestamp = offsetOfLargestTimestamp
}
/**
 * TODO write the index entry
 * Note the condition here: an index entry is not written for every message.
 * An entry is only written once enough data has accumulated, which is why
 * this kind of index is called a sparse index.
 * By default an entry is written after roughly every 4096 bytes of messages;
 * in other words, every 4096 bytes of appended data adds one index entry.
 * (On the first pass the counter is 0, and 0 > 4096 is false.)
 */
if(bytesSinceLastIndexEntry > indexIntervalBytes) {
//TODO this is the important part: append an entry to the offset index
index.append(firstOffset, physicalPosition)
//the time index is rarely needed here, so we will not focus on it
timeIndex.maybeAppend(maxTimestampSoFar, offsetOfMaxTimestamp)
//after writing an index entry, reset the counter and start accumulating again
bytesSinceLastIndexEntry = 0
}
//bytesSinceLastIndexEntry += size of the current message set -> keeps accumulating
bytesSinceLastIndexEntry += messages.sizeInBytes
}
}
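To make the sparse-index condition concrete, here is a small standalone sketch (my own, assuming indexIntervalBytes = 4096) showing which appends actually produce an index entry:

object SparseIndexIntervalSketch {
  val indexIntervalBytes = 4096

  def main(args: Array[String]): Unit = {
    var bytesSinceLastIndexEntry = 0
    // sizes (in bytes) of successive message sets appended to the segment
    val appends = Seq(1000, 2000, 1500, 3000, 500)
    for ((size, i) <- appends.zipWithIndex) {
      // same check as in LogSegment.append: only write an entry once more than
      // indexIntervalBytes have accumulated since the previous entry
      if (bytesSinceLastIndexEntry > indexIntervalBytes) {
        println(s"append #$i: write index entry, then reset the counter")
        bytesSinceLastIndexEntry = 0
      }
      bytesSinceLastIndexEntry += size
    }
    // Only append #3 (after 1000 + 2000 + 1500 = 4500 accumulated bytes) writes an entry.
  }
}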
-> index.append(): writing the index
def append(offset: Long, position: Int) {
inLock(lock) {
require(!isFull, "Attempt to append to a full index (size = " + _entries + ").")
if (_entries == 0 || offset > _lastOffset) {
debug("Adding index entry %d => %d to %s.".format(offset, position, file.getName))
//NIO mmap; offset is the logical position: 0, 1, 2, ...
//a long is 8 bytes
//an int is 4 bytes, which is why the relative offset (offset - baseOffset) is stored as an Int
mmap.putInt((offset - baseOffset).toInt)
//the physical position: where this message sits in the segment file on disk
mmap.putInt(position)
//Conclusion: each index entry records two positions: the logical position, i.e. the offset
//(stored relative to the base offset), and the physical position of the message in the file on disk.
_entries += 1
_lastOffset = offset
require(_entries * entrySize == mmap.position, entries + " entries but file position in index is " + mmap.position + ".")
} else {
throw new InvalidOffsetException("Attempt to append an offset (%d) to position %d no larger than the last offset appended (%d) to %s."
.format(offset, entries, _lastOffset, file.getAbsolutePath))
}
}
}
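Reading back also shows why a sparse index is enough. A minimal sketch (hypothetical helper, not Kafka's actual lookup code): take the last index entry whose offset is not larger than the target, seek to its physical position, and scan forward from there.

object SparseIndexLookupSketch {
  // one (relative offset, physical position) pair per ~4096 bytes of messages
  case class IndexEntry(relativeOffset: Int, position: Int)

  // Find the last index entry whose offset is <= the target relative offset.
  // Kafka binary-searches the mmap'ed index file; a collection scan shows the idea.
  def floorEntry(index: IndexedSeq[IndexEntry], target: Int): Option[IndexEntry] =
    index.takeWhile(_.relativeOffset <= target).lastOption

  def main(args: Array[String]): Unit = {
    val index = Vector(IndexEntry(0, 0), IndexEntry(57, 4210), IndexEntry(131, 8460))
    // To read relative offset 100: start scanning the segment file at byte 4210.
    println(floorEntry(index, 100)) // Some(IndexEntry(57,4210))
  }
}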
-> log.append(messages): write the data to "disk" (really memory)
FileChannel: for performance reasons the operating system keeps written data in memory (the page cache),
so data written to a FileChannel is not guaranteed to have reached the disk; to guarantee that, force() must be called.
def append(messages: ByteBufferMessageSet) {
//TODO the key write path: write the message set through the FileChannel
val written = messages.writeFullyTo(channel)
_size.getAndAdd(written)
}
-> def writeFullyTo(channel: GatheringByteChannel): Int = {
//mark the buffer's current position
buffer.mark()
var written = 0
//keep writing until the whole buffer has been written
while (written < sizeInBytes)
//write the data through the FileChannel
//this is plain Java NIO again; if NIO is unfamiliar, it is worth reviewing
written += channel.write(buffer)
//restore the position that was marked earlier
buffer.reset()
written
}
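A self-contained sketch in the same spirit (plain Java NIO from Scala, not Kafka code; the file name is made up): write a buffer fully with the same mark / write-loop / reset pattern, and note that the bytes may sit in the page cache until force(true) is called.

import java.nio.ByteBuffer
import java.nio.channels.FileChannel
import java.nio.file.{Paths, StandardOpenOption}

object FileChannelWriteSketch {
  def main(args: Array[String]): Unit = {
    val channel = FileChannel.open(Paths.get("demo.log"),
      StandardOpenOption.CREATE, StandardOpenOption.WRITE)
    val buffer = ByteBuffer.wrap("hello kafka".getBytes("UTF-8"))

    buffer.mark()                     // remember the starting position
    var written = 0
    while (written < buffer.limit())  // a single write() may not write everything
      written += channel.write(buffer)
    buffer.reset()                    // restore the position to the mark

    // Up to this point the bytes may only be in the OS page cache;
    // force(true) asks the OS to flush them (and file metadata) to the physical disk.
    channel.force(true)
    channel.close()
  }
}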
Message format
messages: ByteBufferMessageSet
class ByteBufferMessageSet(val buffer: ByteBuffer) extends MessageSet
abstract class MessageSet extends Iterable[MessageAndOffset]
case class MessageAndOffset(message: Message, offset: Long)
/**
* A message. The format of an N byte message is the following:
*
* 1. 4 byte CRC32 of the message, used to validate the data
* 2. 1 byte "magic" identifier to allow format changes, value is 0 or 1
* magic 0: the offset is an absolute offset
* magic 1: the offset is a relative offset
* 3. 1 byte "attributes" identifier to allow annotations on the message independent of the version
* one byte has 8 bits
* bit 0 ~ 2 : Compression codec.
* 0 : no compression
* 1 : gzip
* 2 : snappy
* 3 : lz4
* bit 3 : Timestamp type
* 0 : create time
* 1 : log append time
* bit 4 ~ 7 : reserved
* 4. (Optional) 8 byte timestamp only if "magic" identifier is greater than 0
* 5. 4 byte key length, containing length K
* 6. K byte key
* 7. 4 byte payload length, containing length V
* 8. V byte payload
*/
class Message
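A quick sanity check of the layout above (my own arithmetic, assuming the magic = 1 format with its 8-byte timestamp): the fixed header is 4 + 1 + 1 + 8 + 4 + 4 = 22 bytes, and the key and payload bytes come on top of that.

object MessageSizeSketch {
  // Header field sizes for the magic = 1 format described above
  val CrcLength = 4
  val MagicLength = 1
  val AttributesLength = 1
  val TimestampLength = 8      // only present when magic > 0
  val KeySizeLength = 4
  val ValueSizeLength = 4

  // Total size of a single magic = 1 message with a K-byte key and V-byte payload
  def messageSize(keyBytes: Int, valueBytes: Int): Int =
    CrcLength + MagicLength + AttributesLength + TimestampLength +
      KeySizeLength + keyBytes + ValueSizeLength + valueBytes

  def main(args: Array[String]): Unit = {
    // a 10-byte key and 100-byte payload -> 22 bytes of header + 110 bytes of data = 132 bytes
    println(messageSize(10, 100))
  }
}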
Step 7: Depending on configuration, flush the in-memory data to disk
//Step 7: depending on configuration, flush the in-memory data to disk
//suppose flushing were configured to happen every ten minutes
//LogManager -> startUp -> relies on the OS flush mechanism
if (unflushedMessages >= config.flushInterval)
//config.flushInterval defaults to Long.MaxValue,
//so this branch effectively never runs: flushing data from memory (the page cache)
//to disk is delegated to the operating system, which has its own mechanisms
//for periodically writing dirty pages out to disk.
//We can still configure a flush frequency ourselves; if it were set to the
//equivalent of ten minutes, this code could run roughly every ten minutes.
flush()
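A small sketch of the flush decision (my assumption, based on this walkthrough, is that unflushedMessages is roughly logEndOffset - recoveryPoint, i.e. the number of messages appended since the last flush):

object FlushDecisionSketch {
  // Simplified version of the check above: flush once enough messages have
  // accumulated since the last flush (the recovery point).
  def shouldFlush(logEndOffset: Long, recoveryPoint: Long, flushInterval: Long): Boolean =
    (logEndOffset - recoveryPoint) >= flushInterval

  def main(args: Array[String]): Unit = {
    // With the default flush.interval.messages of Long.MaxValue the broker never
    // flushes here and leaves it to the OS page cache:
    println(shouldFlush(logEndOffset = 1000000L, recoveryPoint = 0L, flushInterval = Long.MaxValue)) // false
    // With flush.interval.messages = 10000 a flush would trigger:
    println(shouldFlush(logEndOffset = 20000L, recoveryPoint = 5000L, flushInterval = 10000L)) // true
  }
}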
flush() -> flush(this.logEndOffset) -> segment.flush()
def flush() {
LogFlushStats.logFlushTimer.time {
//this is the part we care about most: flushing the log data
log.flush()
//the index flushes are not our focus here
index.flush()
timeIndex.flush()
}
}
-> log.flush()
def flush() = {
//the Java NIO call that forces the data from memory (the page cache) onto the disk
channel.force(true)
}