Step 1: Validate the data (Producer -> Kafka)
Step 2: Assign offsets
Step 3: Take the validated data
Step 4: Get an available segment to write to
Step 5: Append the data to that segment
Step 6: Update the LEO (log end offset)
Step 7: Depending on configuration, flush the in-memory data to disk
Partition.appendMessagesToLeader -> Log.append()
def append(messages: ByteBufferMessageSet, assignOffsets: Boolean = true): LogAppendInfo = {
//Step 1: validate the data (Producer -> Kafka)
val appendInfo = analyzeAndValidateMessageSet(messages)
// if we have any valid messages, append them to the log
if (appendInfo.shallowCount == 0)
return appendInfo
// trim any invalid bytes or partial messages before appending it to the on-disk log
var validMessages = trimInvalidBytes(messages, appendInfo)
try {
// they are valid, insert them in the log
lock synchronized {
if (assignOffsets) {
//Step 2: assign offsets
val offset = new LongRef(nextOffsetMetadata.messageOffset)
appendInfo.firstOffset = offset.value
val now = time.milliseconds
val validateAndOffsetAssignResult = try {
validMessages.validateMessagesAndAssignOffsets(offset,
now,
appendInfo.sourceCodec,
appendInfo.targetCodec,
config.compact,
config.messageFormatVersion.messageFormatVersion,
config.messageTimestampType,
config.messageTimestampDifferenceMaxMs)
} catch {
case e: IOException => throw new KafkaException("Error in validating messages while appending to log '%s'".format(name), e)
}
//Step 3: take the validated data
validMessages = validateAndOffsetAssignResult.validatedMessages
appendInfo.maxTimestamp = validateAndOffsetAssignResult.maxTimestamp
appendInfo.offsetOfMaxTimestamp = validateAndOffsetAssignResult.offsetOfMaxTimestamp
appendInfo.lastOffset = offset.value - 1
if (config.messageTimestampType == TimestampType.LOG_APPEND_TIME)
appendInfo.logAppendTime = now
// re-validate message sizes if there's a possibility that they have changed (due to re-compression or message
// format conversion)
if (validateAndOffsetAssignResult.messageSizeMaybeChanged) {
for (messageAndOffset <- validMessages.shallowIterator) {
if (MessageSet.entrySize(messageAndOffset.message) > config.maxMessageSize) {
// we record the original message set size instead of the trimmed size
// to be consistent with pre-compression bytesRejectedRate recording
BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).bytesRejectedRate.mark(messages.sizeInBytes)
BrokerTopicStats.getBrokerAllTopicsStats.bytesRejectedRate.mark(messages.sizeInBytes)
throw new RecordTooLargeException("Message size is %d bytes which exceeds the maximum configured message size of %d."
.format(MessageSet.entrySize(messageAndOffset.message), config.maxMessageSize))
}
}
}
} else {
// we are taking the offsets we are given
if (!appendInfo.offsetsMonotonic || appendInfo.firstOffset < nextOffsetMetadata.messageOffset)
throw new IllegalArgumentException("Out of order offsets found in " + messages)
}
// check whether the message set size exceeds config.segmentSize
if (validMessages.sizeInBytes > config.segmentSize) {
throw new RecordBatchTooLargeException("Message set size is %d bytes which exceeds the maximum configured segment size of %d."
.format(validMessages.sizeInBytes, config.segmentSize))
}
//Step 4: get an available segment to write to
val segment = maybeRoll(messagesSize = validMessages.sizeInBytes,
maxTimestampInMessages = appendInfo.maxTimestamp)
//Step 5: append the data to that segment
segment.append(firstOffset = appendInfo.firstOffset, largestTimestamp = appendInfo.maxTimestamp,
offsetOfLargestTimestamp = appendInfo.offsetOfMaxTimestamp, messages = validMessages)
//Step 6: update the LEO (log end offset)
//LEO = lastOffset + 1
updateLogEndOffset(appendInfo.lastOffset + 1)
trace("Appended message set to log %s with first offset: %d, next offset: %d, and messages: %s"
.format(this.name, appendInfo.firstOffset, nextOffsetMetadata.messageOffset, validMessages))
//Step 7: depending on configuration, flush the in-memory data to disk
//suppose flushing were configured to happen every ten minutes
//LogManager -> startUp -> relies on the OS flush mechanism
if (unflushedMessages >= config.flushInterval)
//With the default configuration this branch never runs; flushing data from
//memory (the page cache) to disk is left to the operating system, which has
//its own mechanisms for periodically writing dirty pages out to disk.
//We can still configure a flush frequency ourselves; if it were set to the
//equivalent of ten minutes, this code could run roughly every ten minutes.
flush()
appendInfo
}
} catch {
case e: IOException => throw new KafkaStorageException("I/O exception in append to log '%s'".format(name), e)
}
}
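The offset bookkeeping in step 2 and step 6 is plain arithmetic. A minimal sketch (my own illustration, not Kafka code) of what the append path does when a batch of n messages arrives while the LEO currently stands at N:

object OffsetAssignmentSketch {
  // Given the current log end offset (LEO) and the number of messages in the batch,
  // compute the offsets assigned in step 2 and the new LEO set in step 6.
  def assign(currentLeo: Long, messageCount: Int): (Long, Long, Long) = {
    val firstOffset = currentLeo                    // appendInfo.firstOffset
    val lastOffset  = currentLeo + messageCount - 1 // appendInfo.lastOffset = offset.value - 1
    val newLeo      = lastOffset + 1                // updateLogEndOffset(appendInfo.lastOffset + 1)
    (firstOffset, lastOffset, newLeo)
  }

  def main(args: Array[String]): Unit = {
    // LEO is 100 and a batch of 3 messages arrives:
    // the messages get offsets 100, 101 and 102, and the LEO becomes 103.
    println(assign(currentLeo = 100L, messageCount = 3)) // (100,102,103)
  }
}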
Step 4: Get an available segment to write to
val segment = maybeRoll(...) -> roll()
//TODO add the newly rolled segment to the segments data structure
//segments
val prev = addSegment(segment)
The segments data structure
/**
 * A data structure from java.util.concurrent:
 * a thread-safe Map implemented on top of a skip list.
 * The key is the file name, i.e. the segment's base offset,
 * and the value is the segment itself.
 * The goal is to quickly locate the segment that holds a given offset.
 *
 * skip index   0                 2008              5678
 * key          0        1005     2008     4000     5678
 * segment      segment1 segment2 segment3 segment4 segment5
 */
new ConcurrentSkipListMap[java.lang.Long, LogSegment]
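A minimal sketch of why this structure is convenient (my own example, not Kafka's read path; findSegmentFor is a made-up helper): floorEntry on the skip-list map returns the segment whose base offset is the greatest one not larger than the offset we are looking for.

import java.util.concurrent.ConcurrentSkipListMap

object SegmentLookupSketch {
  // key = base offset (also the file name), value = a stand-in for LogSegment
  val segments = new ConcurrentSkipListMap[java.lang.Long, String]()
  segments.put(0L, "00000000000000000000.log")
  segments.put(2008L, "00000000000000002008.log")
  segments.put(5678L, "00000000000000005678.log")

  // floorEntry gives the entry with the greatest key <= offset,
  // i.e. the segment that must contain this offset if it exists at all.
  def findSegmentFor(offset: Long): Option[String] =
    Option(segments.floorEntry(offset)).map(_.getValue)

  def main(args: Array[String]): Unit = {
    println(findSegmentFor(3000L)) // Some(00000000000000002008.log)
  }
}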
Step 5: Append the data to the segment
segment.append
def append(firstOffset: Long, largestTimestamp: Long, offsetOfLargestTimestamp: Long, messages: ByteBufferMessageSet) {
if (messages.sizeInBytes > 0) {
trace("Inserting %d bytes at offset %d at position %d with largest timestamp %d at offset %d"
.format(messages.sizeInBytes, firstOffset, log.sizeInBytes(), largestTimestamp, offsetOfLargestTimestamp))
val physicalPosition = log.sizeInBytes()
if (physicalPosition == 0)
rollingBasedTimestamp = Some(largestTimestamp)
//TODO write the data to "disk" (really to memory, the OS page cache)
log.append(messages)
// Update the in memory max timestamp and corresponding offset.
if (largestTimestamp > maxTimestampSoFar) {
maxTimestampSoFar = largestTimestamp
offsetOfMaxTimestamp = offsetOfLargestTimestamp
}
/**
 * TODO write the index entry
 * Note the condition here: an index entry is not written for every message.
 * An entry is only written once enough data has accumulated, which is why
 * this kind of index is called a sparse index.
 * By default an entry is written after roughly every 4096 bytes of messages;
 * in other words, every 4096 bytes of appended data adds one index entry.
 * (On the first pass the counter is 0, and 0 > 4096 is false.)
 */
if(bytesSinceLastIndexEntry > indexIntervalBytes) {
//TODO this is the important part: append an entry to the offset index
index.append(firstOffset, physicalPosition)
//the time index is rarely needed here, so we will not focus on it
timeIndex.maybeAppend(maxTimestampSoFar, offsetOfMaxTimestamp)
//after writing an index entry, reset the counter and start accumulating again
bytesSinceLastIndexEntry = 0
}
//bytesSinceLastIndexEntry += size of the current message set -> keeps accumulating
bytesSinceLastIndexEntry += messages.sizeInBytes
}
}
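To make the sparse-index condition concrete, here is a small standalone sketch (my own, assuming indexIntervalBytes = 4096) showing which appends actually produce an index entry:

object SparseIndexIntervalSketch {
  val indexIntervalBytes = 4096

  def main(args: Array[String]): Unit = {
    var bytesSinceLastIndexEntry = 0
    // sizes (in bytes) of successive message sets appended to the segment
    val appends = Seq(1000, 2000, 1500, 3000, 500)
    for ((size, i) <- appends.zipWithIndex) {
      // same check as in LogSegment.append: only write an entry once more than
      // indexIntervalBytes have accumulated since the previous entry
      if (bytesSinceLastIndexEntry > indexIntervalBytes) {
        println(s"append #$i: write index entry, then reset the counter")
        bytesSinceLastIndexEntry = 0
      }
      bytesSinceLastIndexEntry += size
    }
    // Only append #3 (after 1000 + 2000 + 1500 = 4500 accumulated bytes) writes an entry.
  }
}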
-> index.append(): writing the index
def append(offset: Long, position: Int) {
inLock(lock) {
require(!isFull, "Attempt to append to a full index (size = " + _entries + ").")
if (_entries == 0 || offset > _lastOffset) {
debug("Adding index entry %d => %d to %s.".format(offset, position, file.getName))
//NIO mmap; offset is the logical position: 0, 1, 2, ...
//a long is 8 bytes
//an int is 4 bytes, which is why the relative offset (offset - baseOffset) is stored as an Int
mmap.putInt((offset - baseOffset).toInt)
//the physical position: where this message sits in the segment file on disk
mmap.putInt(position)
//Conclusion: each index entry records two positions: the logical position, i.e. the offset
//(stored relative to the base offset), and the physical position of the message in the file on disk.
_entries += 1
_lastOffset = offset
require(_entries * entrySize == mmap.position, entries + " entries but file position in index is " + mmap.position + ".")
} else {
throw new InvalidOffsetException("Attempt to append an offset (%d) to position %d no larger than the last offset appended (%d) to %s."
.format(offset, entries, _lastOffset, file.getAbsolutePath))
}
}
}
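Reading back also shows why a sparse index is enough. A minimal sketch (hypothetical helper, not Kafka's actual lookup code): take the last index entry whose offset is not larger than the target, seek to its physical position, and scan forward from there.

object SparseIndexLookupSketch {
  // one (relative offset, physical position) pair per ~4096 bytes of messages
  case class IndexEntry(relativeOffset: Int, position: Int)

  // Find the last index entry whose offset is <= the target relative offset.
  // Kafka binary-searches the mmap'ed index file; a collection scan shows the idea.
  def floorEntry(index: IndexedSeq[IndexEntry], target: Int): Option[IndexEntry] =
    index.takeWhile(_.relativeOffset <= target).lastOption

  def main(args: Array[String]): Unit = {
    val index = Vector(IndexEntry(0, 0), IndexEntry(57, 4210), IndexEntry(131, 8460))
    // To read relative offset 100: start scanning the segment file at byte 4210.
    println(floorEntry(index, 100)) // Some(IndexEntry(57,4210))
  }
}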
-> log.append(messages): write the data to "disk" (really memory)
FileChannel: for performance reasons the operating system keeps written data in memory (the page cache),
so data written to a FileChannel is not guaranteed to have reached the disk; to guarantee that, force() must be called.
def append(messages: ByteBufferMessageSet) {
//TODO the key write path: write the message set through the FileChannel
val written = messages.writeFullyTo(channel)
_size.getAndAdd(written)
}
-> def writeFullyTo(channel: GatheringByteChannel): Int = {
//mark the buffer's current position
buffer.mark()
var written = 0
//keep writing until the whole buffer has been written
while (written < sizeInBytes)
//write the data through the FileChannel
//this is plain Java NIO again; if NIO is unfamiliar, it is worth reviewing
written += channel.write(buffer)
//restore the position that was marked earlier
buffer.reset()
written
}
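A self-contained sketch in the same spirit (plain Java NIO from Scala, not Kafka code; the file name is made up): write a buffer fully with the same mark / write-loop / reset pattern, and note that the bytes may sit in the page cache until force(true) is called.

import java.nio.ByteBuffer
import java.nio.channels.FileChannel
import java.nio.file.{Paths, StandardOpenOption}

object FileChannelWriteSketch {
  def main(args: Array[String]): Unit = {
    val channel = FileChannel.open(Paths.get("demo.log"),
      StandardOpenOption.CREATE, StandardOpenOption.WRITE)
    val buffer = ByteBuffer.wrap("hello kafka".getBytes("UTF-8"))

    buffer.mark()                     // remember the starting position
    var written = 0
    while (written < buffer.limit())  // a single write() may not write everything
      written += channel.write(buffer)
    buffer.reset()                    // restore the position to the mark

    // Up to this point the bytes may only be in the OS page cache;
    // force(true) asks the OS to flush them (and file metadata) to the physical disk.
    channel.force(true)
    channel.close()
  }
}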
Message format
messages: ByteBufferMessageSet
class ByteBufferMessageSet(val buffer: ByteBuffer) extends MessageSet
abstract class MessageSet extends Iterable[MessageAndOffset]
case class MessageAndOffset(message: Message, offset: Long)
/**
* A message. The format of an N byte message is the following:
*
* 1. 4 byte CRC32 of the message, used to validate the data
* 2. 1 byte "magic" identifier to allow format changes, value is 0 or 1
* magic 0: the offset is an absolute offset
* magic 1: the offset is a relative offset
* 3. 1 byte "attributes" identifier to allow annotations on the message independent of the version
* one byte has 8 bits
* bit 0 ~ 2 : Compression codec.
* 0 : no compression
* 1 : gzip
* 2 : snappy
* 3 : lz4
* bit 3 : Timestamp type
* 0 : create time
* 1 : log append time
* bit 4 ~ 7 : reserved
* 4. (Optional) 8 byte timestamp only if "magic" identifier is greater than 0
* 5. 4 byte key length, containing length K
* 6. K byte key
* 7. 4 byte payload length, containing length V
* 8. V byte payload
*/
class Message
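A quick sanity check of the layout above (my own arithmetic, assuming the magic = 1 format with its 8-byte timestamp): the fixed header is 4 + 1 + 1 + 8 + 4 + 4 = 22 bytes, and the key and payload bytes come on top of that.

object MessageSizeSketch {
  // Header field sizes for the magic = 1 format described above
  val CrcLength = 4
  val MagicLength = 1
  val AttributesLength = 1
  val TimestampLength = 8      // only present when magic > 0
  val KeySizeLength = 4
  val ValueSizeLength = 4

  // Total size of a single magic = 1 message with a K-byte key and V-byte payload
  def messageSize(keyBytes: Int, valueBytes: Int): Int =
    CrcLength + MagicLength + AttributesLength + TimestampLength +
      KeySizeLength + keyBytes + ValueSizeLength + valueBytes

  def main(args: Array[String]): Unit = {
    // a 10-byte key and 100-byte payload -> 22 bytes of header + 110 bytes of data = 132 bytes
    println(messageSize(10, 100))
  }
}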
Step 7: Depending on configuration, flush the in-memory data to disk
//Step 7: depending on configuration, flush the in-memory data to disk
//suppose flushing were configured to happen every ten minutes
//LogManager -> startUp -> relies on the OS flush mechanism
if (unflushedMessages >= config.flushInterval)
//config.flushInterval defaults to Long.MaxValue,
//so this branch effectively never runs: flushing data from memory (the page cache)
//to disk is delegated to the operating system, which has its own mechanisms
//for periodically writing dirty pages out to disk.
//We can still configure a flush frequency ourselves; if it were set to the
//equivalent of ten minutes, this code could run roughly every ten minutes.
flush()
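A small sketch of the flush decision (my assumption, based on this walkthrough, is that unflushedMessages is roughly logEndOffset - recoveryPoint, i.e. the number of messages appended since the last flush):

object FlushDecisionSketch {
  // Simplified version of the check above: flush once enough messages have
  // accumulated since the last flush (the recovery point).
  def shouldFlush(logEndOffset: Long, recoveryPoint: Long, flushInterval: Long): Boolean =
    (logEndOffset - recoveryPoint) >= flushInterval

  def main(args: Array[String]): Unit = {
    // With the default flush.interval.messages of Long.MaxValue the broker never
    // flushes here and leaves it to the OS page cache:
    println(shouldFlush(logEndOffset = 1000000L, recoveryPoint = 0L, flushInterval = Long.MaxValue)) // false
    // With flush.interval.messages = 10000 a flush would trigger:
    println(shouldFlush(logEndOffset = 20000L, recoveryPoint = 5000L, flushInterval = 10000L)) // true
  }
}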
flush() -> flush(this.logEndOffset) -> segment.flush()
def flush() {
LogFlushStats.logFlushTimer.time {
//this is the part we care about most: flushing the log data
log.flush()
//the index flushes are not our focus here
index.flush()
timeIndex.flush()
}
}
-> log.flush()
def flush() = {
//the Java NIO call that forces the data from memory (the page cache) onto the disk
channel.force(true)
}