Sender#sendProducerData wraps up the messages that are actually ready to be sent and hands them off to the NetworkClient (a sketch of how its caller uses the returned poll timeout follows the method).
private long sendProducerData(long now) {
// 1. Determine which nodes we need to, and are able to, send requests to
Cluster cluster = metadata.fetch();
// get the list of partitions with data ready to send
// Determine which nodes requests should be sent to
RecordAccumulator.ReadyCheckResult result = this.accumulator.ready(cluster, now);
// if there are any partitions whose leaders are not known yet, force metadata update
// 2. If any partition's leader replica is unknown (its topic partition is in the middle of a
// leader election, or the topic has expired), mark the cached cluster metadata for update
if (!result.unknownLeaderTopics.isEmpty()) {
// The set of topics with unknown leader contains topics with leader election pending as well as
// topics which may have expired. Add the topic again to metadata to ensure it is included
// and request metadata update, since there are messages to send to the topic.
for (String topic : result.unknownLeaderTopics)
this.metadata.add(topic);
log.debug("Requesting metadata update due to unknown leader topics from the batched records: {}",
result.unknownLeaderTopics);
this.metadata.requestUpdate();
}
// remove any nodes we aren't ready to send to
// 3. Iterate over the candidate nodes and, based on network I/O readiness, drop any node that is not currently usable
Iterator<Node> iter = result.readyNodes.iterator();
long notReadyTimeout = Long.MAX_VALUE;
while (iter.hasNext()) {
Node node = iter.next();
// Check whether the node is ready to receive requests; if not, but a connection to it may be created, initiate one
if (!this.client.ready(node, now)) {
// Remove nodes that are not ready from the ready set
iter.remove();
notReadyTimeout = Math.min(notReadyTimeout, this.client.pollDelayMs(node, now));
}
}
// create produce requests
// 4. Collect the batches to send to each node, keyed by the ID of the node hosting the target leader replica
Map<Integer, List<ProducerBatch>> batches = this.accumulator.drain(cluster, result.readyNodes, this.maxRequestSize, now);
addToInflightBatches(batches);
if (guaranteeMessageOrder) {
// 5. If strict message ordering must be guaranteed, mute all the drained topic partitions
// so that at most one unfinished batch per partition is in flight at any time
for (List<ProducerBatch> batchList : batches.values()) {
for (ProducerBatch batch : batchList)
this.accumulator.mutePartition(batch.topicPartition);
}
}
// 6. Fail batches that have expired locally with a TimeoutException and release their memory
accumulator.resetNextBatchExpiryTime();
List<ProducerBatch> expiredInflightBatches = getExpiredInflightBatches(now);
List<ProducerBatch> expiredBatches = this.accumulator.expiredBatches(now);
expiredBatches.addAll(expiredInflightBatches);
// Reset the producer id if an expired batch has previously been sent to the broker. Also update the metrics
// for expired batches. see the documentation of @TransactionState.resetProducerId to understand why
// we need to reset the producer id here.
if (!expiredBatches.isEmpty())
log.trace("Expired {} batches in accumulator", expiredBatches.size());
for (ProducerBatch expiredBatch : expiredBatches) {
String errorMessage = "Expiring " + expiredBatch.recordCount + " record(s) for " + expiredBatch.topicPartition
+ ":" + (now - expiredBatch.createdMs) + " ms has passed since batch creation";
failBatch(expiredBatch, -1, NO_TIMESTAMP, new TimeoutException(errorMessage), false);
if (transactionManager != null && expiredBatch.inRetry()) {
// This ensures that no new batches are drained until the current in flight batches are fully resolved.
transactionManager.markSequenceUnresolved(expiredBatch.topicPartition);
}
}
sensors.updateProduceRequestMetrics(batches);
// If we have any nodes that are ready to send + have sendable data, poll with 0 timeout so this can immediately
// loop and try sending more data. Otherwise, the timeout will be the smaller value between next batch expiry
// time, and the delay time for checking data availability. Note that the nodes may have data that isn't yet
// sendable due to lingering, backing off, etc. This specifically does not include nodes with sendable data
// that aren't ready to send since they would cause busy looping.
// If any node has data ready to send, pollTimeout is set to 0 below so the loop can run again
// immediately, shortening how long the remaining messages sit in the accumulator
long pollTimeout = Math.min(result.nextReadyCheckDelayMs, notReadyTimeout);
pollTimeout = Math.min(pollTimeout, this.accumulator.nextExpiryTimeMs() - now);
pollTimeout = Math.max(pollTimeout, 0);
if (!result.readyNodes.isEmpty()) {
log.trace("Nodes with data ready to send: {}", result.readyNodes);
// if some partitions are already ready to be sent, the select time would be 0;
// otherwise if some partition already has some data accumulated but not ready yet,
// the select time will be the time difference between now and its linger expiry time;
// otherwise the select time will be the time difference between now and the metadata expiry time;
pollTimeout = 0;
}
// 7. Send the produce requests to the brokers and register handlers for their responses
sendProduceRequests(batches, now);
return pollTimeout;
}
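For context, the returned pollTimeout feeds directly into the network poll. A simplified sketch of the caller, modeled on Sender's run loop with the transaction-manager handling omitted:
void runOnce() {
    long currentTimeMs = time.milliseconds();
    // Queue up produce requests for all ready nodes...
    long pollTimeout = sendProducerData(currentTimeMs);
    // ...then perform the actual network I/O, blocking at most pollTimeout ms
    // when there is nothing to send (0 when data is ready, so we loop right back)
    client.poll(pollTimeout, currentTimeMs);
}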
RecordAccumulator#ready
Step 1, which determines the nodes that messages should be delivered to, is implemented in RecordAccumulator#ready:
public ReadyCheckResult ready(Cluster cluster, long nowMs) {
// Nodes ready to receive requests
Set<Node> readyNodes = new HashSet<>();
// Delay until the next ready check should run
long nextReadyCheckDelayMs = Long.MAX_VALUE;
// Topics whose partitions have no known leader replica
Set<String> unknownLeaderTopics = new HashSet<>();
// Whether any thread is waiting for the BufferPool to allocate space
boolean exhausted = this.free.queued() > 0;
// Iterate over each topic partition and its ProducerBatch deque, evaluating the node hosting the partition's leader replica
for (Map.Entry<TopicPartition, Deque<ProducerBatch>> entry : this.batches.entrySet()) {
Deque<ProducerBatch> deque = entry.getValue();
synchronized (deque) {
// When producing to a large number of partitions, this path is hot and deques are often empty.
// We check whether a batch exists first to avoid the more expensive checks whenever possible.
ProducerBatch batch = deque.peekFirst();
if (batch != null) {
TopicPartition part = entry.getKey();
// Look up the node hosting this partition's leader replica
Node leader = cluster.leaderFor(part);
// The partition's leader is unknown, but there are messages waiting to be sent to it
if (leader == null) {
// This is a partition for which leader is not known, but messages are available to send.
// Note that entries are currently not removed from batches when deque is empty.
unknownLeaderTopics.add(part.topic());
} else if (!readyNodes.contains(leader) && !isMuted(part, nowMs)) {
// The isMuted check above: when ordering must be guaranteed, a partition with an unfinished in-flight batch is skipped
long waitedTimeMs = batch.waitedTimeMs(nowMs);
// This is a retry, and the retry backoff interval has not yet elapsed
boolean backingOff = batch.attempts() > 0 && waitedTimeMs < retryBackoffMs;
long timeToWaitMs = backingOff ? retryBackoffMs : lingerMs;
boolean full = deque.size() > 1 || batch.isFull();
boolean expired = waitedTimeMs >= timeToWaitMs;
// Whether a batch for this node is sendable; any one of the following conditions suffices:
boolean sendable = full // 1. the deque holds more than one batch, or the first batch is full
|| expired // 2. the batch has waited at least lingerMs (or its retry backoff has elapsed)
|| exhausted // 3. another thread is waiting for BufferPool space, i.e. the local buffer is exhausted
|| closed // 4. the producer has been closed
|| flushInProgress(); // 5. a thread is waiting for a flush to complete
if (sendable && !backingOff) {
// The batch is sendable and not inside its backoff window, so record the leader's node as ready
readyNodes.add(leader);
} else {
long timeLeftMs = Math.max(timeToWaitMs - waitedTimeMs, 0);
// Note that this results in a conservative estimate since an un-sendable partition may have
// a leader that will later be found to have sendable data. However, this is good enough
// since we'll just wake up and then sleep again for the remaining time.
// Update the delay before the next ready check
nextReadyCheckDelayMs = Math.min(timeLeftMs, nextReadyCheckDelayMs);
}
}
}
}
}
return new ReadyCheckResult(readyNodes, nextReadyCheckDelayMs, unknownLeaderTopics);
}
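The thresholds consulted above map directly onto producer configuration. A hedged illustration (values are arbitrary examples, not recommendations):
import java.util.Properties;
import org.apache.kafka.clients.producer.ProducerConfig;

public class ReadyTuning {
    public static Properties producerProps() {
        Properties props = new Properties();
        // linger.ms -> lingerMs: how long a non-full batch waits before it counts as sendable
        props.put(ProducerConfig.LINGER_MS_CONFIG, 10);
        // retry.backoff.ms -> retryBackoffMs: minimum wait before a retried batch is sendable again
        props.put(ProducerConfig.RETRY_BACKOFF_MS_CONFIG, 100);
        // batch.size: the "full" threshold for a single ProducerBatch
        props.put(ProducerConfig.BATCH_SIZE_CONFIG, 16384);
        // buffer.memory: sizes the BufferPool behind the "exhausted" check
        props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, 32 * 1024 * 1024);
        return props;
    }
}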
RecordAccumulator#drain
Once we know which nodes to deliver messages to, the next step is to gather the data destined for each of them. Step 4 is implemented in RecordAccumulator#drain:
public Map<Integer, List<ProducerBatch>> drain(Cluster cluster, Set<Node> nodes, int maxSize, long now) {
if (nodes.isEmpty())
return Collections.emptyMap();
// The collated result, keyed by target node ID
Map<Integer, List<ProducerBatch>> batches = new HashMap<>();
for (Node node : nodes) {
List<ProducerBatch> ready = drainBatchesForOneNode(cluster, node, maxSize, now);
batches.put(node.id(), ready);
}
return batches;
}
private List<ProducerBatch> drainBatchesForOneNode(Cluster cluster, Node node, int maxSize, long now) {
int size = 0;
// Partitions whose leader replica lives on this node
List<PartitionInfo> parts = cluster.partitionsForNode(node.id());
// Batches to be sent to this node
List<ProducerBatch> ready = new ArrayList<>();
/* to make starvation less likely this loop doesn't start at 0 */
/*
 * drainIndex records where the previous drain stopped, and this drain resumes from
 * that position. Starting from 0 every time could starve the partitions at the tail
 * of the list; think of it as a simple load-balancing (fairness) strategy.
 */
int start = drainIndex = drainIndex % parts.size();
do {
PartitionInfo part = parts.get(drainIndex);
TopicPartition tp = new TopicPartition(part.topic(), part.partition());
this.drainIndex = (this.drainIndex + 1) % parts.size();
// Only proceed if the partition has no in-flight batches.
// When strict ordering must be guaranteed, there must not be multiple in-flight batches for the same partition
if (isMuted(tp, now))
continue;
Deque<ProducerBatch> deque = getDeque(tp);
if (deque == null)
continue;
synchronized (deque) {
// invariant: !isMuted(tp,now) && deque != null
// Peek at the first batch in this partition's deque
ProducerBatch first = deque.peekFirst();
if (first == null)
continue;
// first != null
// This is a retry and the backoff interval has not yet elapsed
boolean backoff = first.attempts() > 0 && first.waitedTimeMs(now) < retryBackoffMs;
// Only drain the batch if it is not during backoff period,
// i.e. only first-time sends or retries whose backoff has elapsed are drained.
if (backoff)
continue;
if (size + first.estimatedSizeInBytes() > maxSize && !ready.isEmpty()) {
// there is a rare case that a single batch size is larger than the request size due to
// compression; in this case we will still eventually send this batch in a single request
// The accumulated size has reached the per-request cap; stop to keep a single request from growing too large
break;
} else {
if (shouldStopDrainBatchesForPartition(first, tp))
break;
boolean isTransactional = transactionManager != null && transactionManager.isTransactional();
ProducerIdAndEpoch producerIdAndEpoch =
transactionManager != null ? transactionManager.producerIdAndEpoch() : null;
// Take only the first batch from each partition's deque and add it to the ready list, giving every partition a turn: fair, and avoids starvation
ProducerBatch batch = deque.pollFirst();
if (producerIdAndEpoch != null && !batch.hasSequence()) {
// If the batch already has an assigned sequence, then we should not change the producer id and
// sequence number, since this may introduce duplicates. In particular, the previous attempt
// may actually have been accepted, and if we change the producer id and sequence here, this
// attempt will also be accepted, causing a duplicate.
//
// Additionally, we update the next sequence number bound for the partition, and also have
// the transaction manager track the batch so as to ensure that sequence ordering is maintained
// even if we receive out of order responses.
batch.setProducerState(producerIdAndEpoch, transactionManager.sequenceNumber(batch.topicPartition), isTransactional);
transactionManager.incrementSequenceNumber(batch.topicPartition, batch.recordCount);
log.debug("Assigned producerId {} and producerEpoch {} to batch with base sequence " +
"{} being sent to partition {}", producerIdAndEpoch.producerId,
producerIdAndEpoch.epoch, batch.baseSequence(), tp);
transactionManager.addInFlightBatch(batch);
}
// Close the batch, making it read-only
batch.close();
size += batch.records().sizeInBytes();
ready.add(batch);
// Record when the batch was drained (drainedMs)
batch.drained(now);
}
}
} while (start != drainIndex);
return ready;
}
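The fairness effect of drainIndex is easier to see in isolation. A minimal, self-contained sketch (hypothetical demo code, not from Kafka):
import java.util.List;

public class DrainIndexDemo {
    public static void main(String[] args) {
        List<String> parts = List.of("p0", "p1", "p2", "p3");
        int drainIndex = 2; // position where the previous drain stopped
        int start = drainIndex;
        do {
            System.out.println("draining " + parts.get(drainIndex));
            drainIndex = (drainIndex + 1) % parts.size();
        } while (start != drainIndex);
        // Prints p2, p3, p0, p1: each pass resumes where the last one stopped,
        // so partitions at the tail of the list are not starved.
    }
}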
Sender#sendProduceRequests
The sending of the messages (Step 7) is implemented in Sender#sendProduceRequests.
The main logic is to build a ClientRequest for each target node and hand it to NetworkClient#send, which queues the request on the network I/O channel (KafkaChannel) and also caches it in InFlightRequests; when the broker's response arrives, the cached ClientRequest is used to invoke the corresponding callback. Finally, NetworkClient#poll performs the actual network request and response handling:
private void sendProduceRequests(Map<Integer, List<ProducerBatch>> collated, long now) {
// Iterate over the per-node batch lists, keyed by target node ID
for (Map.Entry<Integer, List<ProducerBatch>> entry : collated.entrySet())
sendProduceRequest(now, entry.getKey(), acks, requestTimeoutMs, entry.getValue());
}
private void sendProduceRequest(long now, int destination, short acks, int timeout, List<ProducerBatch> batches) {
if (batches.isEmpty())
return;
Map<TopicPartition, MemoryRecords> produceRecordsByPartition = new HashMap<>(batches.size());
final Map<TopicPartition, ProducerBatch> recordsByPartition = new HashMap<>(batches.size());
// find the minimum magic version used when creating the record sets
byte minUsedMagic = apiVersions.maxUsableProduceMagic();
for (ProducerBatch batch : batches) {
if (batch.magic() < minUsedMagic)
minUsedMagic = batch.magic();
}
// Collate the batches into produceRecordsByPartition and recordsByPartition
for (ProducerBatch batch : batches) {
TopicPartition tp = batch.topicPartition;
MemoryRecords records = batch.records();
// down convert if necessary to the minimum magic used. In general, there can be a delay between the time
// that the producer starts building the batch and the time that we send the request, and we may have
// chosen the message format based on out-dated metadata. In the worst case, we optimistically chose to use
// the new message format, but found that the broker didn't support it, so we need to down-convert on the
// client before sending. This is intended to handle edge cases around cluster upgrades where brokers may
// not all support the same message format version. For example, if a partition migrates from a broker
// which is supporting the new magic version to one which doesn't, then we will need to convert.
if (!records.hasMatchingMagic(minUsedMagic))
records = batch.records().downConvert(minUsedMagic, 0, time).records();
produceRecordsByPartition.put(tp, records);
recordsByPartition.put(tp, batch);
}
String transactionalId = null;
if (transactionManager != null && transactionManager.isTransactional()) {
transactionalId = transactionManager.transactionalId();
}
// Create the ProduceRequest builder
ProduceRequest.Builder requestBuilder = ProduceRequest.Builder.forMagic(minUsedMagic, acks, timeout,
produceRecordsByPartition, transactionalId);
// Callback used to handle the broker's response
RequestCompletionHandler callback = new RequestCompletionHandler() {
public void onComplete(ClientResponse response) {
handleProduceResponse(response, recordsByPartition, time.milliseconds());
}
};
String nodeId = Integer.toString(destination);
// Create the ClientRequest; acks != 0 means a response from the broker is expected
ClientRequest clientRequest = client.newClientRequest(nodeId, requestBuilder, now, acks != 0,
requestTimeoutMs, callback);
// Queue the request on the network I/O channel (KafkaChannel) and cache it in InFlightRequests
client.send(clientRequest, now);
log.trace("Sent produce request to {}: {}", nodeId, requestBuilder);
}
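Two producer configs tie back into this method (a hedged example, values illustrative): acks determines whether the ClientRequest expects a response (the acks != 0 check above), and limiting in-flight requests to one per connection is what enables guaranteeMessageOrder and the mutePartition logic from step 5:
import java.util.Properties;
import org.apache.kafka.clients.producer.ProducerConfig;

public class OrderingProps {
    public static Properties producerProps() {
        Properties props = new Properties();
        // acks != 0, so ClientRequests are created with expectResponse = true
        props.put(ProducerConfig.ACKS_CONFIG, "all");
        // A single in-flight request per connection turns on guaranteeMessageOrder
        // in Sender, which drives the mutePartition logic seen in step 5
        props.put(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, 1);
        return props;
    }
}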