Sender#sendProducerData wraps up the messages that are actually ready to be sent and hands them off to the NetworkClient (a sketch of how its caller uses the returned poll timeout follows the method).
private long sendProducerData(long now) {
// 1. Determine which nodes we need to, and are able to, send requests to
Cluster cluster = metadata.fetch();
// get the list of partitions with data ready to send
// Determine which nodes requests should be sent to
RecordAccumulator.ReadyCheckResult result = this.accumulator.ready(cluster, now);
// if there are any partitions whose leaders are not known yet, force metadata update
// 2. If any partition's leader replica is unknown (its topic partition is in the middle of a
// leader election, or the topic has expired), mark the cached cluster metadata for update
if (!result.unknownLeaderTopics.isEmpty()) {
// The set of topics with unknown leader contains topics with leader election pending as well as
// topics which may have expired. Add the topic again to metadata to ensure it is included
// and request metadata update, since there are messages to send to the topic.
for (String topic : result.unknownLeaderTopics)
this.metadata.add(topic);
log.debug("Requesting metadata update due to unknown leader topics from the batched records: {}",
result.unknownLeaderTopics);
this.metadata.requestUpdate();
}
// remove any nodes we aren't ready to send to
// 3. Iterate over the candidate nodes and, based on network I/O readiness, drop any node that is not currently usable
Iterator<Node> iter = result.readyNodes.iterator();
long notReadyTimeout = Long.MAX_VALUE;
while (iter.hasNext()) {
Node node = iter.next();
// Check whether the node is ready to receive requests; if not, but a connection to it may be created, initiate one
if (!this.client.ready(node, now)) {
// Remove nodes that are not ready from the ready set
iter.remove();
notReadyTimeout = Math.min(notReadyTimeout, this.client.pollDelayMs(node, now));
}
}
// create produce requests
// 4. Collect the batches to send to each node, keyed by the ID of the node hosting the target leader replica
Map<Integer, List<ProducerBatch>> batches = this.accumulator.drain(cluster, result.readyNodes, this.maxRequestSize, now);
addToInflightBatches(batches);
if (guaranteeMessageOrder) {
// 5. If strict message ordering must be guaranteed, mute all the drained topic partitions
// so that at most one unfinished batch per partition is in flight at any time
for (List<ProducerBatch> batchList : batches.values()) {
for (ProducerBatch batch : batchList)
this.accumulator.mutePartition(batch.topicPartition);
}
}
// 6. Fail batches that have expired locally with a TimeoutException and release their memory
accumulator.resetNextBatchExpiryTime();
List<ProducerBatch> expiredInflightBatches = getExpiredInflightBatches(now);
List<ProducerBatch> expiredBatches = this.accumulator.expiredBatches(now);
expiredBatches.addAll(expiredInflightBatches);
// Reset the producer id if an expired batch has previously been sent to the broker. Also update the metrics
// for expired batches. see the documentation of @TransactionState.resetProducerId to understand why
// we need to reset the producer id here.
if (!expiredBatches.isEmpty())
log.trace("Expired {} batches in accumulator", expiredBatches.size());
for (ProducerBatch expiredBatch : expiredBatches) {
String errorMessage = "Expiring " + expiredBatch.recordCount + " record(s) for " + expiredBatch.topicPartition
+ ":" + (now - expiredBatch.createdMs) + " ms has passed since batch creation";
failBatch(expiredBatch, -1, NO_TIMESTAMP, new TimeoutException(errorMessage), false);
if (transactionManager != null && expiredBatch.inRetry()) {
// This ensures that no new batches are drained until the current in flight batches are fully resolved.
transactionManager.markSequenceUnresolved(expiredBatch.topicPartition);
}
}
sensors.updateProduceRequestMetrics(batches);
// If we have any nodes that are ready to send + have sendable data, poll with 0 timeout so this can immediately
// loop and try sending more data. Otherwise, the timeout will be the smaller value between next batch expiry
// time, and the delay time for checking data availability. Note that the nodes may have data that isn't yet
// sendable due to lingering, backing off, etc. This specifically does not include nodes with sendable data
// that aren't ready to send since they would cause busy looping.
// If any node has data ready to send, pollTimeout is set to 0 below so the loop can run again
// immediately, shortening how long the remaining messages sit in the accumulator
long pollTimeout = Math.min(result.nextReadyCheckDelayMs, notReadyTimeout);
pollTimeout = Math.min(pollTimeout, this.accumulator.nextExpiryTimeMs() - now);
pollTimeout = Math.max(pollTimeout, 0);
if (!result.readyNodes.isEmpty()) {
log.trace("Nodes with data ready to send: {}", result.readyNodes);
// if some partitions are already ready to be sent, the select time would be 0;
// otherwise if some partition already has some data accumulated but not ready yet,
// the select time will be the time difference between now and its linger expiry time;
// otherwise the select time will be the time difference between now and the metadata expiry time;
pollTimeout = 0;
}
// 7. Send the produce requests to the brokers and register handlers for their responses
sendProduceRequests(batches, now);
return pollTimeout;
}
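For context, the returned pollTimeout feeds directly into the network poll. A simplified sketch of the caller, modeled on Sender's run loop with the transaction-manager handling omitted:
void runOnce() {
    long currentTimeMs = time.milliseconds();
    // Queue up produce requests for all ready nodes...
    long pollTimeout = sendProducerData(currentTimeMs);
    // ...then perform the actual network I/O, blocking at most pollTimeout ms
    // when there is nothing to send (0 when data is ready, so we loop right back)
    client.poll(pollTimeout, currentTimeMs);
}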
RecordAccumulator#ready
Step 1, which determines the nodes that messages should be delivered to, is implemented in RecordAccumulator#ready:
public ReadyCheckResult ready(Cluster cluster, long nowMs) {
// Nodes ready to receive requests
Set<Node> readyNodes = new HashSet<>();
// Delay until the next ready check should run
long nextReadyCheckDelayMs = Long.MAX_VALUE;
// Topics whose partitions have no known leader replica
Set<String> unknownLeaderTopics = new HashSet<>();
// Whether any thread is waiting for the BufferPool to allocate space
boolean exhausted = this.free.queued() > 0;
// Iterate over each topic partition and its ProducerBatch deque, evaluating the node hosting the partition's leader replica
for (Map.Entry<TopicPartition, Deque<ProducerBatch>> entry : this.batches.entrySet()) {
Deque<ProducerBatch> deque = entry.getValue();
synchronized (deque) {
// When producing to a large number of partitions, this path is hot and deques are often empty.
// We check whether a batch exists first to avoid the more expensive checks whenever possible.
ProducerBatch batch = deque.peekFirst();
if (batch != null) {
TopicPartition part = entry.getKey();
// Look up the node hosting this partition's leader replica
Node leader = cluster.leaderFor(part);
// The partition's leader is unknown, but there are messages waiting to be sent to it
if (leader == null) {
// This is a partition for which leader is not known, but messages are available to send.
// Note that entries are currently not removed from batches when deque is empty.
unknownLeaderTopics.add(part.topic());
} else if (!readyNodes.contains(leader) && !isMuted(part, nowMs)) {
// The isMuted check above: when ordering must be guaranteed, a partition with an unfinished in-flight batch is skipped
long waitedTimeMs = batch.waitedTimeMs(nowMs);
// This is a retry, and the retry backoff interval has not yet elapsed
boolean backingOff = batch.attempts() > 0 && waitedTimeMs < retryBackoffMs;
long timeToWaitMs = backingOff ? retryBackoffMs : lingerMs;
boolean full = deque.size() > 1 || batch.isFull();
boolean expired = waitedTimeMs >= timeToWaitMs;
// Whether a batch for this node is sendable; any one of the following conditions suffices:
boolean sendable = full // 1. the deque holds more than one batch, or the first batch is full
|| expired // 2. the batch has waited at least lingerMs (or its retry backoff has elapsed)
|| exhausted // 3. another thread is waiting for BufferPool space, i.e. the local buffer is exhausted
|| closed // 4. the producer has been closed
|| flushInProgress(); // 5. a thread is waiting for a flush to complete
if (sendable && !backingOff) {
// The batch is sendable and not inside its backoff window, so record the leader's node as ready
readyNodes.add(leader);
} else {
long timeLeftMs = Math.max(timeToWaitMs - waitedTimeMs, 0);
// Note that this results in a conservative estimate since an un-sendable partition may have
// a leader that will later be found to have sendable data. However, this is good enough
// since we'll just wake up and then sleep again for the remaining time.
// Update the delay before the next ready check
nextReadyCheckDelayMs = Math.min(timeLeftMs, nextReadyCheckDelayMs);
}
}
}
}
}
return new ReadyCheckResult(readyNodes, nextReadyCheckDelayMs, unknownLeaderTopics);
}
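The thresholds consulted above map directly onto producer configuration. A hedged illustration (values are arbitrary examples, not recommendations):
import java.util.Properties;
import org.apache.kafka.clients.producer.ProducerConfig;

public class ReadyTuning {
    public static Properties producerProps() {
        Properties props = new Properties();
        // linger.ms -> lingerMs: how long a non-full batch waits before it counts as sendable
        props.put(ProducerConfig.LINGER_MS_CONFIG, 10);
        // retry.backoff.ms -> retryBackoffMs: minimum wait before a retried batch is sendable again
        props.put(ProducerConfig.RETRY_BACKOFF_MS_CONFIG, 100);
        // batch.size: the "full" threshold for a single ProducerBatch
        props.put(ProducerConfig.BATCH_SIZE_CONFIG, 16384);
        // buffer.memory: sizes the BufferPool behind the "exhausted" check
        props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, 32 * 1024 * 1024);
        return props;
    }
}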
RecordAccumulator#drain
Once we know which nodes to deliver messages to, the next step is to gather the data destined for each of them. Step 4 is implemented in RecordAccumulator#drain:
public Map<Integer, List<ProducerBatch>> drain(Cluster cluster, Set<Node> nodes, int maxSize, long now) {
if (nodes.isEmpty())
return Collections.emptyMap();
// The collated result, keyed by target node ID
Map<Integer, List<ProducerBatch>> batches = new HashMap<>();
for (Node node : nodes) {
List<ProducerBatch> ready = drainBatchesForOneNode(cluster, node, maxSize, now);
batches.put(node.id(), ready);
}
return batches;
}
private List<ProducerBatch> drainBatchesForOneNode(Cluster cluster, Node node, int maxSize, long now) {
int size = 0;
// Partitions whose leader replica lives on this node
List<PartitionInfo> parts = cluster.partitionsForNode(node.id());
// Batches to be sent to this node
List<ProducerBatch> ready = new ArrayList<>();
/* to make starvation less likely this loop doesn't start at 0 */
/*
 * drainIndex records where the previous drain stopped, and this drain resumes from
 * that position. Starting from 0 every time could starve the partitions at the tail
 * of the list; think of it as a simple load-balancing (fairness) strategy.
 */
int start = drainIndex = drainIndex % parts.size();
do {
PartitionInfo part = parts.get(drainIndex);
TopicPartition tp = new TopicPartition(part.topic(), part.partition());
this.drainIndex = (this.drainIndex + 1) % parts.size();
// Only proceed if the partition has no in-flight batches.
// When strict ordering must be guaranteed, there must not be multiple in-flight batches for the same partition
if (isMuted(tp, now))
continue;
Deque<ProducerBatch> deque = getDeque(tp);
if (deque == null)
continue;
synchronized (deque) {
// invariant: !isMuted(tp,now) && deque != null
// Peek at the first batch in this partition's deque
ProducerBatch first = deque.peekFirst();
if (first == null)
continue;
// first != null
// This is a retry and the backoff interval has not yet elapsed
boolean backoff = first.attempts() > 0 && first.waitedTimeMs(now) < retryBackoffMs;
// Only drain the batch if it is not during backoff period,
// i.e. only first-time sends or retries whose backoff has elapsed are drained.
if (backoff)
continue;
if (size + first.estimatedSizeInBytes() > maxSize && !ready.isEmpty()) {
// there is a rare case that a single batch size is larger than the request size due to
// compression; in this case we will still eventually send this batch in a single request
// The accumulated size has reached the per-request cap; stop to keep a single request from growing too large
break;
} else {
if (shouldStopDrainBatchesForPartition(first, tp))
break;
boolean isTransactional = transactionManager != null && transactionManager.isTransactional();
ProducerIdAndEpoch producerIdAndEpoch =
transactionManager != null ? transactionManager.producerIdAndEpoch() : null;
// Take only the first batch from each partition's deque and add it to the ready list, giving every partition a turn: fair, and avoids starvation
ProducerBatch batch = deque.pollFirst();
if (producerIdAndEpoch != null && !batch.hasSequence()) {
// If the batch already has an assigned sequence, then we should not change the producer id and
// sequence number, since this may introduce duplicates. In particular, the previous attempt
// may actually have been accepted, and if we change the producer id and sequence here, this
// attempt will also be accepted, causing a duplicate.
//
// Additionally, we update the next sequence number bound for the partition, and also have
// the transaction manager track the batch so as to ensure that sequence ordering is maintained
// even if we receive out of order responses.
batch.setProducerState(producerIdAndEpoch, transactionManager.sequenceNumber(batch.topicPartition), isTransactional);
transactionManager.incrementSequenceNumber(batch.topicPartition, batch.recordCount);
log.debug("Assigned producerId {} and producerEpoch {} to batch with base sequence " +
"{} being sent to partition {}", producerIdAndEpoch.producerId,
producerIdAndEpoch.epoch, batch.baseSequence(), tp);
transactionManager.addInFlightBatch(batch);
}
// Close the batch, making it read-only
batch.close();
size += batch.records().sizeInBytes();
ready.add(batch);
// Record when the batch was drained (drainedMs)
batch.drained(now);
}
}
} while (start != drainIndex);
return ready;
}
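The fairness effect of drainIndex is easier to see in isolation. A minimal, self-contained sketch (hypothetical demo code, not from Kafka):
import java.util.List;

public class DrainIndexDemo {
    public static void main(String[] args) {
        List<String> parts = List.of("p0", "p1", "p2", "p3");
        int drainIndex = 2; // position where the previous drain stopped
        int start = drainIndex;
        do {
            System.out.println("draining " + parts.get(drainIndex));
            drainIndex = (drainIndex + 1) % parts.size();
        } while (start != drainIndex);
        // Prints p2, p3, p0, p1: each pass resumes where the last one stopped,
        // so partitions at the tail of the list are not starved.
    }
}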
Sender#sendProduceRequests
The sending of the messages (Step 7) is implemented in Sender#sendProduceRequests.
The main logic is to build a ClientRequest for each target node and hand it to NetworkClient#send, which queues the request on the network I/O channel (KafkaChannel) and also caches it in InFlightRequests; when the broker's response arrives, the cached ClientRequest is used to invoke the corresponding callback. Finally, NetworkClient#poll performs the actual network request and response handling:
private void sendProduceRequests(Map<Integer, List<ProducerBatch>> collated, long now) {
// Iterate over the per-node batch lists, keyed by target node ID
for (Map.Entry<Integer, List<ProducerBatch>> entry : collated.entrySet())
sendProduceRequest(now, entry.getKey(), acks, requestTimeoutMs, entry.getValue());
}
private void sendProduceRequest(long now, int destination, short acks, int timeout, List<ProducerBatch> batches) {
if (batches.isEmpty())
return;
Map<TopicPartition, MemoryRecords> produceRecordsByPartition = new HashMap<>(batches.size());
final Map<TopicPartition, ProducerBatch> recordsByPartition = new HashMap<>(batches.size());
// find the minimum magic version used when creating the record sets
byte minUsedMagic = apiVersions.maxUsableProduceMagic();
for (ProducerBatch batch : batches) {
if (batch.magic() < minUsedMagic)
minUsedMagic = batch.magic();
}
// Collate the batches into produceRecordsByPartition and recordsByPartition
for (ProducerBatch batch : batches) {
TopicPartition tp = batch.topicPartition;
MemoryRecords records = batch.records();
// down convert if necessary to the minimum magic used. In general, there can be a delay between the time
// that the producer starts building the batch and the time that we send the request, and we may have
// chosen the message format based on out-dated metadata. In the worst case, we optimistically chose to use
// the new message format, but found that the broker didn't support it, so we need to down-convert on the
// client before sending. This is intended to handle edge cases around cluster upgrades where brokers may
// not all support the same message format version. For example, if a partition migrates from a broker
// which is supporting the new magic version to one which doesn't, then we will need to convert.
if (!records.hasMatchingMagic(minUsedMagic))
records = batch.records().downConvert(minUsedMagic, 0, time).records();
produceRecordsByPartition.put(tp, records);
recordsByPartition.put(tp, batch);
}
String transactionalId = null;
if (transactionManager != null && transactionManager.isTransactional()) {
transactionalId = transactionManager.transactionalId();
}
// Create the ProduceRequest builder
ProduceRequest.Builder requestBuilder = ProduceRequest.Builder.forMagic(minUsedMagic, acks, timeout,
produceRecordsByPartition, transactionalId);
// Callback used to handle the broker's response
RequestCompletionHandler callback = new RequestCompletionHandler() {
public void onComplete(ClientResponse response) {
handleProduceResponse(response, recordsByPartition, time.milliseconds());
}
};
String nodeId = Integer.toString(destination);
// Create the ClientRequest; acks != 0 means a response from the broker is expected
ClientRequest clientRequest = client.newClientRequest(nodeId, requestBuilder, now, acks != 0,
requestTimeoutMs, callback);
// Queue the request on the network I/O channel (KafkaChannel) and cache it in InFlightRequests
client.send(clientRequest, now);
log.trace("Sent produce request to {}: {}", nodeId, requestBuilder);
}
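Two producer configs tie back into this method (a hedged example, values illustrative): acks determines whether the ClientRequest expects a response (the acks != 0 check above), and limiting in-flight requests to one per connection is what enables guaranteeMessageOrder and the mutePartition logic from step 5:
import java.util.Properties;
import org.apache.kafka.clients.producer.ProducerConfig;

public class OrderingProps {
    public static Properties producerProps() {
        Properties props = new Properties();
        // acks != 0, so ClientRequests are created with expectResponse = true
        props.put(ProducerConfig.ACKS_CONFIG, "all");
        // A single in-flight request per connection turns on guaranteeMessageOrder
        // in Sender, which drives the mutePartition logic seen in step 5
        props.put(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, 1);
        return props;
    }
}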