之前介绍了按照时间空间老化消息的定时任务,本篇来看一下 LogCleaner 线程,如果在配置中指定了 log.cleaner.enable=true,那么在 LogManager#startup 方法的最后会调用 LogCleaner#startup 方法启动 LogCleaner 线程对日志数据执行清理工作。本篇针对配置了 cleanup.policy=compact 策略的清理方式,kafka 的 LogCleaner 线程就会对具备相同 key 的消息进行清理操作,仅保留当前具备最大 offset 的 key 的消息。
从 LogCleaner#startup 方法开始,startup 方法会启动注册在 LogCleaner 中的 CleanerThread 线程集合。CleanerThread 的 doWork 方法的核心是 tryCleanFilthiestLog(其内部调用下面的 cleanFilthiestLog 方法),选取一个最需要被清理的 LogSegment 区间,并执行清理工作。
/**
 * Run one round of cleaning: compact the filthiest compacted log (if any
 * qualifies), then apply retention-based deletion to every deletable log.
 *
 * @return true if a compacted log was cleaned, false if nothing was cleanable
 */
private def cleanFilthiestLog(): Boolean = {
  val stats = new PreCleanStats()
  // Pick the log most in need of compaction and clean it.
  // fold: None => nothing to do (caller backs off); Some => clean and report success.
  val didClean = cleanerManager.grabFilthiestCompactedLog(time, stats).fold(false) { cleanable =>
    this.lastPreCleanStats = stats
    try {
      // Perform the actual compaction of the selected dirty range.
      cleanLog(cleanable)
      true
    } catch {
      // Shutdown/control throwables must propagate untouched.
      case e @ (_: ThreadShutdownException | _: ControlThrowable) => throw e
      case e: Exception => throw new LogCleaningException(cleanable.log, e.getMessage, e)
    }
  }
  // Logs whose cleanup policy includes delete (and possibly compact); fetching
  // them also marks their partitions as LogCleaningInProgress.
  val toDelete: Iterable[(TopicPartition, Log)] = cleanerManager.deletableLogs()
  try {
    for ((_, log) <- toDelete) {
      try {
        // Drop segments that exceeded the retention time/size limits; the
        // delete-policy check happens inside deleteOldSegments itself.
        log.deleteOldSegments()
      } catch {
        case e @ (_: ThreadShutdownException | _: ControlThrowable) => throw e
        case e: Exception => throw new LogCleaningException(log, e.getMessage, e)
      }
    }
  } finally {
    // Always clear the LogCleaningInProgress state, even if deletion failed.
    cleanerManager.doneDeleting(toDelete.map(_._1))
  }
  didClean
}
其中 grabFilthiestCompactedLog 方法选取下一个最需要被清理的 LogToClean 对象,然后调用 Cleaner#clean 方法依据该对象执行清理操作。
/**
 * Select the compacted log most in need of cleaning and mark its partition as
 * LogCleaningInProgress. A log is eligible when it uses the compact policy, is
 * not already being cleaned or marked uncleanable, has cleanable bytes, and
 * either must be compacted now or exceeds min.cleanable.dirty.ratio.
 *
 * @param time          clock used to stamp this run and compute cleanable offsets
 * @param preCleanStats accumulator for pre-clean statistics
 * @return the filthiest cleanable log, or None if no log qualifies
 */
def grabFilthiestCompactedLog(time: Time, preCleanStats: PreCleanStats = new PreCleanStats()): Option[LogToClean] = {
  inLock(lock) {
    val nowMs = time.milliseconds
    this.timeOfLastRun = nowMs
    // Per-partition offsets up to which cleaning already completed, loaded from
    // the cleaner-offset-checkpoint file in each log directory.
    val checkpoints = allCleanerCheckpoints
    val dirtyLogs = logs
      .filter { case (topicPartition, log) =>
        // Keep only compacted logs that are neither mid-clean nor uncleanable.
        log.config.compact && !inProgress.contains(topicPartition) && !isUncleanablePartition(log, topicPartition)
      }
      .map { case (topicPartition, log) =>
        try {
          // Compute the dirty range [firstDirtyOffset, firstUncleanableDirtyOffset).
          val offsetsToClean = cleanableOffsets(log, checkpoints.get(topicPartition), nowMs)
          // Repair checkpoints whose recorded offset is no longer valid.
          if (offsetsToClean.forceUpdateCheckpoint)
            updateCheckpoints(log.dir.getParentFile(), Option(topicPartition, offsetsToClean.firstDirtyOffset))
          val compactionDelayMs = maxCompactionDelay(log, offsetsToClean.firstDirtyOffset, nowMs)
          preCleanStats.updateMaxCompactionDelay(compactionDelayMs)
          // Wrap the cleanable range in a LogToClean candidate.
          LogToClean(topicPartition, log, offsetsToClean.firstDirtyOffset, offsetsToClean.firstUncleanableDirtyOffset, compactionDelayMs > 0)
        } catch {
          case e: Throwable => throw new LogCleaningException(log,
            s"Failed to calculate log cleaning stats for partition $topicPartition", e)
        }
      }
      .filter(ltc => ltc.totalBytes > 0) // ignore logs with nothing to clean
    // Expose the largest dirty ratio seen this round (0 when nothing is dirty).
    this.dirtiestLogCleanableRatio = if (dirtyLogs.nonEmpty) dirtyLogs.max.cleanableRatio else 0
    // Retain candidates that must compact immediately, or whose dirty ratio
    // exceeds the min.cleanable.dirty.ratio threshold (default 50%).
    val eligible = dirtyLogs.filter { ltc =>
      (ltc.needCompactionNow && ltc.cleanableBytes > 0) || ltc.cleanableRatio > ltc.log.config.minCleanableRatio
    }
    if (eligible.isEmpty) None
    else {
      preCleanStats.recordCleanablePartitions(eligible.size)
      // Highest-priority candidate under LogToClean's ordering wins.
      val filthiest = eligible.max
      inProgress.put(filthiest.topicPartition, LogCleaningInProgress)
      Some(filthiest)
    }
  }
}
计算待清理区间的过程由 LogCleanerManager#cleanableOffsets 方法实现,区间值包括 dirty 部分的起始 offset 值和 uncleanable LogSegment 对象的 baseOffset 值。方法实现如下:
def cleanableOffsets(log: Lo