下面是一个完整的Kafka巡检脚本,能够生成美观的HTML巡检报告。该脚本检查Kafka集群的关键指标,并以直观的可视化方式展示结果。
#!/bin/bash
# Kafka集群巡检脚本 - 生成HTML报告
# 作者:运维专家
# 版本:1.2
# Configuration ==========================================
# Every setting keeps its previous default but can be overridden from the
# environment, e.g.:
#   BROKERS="b1:9092,b2:9092" LOG_DIR=/data/kafka/logs ./kafka_inspection.sh
# Kafka installation directory; its bin/ subdirectory holds the CLI tools.
KAFKA_HOME="${KAFKA_HOME:-/opt/kafka}"
# ZooKeeper connect string handed to zookeeper-shell.sh.
ZK_HOSTS="${ZK_HOSTS:-localhost:2181}"
# Comma-separated host:port list of Kafka brokers (bootstrap servers).
BROKERS="${BROKERS:-localhost:9092}"
# Destination of the generated HTML report (one file per calendar day).
REPORT_FILE="${REPORT_FILE:-/tmp/kafka_inspection_$(date +%Y%m%d).html}"
# Directory that contains the broker's server.log.
LOG_DIR="${LOG_DIR:-/var/log/kafka}"
# ======================================================
# Verify the prerequisites of the inspection before doing any work.
# Globals read: KAFKA_HOME
# Exits with status 1 (message on stderr) when jq is not on PATH or when
# $KAFKA_HOME/bin is not a directory.
check_dependencies() {
  if ! command -v jq >/dev/null 2>&1; then
    echo >&2 "jq 未安装,请先安装: sudo apt-get install jq"
    exit 1
  fi

  if [ ! -d "$KAFKA_HOME/bin" ]; then
    echo >&2 "Kafka目录不存在: $KAFKA_HOME"
    exit 1
  fi
}
# Collect basic cluster facts.
# Globals read:    KAFKA_HOME, ZK_HOSTS, BROKERS
# Globals written: KAFKA_VERSION    - first word of `kafka-topics.sh --version`
#                  BROKER_COUNT     - number of entries configured in BROKERS
#                  RUNNING_BROKERS  - number of ids registered under
#                                     /brokers/ids in ZooKeeper (0 if none/unknown)
get_kafka_info() {
  local ids_line ids

  # Only the first non-empty line is used so stray extra output cannot turn
  # the version into a multi-line value.
  KAFKA_VERSION=$("$KAFKA_HOME/bin/kafka-topics.sh" --version | awk 'NF {print $1; exit}')

  # The expected broker total comes from the broker list; the ZooKeeper
  # connect string says nothing about how many brokers exist.
  BROKER_COUNT=$(awk -F, '{print NF}' <<< "$BROKERS")

  # zookeeper-shell prints the children as a bracketed list such as
  # "[0, 1, 2]". Take the last bracketed line and count its comma-separated
  # members; an empty or missing list counts as 0 brokers.
  ids_line=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "ls /brokers/ids" 2>/dev/null \
    | grep '\[.*\]' \
    | tail -n 1)
  ids=${ids_line#*\[}
  ids=${ids%\]*}
  ids=${ids//[[:space:]]/}
  if [[ -z "$ids" ]]; then
    RUNNING_BROKERS=0
  else
    RUNNING_BROKERS=$(awk -F, '{print NF}' <<< "$ids")
  fi
}
# Check ZooKeeper health and count client connections.
# Globals read:    KAFKA_HOME, ZK_HOSTS
# Globals written: ZK_STATUS       - "Healthy" or "Unhealthy"
#                  ZK_CONNECTIONS  - number of ESTABLISHED TCP connections on
#                                    the ZooKeeper port, as reported by netstat
check_zookeeper() {
  local first_host zk_port

  # Any serving role is healthy: a follower, observer or standalone node is a
  # working server just like the leader.
  # NOTE(review): confirm that the zookeeper-shell in use prints a "Mode:"
  # line for "stat"; that field is documented for the four-letter
  # "srvr"/"stat" admin commands.
  if "$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "stat" 2>/dev/null \
      | grep -Eq 'Mode: (leader|follower|observer|standalone)'; then
    ZK_STATUS="Healthy"
  else
    ZK_STATUS="Unhealthy"
  fi

  # Derive the port from the first entry of the connect string (dropping an
  # optional "/chroot" suffix) instead of assuming 2181.
  first_host=${ZK_HOSTS%%,*}
  first_host=${first_host%%/*}
  zk_port=${first_host##*:}
  [[ "$zk_port" =~ ^[0-9]+$ ]] || zk_port=2181

  # The trailing space in the needle keeps ":2181 " from matching ":21810".
  ZK_CONNECTIONS=$(netstat -ant \
    | awk -v needle=":${zk_port} " 'index($0, needle) && /ESTABLISHED/ {n++} END {print n + 0}')
}
# Probe one broker endpoint with a plain TCP connect.
# Arguments: $1 - host, $2 - port
# Returns:   0 when the connection can be opened, non-zero otherwise
_broker_reachable() {
  local host=$1 port=$2
  if command -v timeout >/dev/null 2>&1; then
    # Host and port are passed as positional arguments so they are never
    # parsed as shell code by the inner bash.
    timeout 3 bash -c 'exec 3<>"/dev/tcp/$1/$2"' _ "$host" "$port" 2>/dev/null
  else
    # Without timeout(1) the connect relies on the OS connect timeout.
    (exec 3<>"/dev/tcp/$host/$port") 2>/dev/null
  fi
}

# Build the per-broker status list for the report.
# Globals read:    BROKERS (comma-separated host:port entries)
# Globals written: BROKER_STATUS - concatenated <li> fragments, one per broker,
#                  marked "Active" when the endpoint accepts a TCP connection
#                  from this machine and "Down" otherwise
check_brokers() {
  local -a endpoints=()
  local endpoint host port status

  BROKER_STATUS=""
  IFS=',' read -r -a endpoints <<< "$BROKERS"

  for endpoint in "${endpoints[@]}"; do
    endpoint=${endpoint//[[:space:]]/}
    # Skip blanks so an empty BROKERS or a stray comma yields no bogus entry.
    [[ -n "$endpoint" ]] || continue

    host=${endpoint%:*}
    port=${endpoint##*:}

    # Connecting to the broker itself is what decides the status: grepping the
    # local socket table cannot tell brokers apart and reports an idle but
    # listening broker as down.
    if _broker_reachable "$host" "$port"; then
      status="Active"
    else
      status="Down"
    fi
    BROKER_STATUS+="<li>Broker $host: <span class='$status'>$status</span></li>"
  done
}
# Summarise every topic as a JSON array for the report.
# Globals read:    KAFKA_HOME, BROKERS
# Globals written: TOPIC_INFO - JSON array of objects
#                  {"topic","partitions","replicas","underreplicated"};
#                  "[]" when no topic could be described
# One `--describe` call without --topic lists all topics, so the JVM is started
# once instead of once per topic.
check_topics() {
  TOPIC_INFO=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --describe \
    | awk '
      # Print the topic collected so far, comma-separated from earlier ones.
      function emit() {
        if (topic != "") {
          printf "%s{\"topic\":\"%s\",\"partitions\":%d,\"replicas\":%d,\"underreplicated\":%d}", \
            sep, topic, partitions, replicas, underrep
          sep = ","
        }
      }
      BEGIN { printf "[" }
      {
        # Look values up by their "Key:" label rather than by column number:
        # newer Kafka versions add fields (e.g. TopicId) that shift positions.
        name = ""; pcount = ""; rfactor = ""; part = ""; reps = ""; isr = ""
        for (i = 1; i < NF; i++) {
          if      ($i == "Topic:")             name    = $(i + 1)
          else if ($i == "PartitionCount:")    pcount  = $(i + 1)
          else if ($i == "ReplicationFactor:") rfactor = $(i + 1)
          else if ($i == "Partition:")         part    = $(i + 1)
          else if ($i == "Replicas:")          reps    = $(i + 1)
          else if ($i == "Isr:")               isr     = $(i + 1)
        }
        # An empty ISR makes the next label look like the value.
        if (isr ~ /:$/) isr = ""

        if (pcount != "") {
          # Summary line: starts a new topic.
          emit()
          topic = name; partitions = pcount; replicas = rfactor; underrep = 0
        } else if (part != "" && topic != "") {
          # Partition line: under-replicated when the ISR is smaller than the
          # assigned replica set.
          if (split(isr, in_sync, ",") < split(reps, assigned, ",")) underrep++
        }
      }
      END { emit(); printf "]\n" }
    ')
}
# Summarise total lag per consumer group as a JSON array for the report.
# Globals read:    KAFKA_HOME, BROKERS
# Globals written: CONSUMER_GROUPS - JSON array of {"group","lag"} objects in
#                  order of first appearance; "[]" when there are no groups
# Assumes the `--describe` table has a header row that starts with GROUP and
# contains a LAG column, and that group ids contain no whitespace or quotes.
check_consumer_groups() {
  CONSUMER_GROUPS=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --list \
    | xargs -I{} "$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --describe --group {} \
    | awk '
      # A blank line ends the current table; free-text messages that follow
      # must not be read as data rows.
      NF == 0 { lagcol = 0; next }

      # Header row: remember which column holds LAG.
      $1 == "GROUP" {
        lagcol = 0
        for (i = 1; i <= NF; i++) if ($i == "LAG") lagcol = i
        next
      }

      # Data row: LAG is a number, or "-" when no offset is committed yet.
      lagcol && NF >= lagcol && $lagcol ~ /^([0-9]+|-)$/ {
        if (!($1 in lag)) { order[++count] = $1; lag[$1] = 0 }
        if ($lagcol != "-") lag[$1] += $lagcol
      }

      END {
        printf "["
        for (i = 1; i <= count; i++) {
          # %.0f keeps very large lag totals intact on awks with 32-bit %d.
          printf "%s{\"group\":\"%s\",\"lag\":%.0f}", (i > 1 ? "," : ""), order[i], lag[order[i]]
        }
        printf "]\n"
      }
    ')
}
# Collect file-descriptor usage of the Kafka processes and disk usage of the
# log directory.
# Globals read:    LOG_DIR
# Globals written: FD_USAGE   - concatenated <li> fragments, one per process
#                               whose command line matches 'kafka\.Kafka'
#                  DISK_USAGE - "Use%" of the filesystem holding LOG_DIR, or
#                               "N/A" when it cannot be determined
check_system_resources() {
  local pid used limit

  FD_USAGE=""
  while IFS= read -r pid; do
    # The process may have exited since pgrep listed it.
    [[ -n "$pid" && -d "/proc/$pid/fd" ]] || continue

    used=$(find "/proc/$pid/fd" -mindepth 1 -maxdepth 1 2>/dev/null | wc -l)
    used=$((used))
    limit=$(awk '/^Max open files/ {print $4}' "/proc/$pid/limits" 2>/dev/null)

    # Only compute a percentage for a positive numeric limit; "unlimited" or a
    # missing value would otherwise break the arithmetic.
    if [[ "$limit" =~ ^[0-9]+$ ]] && (( limit > 0 )); then
      FD_USAGE+="<li>PID $pid: $used/$limit ($((used * 100 / limit))%)</li>"
    else
      FD_USAGE+="<li>PID $pid: $used/${limit:-unknown}</li>"
    fi
  done < <(pgrep -f 'kafka\.Kafka')

  # -P keeps each filesystem on one line even with long device names, so the
  # second line is always the data row.
  DISK_USAGE=$(df -P "$LOG_DIR" 2>/dev/null | awk 'NR==2 {print $5}')
  DISK_USAGE=${DISK_USAGE:-N/A}
}
# Scan the broker's server.log for errors and warnings.
# Globals read:    LOG_DIR
# Globals written: ERROR_COUNT - lines containing "error" (case-insensitive)
#                  WARN_COUNT  - lines containing "warn"  (case-insensitive)
#                  LAST_ERROR  - first 100 characters of the last error line,
#                                empty when there is none
# A missing or unreadable log yields 0 / 0 / "" so the report never shows
# blank counters.
check_error_logs() {
  local log_file="$LOG_DIR/server.log"

  ERROR_COUNT=0
  WARN_COUNT=0
  LAST_ERROR=""
  [[ -r "$log_file" ]] || return 0

  # grep -c prints 0 itself when nothing matches; the defaults below only
  # cover the case where grep could not read the file at all.
  ERROR_COUNT=$(grep -c -i -- "error" "$log_file" 2>/dev/null)
  WARN_COUNT=$(grep -c -i -- "warn" "$log_file" 2>/dev/null)
  LAST_ERROR=$(grep -i -- "error" "$log_file" 2>/dev/null | tail -n 1 | cut -c 1-100)

  ERROR_COUNT=${ERROR_COUNT:-0}
  WARN_COUNT=${WARN_COUNT:-0}
}
# Escape the characters that are significant in HTML text.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout
_html_escape() {
  printf '%s' "$1" \
    | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}

# Render the inspection results into the HTML report.
# Globals read: REPORT_FILE, ZK_STATUS, RUNNING_BROKERS, BROKER_COUNT,
#               ERROR_COUNT, DISK_USAGE, BROKER_STATUS, TOPIC_INFO,
#               CONSUMER_GROUPS, FD_USAGE, LAST_ERROR, ZK_CONNECTIONS,
#               KAFKA_VERSION
# Outputs:      overwrites $REPORT_FILE
# TOPIC_INFO and CONSUMER_GROUPS are JSON; both a JSON array of objects and a
# single object are accepted. BROKER_STATUS and FD_USAGE are inserted as
# ready-made HTML fragments.
generate_html_report() {
  local topic_rows group_rows last_error_html zk_conn zk_bar_width

  # Table rows are rendered before the here-document so the template below
  # contains only simple variable references.
  topic_rows=$(jq -r '
    (if type == "array" then .[] else . end)
    | [ "<tr>",
        "<td>\(.topic | @html)</td>",
        "<td>\(.partitions)</td>",
        "<td>\(.replicas)</td>",
        "<td>\(.underreplicated)</td>",
        "<td>\(if .underreplicated > 0 then "<span class=\"warning\">警告</span>" else "<span class=\"Healthy\">正常</span>" end)</td>",
        "</tr>" ]
    | join("\n")
  ' <<< "$TOPIC_INFO")

  group_rows=$(jq -r '
    (if type == "array" then .[] else . end)
    | [ "<tr>",
        "<td>\(.group | @html)</td>",
        "<td>\(.lag)</td>",
        "<td>\(if .lag > 1000 then "<span class=\"critical\">严重</span>"
               elif .lag > 100 then "<span class=\"warning\">警告</span>"
               else "<span class=\"Healthy\">正常</span>" end)</td>",
        "</tr>" ]
    | join("\n")
  ' <<< "$CONSUMER_GROUPS")

  # Log lines are arbitrary text and must not be interpreted as markup.
  if [[ -n "$LAST_ERROR" ]]; then
    last_error_html=$(_html_escape "$LAST_ERROR")
  else
    last_error_html="无错误日志"
  fi

  # The bar shows 2% per connection, capped so it never overflows its
  # container; a missing or non-numeric count renders as 0.
  zk_conn=${ZK_CONNECTIONS//[!0-9]/}
  zk_bar_width=$(( 10#${zk_conn:-0} * 2 ))
  if (( zk_bar_width > 100 )); then
    zk_bar_width=100
  fi

  cat <<EOF > "$REPORT_FILE"
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Kafka集群巡检报告</title>
<style>
body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.6; color: #333; max-width: 1200px; margin: 0 auto; padding: 20px; }
.header { text-align: center; padding: 20px; background: #2c3e50; color: white; border-radius: 8px; margin-bottom: 30px; }
.section { margin-bottom: 30px; padding: 20px; border-radius: 8px; background: #f9f9f9; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
h2 { color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 10px; }
.grid-container { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; }
.grid-item { background: white; padding: 15px; border-radius: 5px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
.status-card { text-align: center; padding: 20px; }
.metric-value { font-size: 2.5rem; font-weight: bold; margin: 10px 0; }
.Healthy { color: #27ae60; }
.Unhealthy { color: #e74c3c; }
.Active { color: #27ae60; }
.Down { color: #e74c3c; }
.warning { background: #f39c12; }
.critical { background: #e74c3c; color: white; }
table { width: 100%; border-collapse: collapse; margin: 15px 0; }
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
th { background-color: #3498db; color: white; }
tr:hover { background-color: #f5f5f5; }
.progress-container { height: 20px; background: #ecf0f1; border-radius: 10px; margin: 10px 0; }
.progress-bar { height: 100%; border-radius: 10px; background: #3498db; }
.footer { text-align: center; margin-top: 30px; color: #7f8c8d; font-size: 0.9rem; }
</style>
</head>
<body>
<div class="header">
<h1>Kafka集群巡检报告</h1>
<p>生成时间: $(date "+%Y-%m-%d %H:%M:%S")</p>
</div>
<div class="grid-container">
<div class="grid-item status-card">
<h3>集群状态</h3>
<div class="metric-value $ZK_STATUS">$ZK_STATUS</div>
<p>Zookeeper健康状态</p>
</div>
<div class="grid-item status-card">
<h3>Broker状态</h3>
<div class="metric-value">$RUNNING_BROKERS/$BROKER_COUNT</div>
<p>运行中/总Broker数</p>
</div>
<div class="grid-item status-card">
<h3>错误日志</h3>
<div class="metric-value">$ERROR_COUNT</div>
<p>最近错误数量</p>
</div>
<div class="grid-item status-card">
<h3>磁盘使用</h3>
<div class="metric-value">${DISK_USAGE}</div>
<p>日志目录空间</p>
</div>
</div>
<div class="section">
<h2>Broker详细状态</h2>
<ul>
$BROKER_STATUS
</ul>
</div>
<div class="section">
<h2>Topic状态</h2>
<table>
<thead>
<tr>
<th>Topic名称</th>
<th>分区数</th>
<th>副本数</th>
<th>未同步分区</th>
<th>状态</th>
</tr>
</thead>
<tbody>
$topic_rows
</tbody>
</table>
</div>
<div class="section">
<h2>消费者组延迟</h2>
<table>
<thead>
<tr>
<th>消费者组</th>
<th>消息延迟</th>
<th>状态</th>
</tr>
</thead>
<tbody>
$group_rows
</tbody>
</table>
</div>
<div class="section">
<h2>系统资源使用</h2>
<h3>文件描述符使用</h3>
<ul>
$FD_USAGE
</ul>
<h3>最后一条错误日志</h3>
<div class="warning" style="padding: 15px; border-radius: 5px;">
$last_error_html
</div>
</div>
<div class="section">
<h2>Zookeeper连接状态</h2>
<p>活跃连接数: $ZK_CONNECTIONS</p>
<div class="progress-container">
<div class="progress-bar" style="width: ${zk_bar_width}%;"></div>
</div>
</div>
<div class="footer">
<p>Kafka版本: $KAFKA_VERSION | 巡检脚本 v1.2</p>
<p>© 2023 Kafka运维团队 | 生成于 $(hostname)</p>
</div>
</body>
</html>
EOF
}
# Entry point: run every inspection step in order, then tell the user where
# the report was written.
# Globals read: REPORT_FILE
main() {
  local step

  printf '%s\n' "开始Kafka集群巡检..."

  # The steps only communicate through global variables, so they are simply
  # invoked one after another; generate_html_report must stay last.
  for step in \
    check_dependencies \
    get_kafka_info \
    check_zookeeper \
    check_brokers \
    check_topics \
    check_consumer_groups \
    check_system_resources \
    check_error_logs \
    generate_html_report; do
    "$step"
  done

  printf '%s\n' "巡检完成! 报告已生成: $REPORT_FILE"
  printf '%s\n' "使用浏览器打开查看:"
  printf '%s\n' " firefox $REPORT_FILE &> /dev/null &"
}
main
脚本功能说明
-
全面巡检项目:
- 集群基本信息(版本、Broker数量)
- Zookeeper健康状态
- Broker节点活跃状态
- Topic分区与副本状态
- 消费者组消息延迟
- 文件描述符使用情况
- 磁盘空间占用
- 错误日志分析
-
可视化HTML报告特点:
- 响应式设计适配不同屏幕
- 状态卡片直观展示关键指标
- 颜色编码状态(绿色正常/黄色警告/红色异常)
- 表格展示详细数据
- 进度条可视化资源使用
- 时间戳和主机信息标记
-
使用说明:
# 1. 修改脚本头部配置参数
KAFKA_HOME="/opt/kafka"              # Kafka安装目录
ZK_HOSTS="zk1:2181,zk2:2181"         # Zookeeper集群地址
BROKERS="broker1:9092,broker2:9092"  # Broker列表
LOG_DIR="/var/log/kafka"             # Kafka日志目录
# 2. 运行脚本
chmod +x kafka_inspection.sh
./kafka_inspection.sh
# 3. 查看报告
firefox /tmp/kafka_inspection_20230624.html
-
依赖要求:
- jq (JSON处理器): sudo apt-get install jq
- Kafka命令行工具
- Zookeeper客户端
报告截图示例
[集群状态卡片]
+---------------------------------+
| Zookeeper健康状态 |
| Healthy |
+---------------------------------+
| Broker状态 |
| 3/3 |
+---------------------------------+
| 错误日志 |
| 0 |
+---------------------------------+
| 磁盘使用 |
| 75% |
+---------------------------------+
[详细表格]
Topic名称 分区数 副本数 未同步分区 状态
orders 16 3 0 正常
payments 8 3 2 警告
inventory 32 3 5 警告
扩展建议
-
定时巡检:
# 每天凌晨2点执行巡检
0 2 * * * /path/to/kafka_inspection.sh
-
邮件通知:
# 在脚本末尾添加
echo "Kafka巡检报告" | mail -a "$REPORT_FILE" -s "Kafka巡检结果" team@example.com
-
历史记录:
# 在generate_html_report函数中添加
cp "$REPORT_FILE" "/var/www/html/kafka-reports/$(basename $REPORT_FILE)"
-
增加监控项:
- JVM内存使用情况
- 网络吞吐量监控
- 控制器状态检查
- ISR收缩率分析
该脚本提供了开箱即用的Kafka集群健康检查,生成的HTML报告可直接在浏览器中查看,无需额外工具支持。
1333

被折叠的 条评论
为什么被折叠?



