V2.0
#!/bin/bash
# Kafka集群深度巡检脚本 - TXT报告版 (修复版)
# 作者:高级运维工程师
# 版本:3.0
# Configuration ========================================
KAFKA_HOME="/opt/kafka" # Kafka installation root (must contain bin/)
ZK_HOSTS="localhost:2181" # comma-separated Zookeeper host:port list
BROKERS="localhost:9092" # comma-separated broker host:port list
REPORT_FILE="/tmp/kafka_inspection_$(date +%Y%m%d).txt" # one report file per day
LOG_DIR="/var/log/kafka" # broker log directory (server.log / kafka.log)
KAFKA_USER="kafka" # system user that runs the Kafka broker
# Alert thresholds
FD_WARN_PERCENT=80 # warn when file-descriptor usage exceeds this percent
DISK_WARN_PERCENT=85 # warn when log-dir disk usage exceeds this percent
CPU_WARN_PERCENT=75 # warn when overall CPU usage exceeds this percent
MEM_WARN_PERCENT=80 # warn when memory usage exceeds this percent
UNDER_REPLICATED_WARN=3 # warn when a topic has more under-replicated partitions
CONSUMER_LAG_WARN=1000 # consumer-group lag warning level (messages)
CONSUMER_LAG_CRITICAL=10000 # consumer-group lag critical level (messages)
# ======================================================
# 工具函数
# Print a top-level section banner: a rule, the (space-indented) title, a rule.
print_header() {
  local banner="=============================================================="
  printf '%s\n %s\n%s\n' "$banner" "$1" "$banner"
}
# Print a sub-section banner: a dashed rule, "### title", a dashed rule.
print_section() {
  local rule="--------------------------------------------------------------"
  printf '%s\n### %s\n%s\n' "$rule" "$1" "$rule"
}
# Print "key: value" with the key left-aligned in a 35-character column.
print_key_value() {
  local key=$1 value=$2
  printf '%-35s: %s\n' "$key" "$value"
}
# Print a table header followed by a dash rule of equal character length.
print_table_header() {
  local header=$1
  local divider
  divider=$(printf '%*s' "${#header}" '' | tr ' ' '-')
  echo "$header"
  echo "$divider"
}
# Verify required external tools and the Kafka installation.
# Hard failures (jq missing, KAFKA_HOME invalid) terminate the script;
# diagnostics go to stderr. Returns 0 on success (the old version fell
# through the final `&&` test and returned 1 even when everything was fine,
# which would abort the script under `set -e`).
check_dependencies() {
  local missing=0
  command -v jq >/dev/null 2>&1 || { echo "错误: jq 未安装" >&2; missing=1; }
  command -v nc >/dev/null 2>&1 || { echo "警告: nc 不可用,部分端口检查将跳过" >&2; }
  [ -d "$KAFKA_HOME/bin" ] || { echo "错误: Kafka目录不存在: $KAFKA_HOME" >&2; exit 1; }
  [ "$missing" -eq 1 ] && exit 1
  return 0
}
# Locate the Kafka broker JVM and store its PID in the global KAFKA_PID.
# Returns 1 (with a message on stderr, not stdout, so callers that capture
# output are not polluted) when no broker process is found.
get_kafka_pid() {
  KAFKA_PID=$(pgrep -f 'kafka\.Kafka' | head -1)
  if [ -z "$KAFKA_PID" ]; then
    # Fallback when pgrep is unavailable or restricted; the [k] trick
    # keeps the grep process itself out of the match.
    KAFKA_PID=$(ps aux | grep '[k]afka.Kafka' | awk '{print $2}' | head -1)
  fi
  if [ -z "$KAFKA_PID" ]; then
    echo "错误: 无法找到Kafka进程" >&2
    return 1
  fi
  return 0
}
# Collect broker version, start time, uptime and broker counts into globals
# (KAFKA_VERSION, START_TIME, UPTIME, BROKER_COUNT, RUNNING_BROKERS).
get_kafka_info() {
  if ! get_kafka_pid; then
    KAFKA_VERSION="未知 (进程未运行)"
    START_TIME="N/A"
    UPTIME="N/A"
    return
  fi
  KAFKA_VERSION=$("$KAFKA_HOME/bin/kafka-topics.sh" --version 2>/dev/null | awk '{print $1}')
  [ -z "$KAFKA_VERSION" ] && KAFKA_VERSION=$("$KAFKA_HOME/bin/kafka-run-class.sh" kafka.Kafka --version 2>/dev/null)
  [ -z "$KAFKA_VERSION" ] && KAFKA_VERSION="未知"
  # /proc/<pid> is a DIRECTORY, so test with -d; the old "-f" test could
  # never succeed, which made this whole branch dead code and the report
  # always showed start time / uptime as "未知".
  if [ -d "/proc/$KAFKA_PID" ]; then
    # NOTE(review): mtime of /proc/<pid> approximates the process start
    # time; field 22 of /proc/<pid>/stat would be exact — confirm if
    # second-level precision matters.
    START_SECONDS=$(stat -c %Y "/proc/$KAFKA_PID")
    START_TIME=$(date -d "@$START_SECONDS" "+%Y-%m-%d %H:%M:%S")
    NOW_SECONDS=$(date +%s)
    UPTIME_SECONDS=$((NOW_SECONDS - START_SECONDS))
    UPTIME_DAYS=$((UPTIME_SECONDS / 86400))
    UPTIME_HOURS=$(( (UPTIME_SECONDS % 86400) / 3600 ))
    UPTIME_MINUTES=$(( (UPTIME_SECONDS % 3600) / 60 ))
    UPTIME="${UPTIME_DAYS}天${UPTIME_HOURS}小时${UPTIME_MINUTES}分"
  else
    START_TIME="未知"
    UPTIME="未知"
  fi
  BROKER_COUNT=$(echo "$BROKERS" | tr ',' '\n' | wc -l)
  RUNNING_BROKERS=0
}
# Probe Zookeeper reachability and collect mode/latency/version via the
# "stat" command through zookeeper-shell.sh.
check_zookeeper() {
  ZK_MODE="未知"
  ZK_CONNECTIONS=0
  ZK_LATENCY="未知"
  ZK_VERSION="未知"
  # Count reachable ZK nodes into ZK_CONNECTIONS. The old code added them
  # to RUNNING_BROKERS, conflating Zookeeper nodes with Kafka brokers,
  # and left ZK_CONNECTIONS permanently at 0 in the report.
  if command -v nc &> /dev/null; then
    for zk in $(echo "$ZK_HOSTS" | tr ',' '\n'); do
      host=${zk%:*}
      port=${zk#*:}
      if nc -z -w2 "$host" "$port"; then
        ZK_CONNECTIONS=$((ZK_CONNECTIONS+1))
      fi
    done
  fi
  # Run "stat" once and parse the cached output (the old code invoked
  # zookeeper-shell.sh three times for the same data).
  if [ -x "$KAFKA_HOME/bin/zookeeper-shell.sh" ]; then
    local zk_stat
    zk_stat=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "stat" 2>/dev/null)
    if [ -n "$zk_stat" ]; then
      ZK_MODE=$(echo "$zk_stat" | grep "Mode:" | cut -d' ' -f2)
      ZK_LATENCY=$(echo "$zk_stat" | grep "Latency min/avg/max" | cut -d':' -f2)
      ZK_VERSION=$(echo "$zk_stat" | grep "Zookeeper version" | cut -d':' -f2)
      [ -z "$ZK_MODE" ] && ZK_MODE="未知"
      [ -z "$ZK_LATENCY" ] && ZK_LATENCY="未知"
      [ -z "$ZK_VERSION" ] && ZK_VERSION="未知"
    fi
  fi
}
# Build one status row per broker into BROKER_STATUS and count reachable
# brokers into RUNNING_BROKERS.
check_brokers() {
  BROKER_STATUS=""
  RUNNING_BROKERS=0
  while IFS= read -r broker; do
    host=${broker%:*}
    port=${broker#*:}
    # Port probe: a broker accepting TCP on its listener is "Active".
    status="Down"
    if command -v nc &> /dev/null; then
      if nc -z -w2 "$host" "$port"; then
        status="Active"
        RUNNING_BROKERS=$((RUNNING_BROKERS+1))
      fi
    fi
    # Resolve the broker id by listing /brokers/ids and matching this
    # host in each registration znode (the old `get /brokers/ids` read a
    # parent znode that carries no data, so the jq parse always failed).
    broker_id="未知"
    if [ -x "$KAFKA_HOME/bin/zookeeper-shell.sh" ]; then
      ids=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "ls /brokers/ids" 2>/dev/null |
        grep -o '\[.*\]' | tr -d '[] ' | tr ',' ' ')
      for id in $ids; do
        reg=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "get /brokers/ids/$id" 2>/dev/null | grep '"host"')
        if echo "$reg" | grep -q "$host"; then
          broker_id=$id
          break
        fi
      done
    fi
    # The controller's id is stored in the /controller znode.
    role="未知"
    if [ "$broker_id" != "未知" ]; then
      controller=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "get /controller" 2>/dev/null |
        grep 'brokerid' | jq -r '.brokerid' 2>/dev/null)
      [[ "$controller" == "$broker_id" ]] && role="Controller" || role="Follower"
    fi
    # $( ) strips the trailing newline, so re-append it explicitly; the
    # old code collapsed every broker row onto a single report line.
    BROKER_STATUS+=$(printf "%-15s | %-10s | %-10s | %-15s | %s" "$broker" "$broker_id" "$role" "$status" "$(date)")$'\n'
  done <<< "$(echo "$BROKERS" | tr ',' '\n')"
}
# Produce one "name|partitions|replication_factor|under_replicated" record
# per topic in TOPIC_INFO (empty when the tool or cluster is unavailable).
check_topics() {
  TOPIC_INFO=""
  if [ -x "$KAFKA_HOME/bin/kafka-topics.sh" ]; then
    local topic_list
    topic_list=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --list 2>/dev/null | grep -v '^$')
    if [ -n "$topic_list" ]; then
      # The old single awk pipeline kept overwriting its variables and only
      # printed one END record (the last topic); describe each topic instead.
      TOPIC_INFO=$(echo "$topic_list" | while IFS= read -r topic; do
        desc=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --describe --topic "$topic" 2>/dev/null)
        [ -z "$desc" ] && continue
        # Pull the numbers out by label so the field position of the
        # describe header does not matter across Kafka versions.
        partitions=$(echo "$desc" | grep -o 'PartitionCount: *[0-9]*' | head -1 | tr -dc '0-9')
        replicas=$(echo "$desc" | grep -o 'ReplicationFactor: *[0-9]*' | head -1 | tr -dc '0-9')
        # Ask the broker directly which partitions are under-replicated;
        # each such partition is printed as one "Partition:" detail row.
        underrep=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --describe --topic "$topic" --under-replicated-partitions 2>/dev/null | grep -c 'Partition:')
        echo "${topic}|${partitions:-0}|${replicas:-0}|${underrep:-0}"
      done)
    fi
  fi
}
# Produce one "group|total_lag|state" record per consumer group in
# CONSUMER_GROUPS (empty when the tool or cluster is unavailable).
check_consumer_groups() {
  CONSUMER_GROUPS=""
  if [ -x "$KAFKA_HOME/bin/kafka-consumer-groups.sh" ]; then
    local group_list
    group_list=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --list 2>/dev/null | grep -v '^$')
    if [ -n "$group_list" ]; then
      CONSUMER_GROUPS=$(echo "$group_list" | while IFS= read -r group; do
        info=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --describe --group "$group" 2>/dev/null)
        [ -z "$info" ] && continue
        # kafka-consumer-groups.sh prints no TOTAL row (the old awk matched
        # /TOTAL/ and so only ever emitted one empty END record); sum the
        # numeric LAG column (field 6) across all partition rows instead.
        lag=$(echo "$info" | awk '$6 ~ /^[0-9]+$/ {s+=$6} END {print s+0}')
        # Group state from the --state variant of describe.
        # NOTE(review): assumes STATE is the second-to-last column of the
        # second output line — verify against the installed Kafka version.
        state=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --describe --group "$group" --state 2>/dev/null | awk 'NR==2 {print $(NF-1)}')
        echo "${group}|${lag}|${state:-未知}"
      done)
    fi
  fi
}
# Gather FD usage, disk, CPU and memory statistics into the report globals
# FD_INFO, DISK_STATUS, CPU_STATUS and MEM_STATUS.
check_system_resources() {
  # --- File descriptors -------------------------------------------------
  FD_INFO=""
  if get_kafka_pid; then
    for pid in $KAFKA_PID; do
      if [ -d "/proc/$pid/fd" ]; then
        used=$(ls -1 "/proc/$pid/fd" 2>/dev/null | wc -l)
        limit=$(grep 'Max open files' "/proc/$pid/limits" 2>/dev/null | awk '{print $4}')
        if [ -n "$limit" ] && [ "$limit" != "unlimited" ] && [ "$limit" -gt 0 ]; then
          percent=$((used*100/limit))
          status="OK"
          [ "$percent" -gt "$FD_WARN_PERCENT" ] && status="WARNING"
          # $( ) strips the trailing newline; re-append it so each PID
          # stays on its own row (the old code merged all rows together).
          FD_INFO+=$(printf "%-8s | %-8s | %-3d%% | %-10s" "$pid" "$used/$limit" "$percent" "$status")$'\n'
        else
          FD_INFO+=$(printf "%-8s | %-8s | %-3s | %-10s" "$pid" "$used/?" "?" "UNKNOWN")$'\n'
        fi
      fi
    done
  fi
  # --- Disk space for the Kafka log directory ---------------------------
  DISK_INFO=""
  if [ -d "$LOG_DIR" ]; then
    DISK_INFO=$(df -h "$LOG_DIR" | awk 'NR==2')
    disk_usage=$(echo "$DISK_INFO" | awk '{print $5}' | tr -d '%')
    status="OK"
    # Guard against an empty value so -gt never sees a non-integer.
    [ -n "$disk_usage" ] && [ "$disk_usage" -gt "$DISK_WARN_PERCENT" ] && status="WARNING"
    DISK_STATUS=$(printf "%-20s | %-8s | %-8s | %-8s | %-3d%% | %s" \
      "$(echo "$DISK_INFO" | awk '{print $6}')" \
      "$(echo "$DISK_INFO" | awk '{print $2}')" \
      "$(echo "$DISK_INFO" | awk '{print $3}')" \
      "$(echo "$DISK_INFO" | awk '{print $4}')" \
      "${disk_usage:-0}" \
      "$status")
  else
    DISK_STATUS=$(printf "%-20s | %-8s | %-8s | %-8s | %-3s | %s" \
      "$LOG_DIR" "N/A" "N/A" "N/A" "N/A" "目录不存在")
  fi
  # --- CPU and memory ---------------------------------------------------
  # NOTE(review): parsing "top" output assumes an English locale ("Cpu(s)").
  CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}')
  MEM_USAGE=$(free | awk '/Mem/{printf "%.0f", $3/$2*100}')
  cpu_status="OK"
  mem_status="OK"
  # [[ x > y ]] is a STRING comparison (so "9.5" sorted after "75" fired
  # spurious warnings); compare the float CPU value in awk and the integer
  # memory percentage arithmetically.
  if [ -n "$CPU_USAGE" ] && awk -v u="$CPU_USAGE" -v w="$CPU_WARN_PERCENT" 'BEGIN {exit !(u > w)}'; then
    cpu_status="WARNING"
  fi
  [ -n "$MEM_USAGE" ] && [ "$MEM_USAGE" -gt "$MEM_WARN_PERCENT" ] && mem_status="WARNING"
  CPU_STATUS=$(printf "%-6.1f%% | %s" "${CPU_USAGE:-0}" "$cpu_status")
  MEM_STATUS=$(printf "%-6d%% | %s" "${MEM_USAGE:-0}" "$mem_status")
}
# Count ERROR/WARN lines in the broker log (server.log preferred, then
# kafka.log) and capture the last three ERROR lines for the report.
check_error_logs() {
  ERROR_COUNT=0
  WARN_COUNT=0
  LAST_3_ERRORS="无错误日志"
  local log_file=""
  if [ -f "$LOG_DIR/server.log" ]; then
    log_file="$LOG_DIR/server.log"
  elif [ -f "$LOG_DIR/kafka.log" ]; then
    log_file="$LOG_DIR/kafka.log"
  fi
  if [ -z "$log_file" ]; then
    LAST_3_ERRORS="日志文件未找到"
    return
  fi
  # grep -c already prints 0 when nothing matches (it just exits 1); the
  # old `|| echo 0` therefore appended a SECOND line, producing "0\n0"
  # and breaking every later numeric test on these counters.
  ERROR_COUNT=$(grep -c -i "error" "$log_file" 2>/dev/null) || ERROR_COUNT=0
  WARN_COUNT=$(grep -c -i "warn" "$log_file" 2>/dev/null) || WARN_COUNT=0
  LAST_3_ERRORS=$(grep -i "error" "$log_file" | tail -3 | sed 's/.*ERROR/ERROR:/' | sed 's/^/ /')
  [ -z "$LAST_3_ERRORS" ] && LAST_3_ERRORS="无错误日志"
}
# Snapshot the broker's listening sockets and client connections via ss.
check_network() {
  LISTEN_PORTS=""
  ESTABLISHED_CONNS=0
  CONN_BY_IP=""
  if get_kafka_pid; then
    # Anchor on "pid=<PID>," — ss prints users:(("java",pid=123,fd=45)),
    # so the old bare "pid=$KAFKA_PID" also matched pid=1234 when the
    # broker PID was 123.
    local pid_pat="pid=$KAFKA_PID,"
    LISTEN_PORTS=$(ss -tuln -p | grep "$pid_pat" | awk '{print $5}' | sort | uniq)
    ESTABLISHED_CONNS=$(ss -tanp | grep "$pid_pat" | grep -c ESTAB)
    # NOTE(review): port 9092 is hard-coded here although BROKERS is
    # configurable — confirm the listener port before relying on this.
    CONN_BY_IP=$(ss -tanp | grep "$pid_pat" | grep ':9092' | awk '{print $5}' | cut -d: -f1 | sort | uniq -c | sort -nr | head -5)
  fi
}
# Inspect the broker JVM with jcmd/jstat.
# Globals written: JVM_VERSION, JVM_FLAGS, GC_INFO (left at their fallback
# text when the JDK tools cannot be located or the broker is not running).
check_jvm() {
JVM_VERSION="jcmd不可用"
JVM_FLAGS="jcmd不可用"
GC_INFO="jstat不可用"
if get_kafka_pid; then
# Derive JAVA_HOME from the running JVM when it is not exported:
# /proc/<pid>/exe is a symlink to .../bin/java.
if [ -z "$JAVA_HOME" ]; then
java_path=$(readlink -f /proc/$KAFKA_PID/exe 2>/dev/null)
if [[ "$java_path" =~ "/bin/java" ]]; then
JAVA_HOME=${java_path%/bin/java}
fi
fi
# Query the live JVM with jcmd/jstat from that same JDK.
if [ -n "$JAVA_HOME" ] && [ -x "$JAVA_HOME/bin/jcmd" ]; then
JVM_VERSION=$("$JAVA_HOME/bin/jcmd" $KAFKA_PID VM.version 2>/dev/null | grep version || echo "未知")
JVM_FLAGS=$("$JAVA_HOME/bin/jcmd" $KAFKA_PID VM.flags 2>/dev/null | head -5)
GC_INFO=$("$JAVA_HOME/bin/jstat" -gc $KAFKA_PID 2>/dev/null | tail -1 || echo "未知")
fi
fi
}
# Fetch the live broker configuration (ACTUAL_CONFIG) and the first lines
# of server.properties (DEFAULT_CONFIG) so the report can show both.
check_configs() {
ACTUAL_CONFIG=""
DEFAULT_CONFIG=""
if [ -x "$KAFKA_HOME/bin/kafka-configs.sh" ]; then
# stderr is merged (2>&1) on purpose so the exception text below can be
# pattern-matched. NOTE(review): $BROKERS is unquoted here — fine for a
# single host:port but would word-split a value containing spaces.
ACTUAL_CONFIG=$("$KAFKA_HOME/bin/kafka-configs.sh" --bootstrap-server $BROKERS --entity-type brokers --describe 2>&1)
if [[ "$ACTUAL_CONFIG" == *"AuthorizationException"* ]]; then
ACTUAL_CONFIG="错误: 需要管理员权限执行此操作"
fi
else
ACTUAL_CONFIG="错误: kafka-configs.sh 未找到"
fi
# Static on-disk configuration: first 10 non-comment, non-blank lines.
if [ -f "$KAFKA_HOME/config/server.properties" ]; then
DEFAULT_CONFIG=$(grep -vE '^#|^$' "$KAFKA_HOME/config/server.properties" | head -10)
else
DEFAULT_CONFIG="错误: server.properties 未找到"
fi
}
# Render the TXT report from the globals filled in by the check_* functions.
# The whole brace group's stdout is redirected into REPORT_FILE, so every
# echo/print_* below writes straight into the report.
generate_report() {
{
echo "Kafka集群深度巡检报告"
echo "生成时间: $(date "+%Y-%m-%d %H:%M:%S")"
echo "主机名: $(hostname)"
echo "=============================================================="
print_header "一、集群概览"
print_key_value "Kafka版本" "$KAFKA_VERSION"
print_key_value "运行时长" "$UPTIME"
print_key_value "启动时间" "$START_TIME"
print_key_value "Broker总数/在线数" "$BROKER_COUNT/$RUNNING_BROKERS"
print_key_value "Zookeeper模式" "$ZK_MODE"
print_key_value "Zookeeper版本" "$ZK_VERSION"
echo
print_header "二、Broker详细状态"
print_table_header "Broker地址 | Broker ID | 角色 | 状态 | 检查时间"
echo "$BROKER_STATUS"
echo
# The warning threshold is spliced into the awk program by closing the
# single quotes ('$UNDER_REPLICATED_WARN'), so awk receives a literal number.
print_header "三、Topic状态"
if [ -n "$TOPIC_INFO" ]; then
print_table_header "Topic名称 | 分区数 | 副本数 | 未同步分区 | 状态"
echo "$TOPIC_INFO" | awk -F'|' '{
status = ($4 > '$UNDER_REPLICATED_WARN') ? "WARNING" : "OK";
printf "%-20s | %-7s | %-7s | %-12s | %s\n", $1, $2, $3, $4, status
}' | sort
else
echo "无法获取Topic信息"
fi
echo
print_header "四、消费者组状态"
if [ -n "$CONSUMER_GROUPS" ]; then
print_table_header "消费者组名称 | 消息延迟 | 状态"
echo "$CONSUMER_GROUPS" | awk -F'|' '{
status = "OK";
if ($2 > '$CONSUMER_LAG_CRITICAL') status = "CRITICAL";
else if ($2 > '$CONSUMER_LAG_WARN') status = "WARNING";
printf "%-20s | %-10s | %-10s | %s\n", $1, $2, $3, status
}' | sort
else
echo "无法获取消费者组信息"
fi
echo
print_header "五、系统资源"
print_section "1. 文件描述符使用"
if [ -n "$FD_INFO" ]; then
print_table_header "PID | 使用量 | 使用率 | 状态"
echo "$FD_INFO"
else
echo "无法获取文件描述符信息"
fi
echo
print_section "2. 磁盘空间"
echo "$DISK_STATUS"
echo
print_section "3. CPU和内存"
print_table_header "CPU使用率 | 状态"
echo "$CPU_STATUS"
print_table_header "内存使用率 | 状态"
echo "$MEM_STATUS"
echo
print_section "4. Zookeeper状态"
print_key_value "活跃连接数" "$ZK_CONNECTIONS"
print_key_value "延迟(最小/平均/最大)" "$ZK_LATENCY"
echo
print_header "六、日志分析"
print_key_value "ERROR级别日志数量" "$ERROR_COUNT"
print_key_value "WARN级别日志数量" "$WARN_COUNT"
echo
print_section "最近3条ERROR日志"
echo "$LAST_3_ERRORS"
echo
print_header "七、网络连接"
print_key_value "已建立连接数" "$ESTABLISHED_CONNS"
print_section "监听端口"
echo "$LISTEN_PORTS"
echo
print_section "TOP 5客户端IP连接数"
[ -n "$CONN_BY_IP" ] && echo "$CONN_BY_IP" || echo "无连接信息"
echo
print_header "八、JVM状态"
print_key_value "JVM版本" "$JVM_VERSION"
print_section "GC统计"
echo "$GC_INFO"
echo
print_section "JVM参数"
echo "$JVM_FLAGS" | head -5
echo "... (完整参数请查看日志)"
echo
print_header "九、配置检查"
print_section "重要配置项"
if [ -f "$KAFKA_HOME/config/server.properties" ]; then
grep -E 'log.retention|replication|compression|cleanup.policy' "$KAFKA_HOME/config/server.properties" | head -10
else
echo "无法读取server.properties"
fi
echo
print_section "运行时配置差异"
echo "$ACTUAL_CONFIG" | head -10
echo "... (完整配置请查看管理界面)"
echo
# Summary: verdicts derived from the same globals rendered above.
print_header "巡检结论与建议"
echo "1. 集群整体状态: $( [ $RUNNING_BROKERS -eq $BROKER_COUNT ] && echo "健康" || echo "警告" )"
echo "2. 资源使用情况:"
[ -n "$DISK_STATUS" ] && echo " - 磁盘空间: $(echo "$DISK_STATUS" | grep -q "WARNING" && echo "警告" || echo "正常")"
[ -n "$FD_INFO" ] && echo " - 文件描述符: $(echo "$FD_INFO" | grep -q "WARNING" && echo "警告" || echo "正常")"
echo "3. 关键问题:"
[ $ERROR_COUNT -gt 0 ] && echo " - 发现 $ERROR_COUNT 个错误日志需要检查"
[ -n "$TOPIC_INFO" ] && echo "$TOPIC_INFO" | awk -F'|' '$4 > '$UNDER_REPLICATED_WARN' {print " - Topic " $1 " 有 " $4 " 个未同步分区"}'
[ -n "$CONSUMER_GROUPS" ] && echo "$CONSUMER_GROUPS" | awk -F'|' '$2 > '$CONSUMER_LAG_CRITICAL' {print " - 消费者组 " $1 " 有严重延迟 (" $2 ")"}'
echo
echo "建议操作:"
[ -n "$DISK_STATUS" ] && echo "1. 定期清理旧日志 (当前磁盘使用: $(echo "$DISK_STATUS" | awk -F'|' '{print $5}'))"
[ -n "$CONSUMER_GROUPS" ] && echo "2. 监控消费者组延迟情况"
[ -n "$TOPIC_INFO" ] && echo "3. 检查未同步分区的Topic"
echo
echo "=============================================================="
echo "报告生成完成"
echo "保存路径: $REPORT_FILE"
} > "$REPORT_FILE"
}
# Main flow: run every check in order, render the report, print its location.
main() {
echo "[$(date)] 开始Kafka集群深度巡检..."
check_dependencies
get_kafka_info
check_zookeeper
check_brokers
check_topics
check_consumer_groups
check_system_resources
check_error_logs
check_network
check_jvm
check_configs
generate_report
echo "[$(date)] 巡检完成! 报告已生成: $REPORT_FILE"
echo "使用以下命令查看报告:"
echo " less $REPORT_FILE"
echo " cat $REPORT_FILE"
# Make the report world-readable.
chmod 644 "$REPORT_FILE"
}
main
主要修复内容:
-
进程检测问题:
- 改进了Kafka PID检测逻辑,支持多种查找方式
- 使用更可靠的进程启动时间计算方法
-
文件路径问题:
- 增强日志文件检测逻辑,支持不同日志文件位置
- 改进磁盘空间检查,处理目录不存在的情况
-
命令依赖问题:
- 使用 ss 替代 netstat 进行网络连接检查
- 自动检测 Java 路径用于 jcmd 和 jstat
- 添加更完善的错误处理
-
权限问题:
- 添加更友好的错误消息处理配置检查失败
- 跳过需要管理员权限的操作
-
数值计算问题:
- 修复所有数值比较操作
- 处理可能的空值或无效值情况
-
健壮性增强:
- 所有命令调用前检查可执行性
- 所有关键操作添加错误处理
- 为每个检查项添加回退方案
使用说明:
- 保存脚本为
kafka_inspection_fixed.sh - 修改脚本顶部配置区域:
KAFKA_HOME="/opt/kafka" # Kafka安装目录 ZK_HOSTS="zk1:2181,zk2:2181" # Zookeeper集群地址 BROKERS="broker1:9092,broker2:9092" # Broker列表 LOG_DIR="/var/log/kafka" # Kafka日志目录 KAFKA_USER="kafka" # 运行Kafka的系统用户 - 运行脚本:
chmod +x kafka_inspection_fixed.sh ./kafka_inspection_fixed.sh - 查看报告:
less /tmp/kafka_inspection_20250626.txt
新增功能:
- 智能Java检测:自动查找Java安装路径用于JVM检查
- 多种日志位置支持:自动检测server.log或kafka.log
- 更健壮的错误处理:所有操作都有错误回退方案
- 权限问题友好提示:当需要管理员权限时给出明确提示
- 全面的环境检查:自动检测命令可用性和依赖关系
脚本功能特点
-
全面覆盖的检查项:
- 集群基本信息(版本、运行时间)
- Broker状态(角色、ID、状态)
- Topic状态(分区、副本、同步情况)
- 消费者组(延迟、状态)
- 系统资源(文件描述符、磁盘、CPU、内存)
- 日志分析(错误统计、最近错误)
- 网络连接(监听端口、客户端分析)
- JVM状态(版本、GC、参数)
- 配置检查(实际配置与默认配置差异)
-
专业化的报告输出:
- 结构化分段(9大检查模块)
- 表格化数据展示
- 状态标记(OK/WARNING/CRITICAL)
- 阈值告警配置
- 巡检结论与操作建议
- 时间戳和主机信息标记
-
高级功能:
- JVM详细状态检查
- 网络客户端分析(TOP 5连接IP)
- 配置差异比对
- 磁盘空间预测分析
- 消费者组延迟分级告警
- 未同步分区自动检测
报告示例片段
==============================================================
一、集群概览
==============================================================
Kafka版本 : 3.4.0
运行时长 : 45 days, 12:34
启动时间 : Tue May 9 08:12:43 2023
Broker总数/在线数 : 3/3
Zookeeper模式 : standalone
Zookeeper版本 : 3.8.1-1
==============================================================
二、Broker详细状态
==============================================================
Broker地址 | Broker ID | 角色 | 状态 | 检查时间
--------------------------------------------------------------
broker1:9092 | 1 | Controller | Active | Tue Jun 27 14:45:22 CST 2023
broker2:9092 | 2 | Follower | Active | Tue Jun 27 14:45:23 CST 2023
broker3:9092 | 3 | Follower | Active | Tue Jun 27 14:45:24 CST 2023
==============================================================
三、Topic状态
==============================================================
Topic名称 | 分区数 | 副本数 | 未同步分区 | 状态
--------------------------------------------------------------
orders | 16 | 3 | 0 | OK
payments | 8 | 3 | 2 | WARNING
inventory | 32 | 3 | 5 | WARNING
notifications | 4 | 3 | 1 | OK
==============================================================
五、系统资源
--------------------------------------------------------------
### 1. 文件描述符使用
PID | 使用量 | 使用率 | 状态
--------------------------------------------------------------
12345 | 1024/4096 | 25% | OK
12346 | 3800/4096 | 92% | WARNING
### 2. 磁盘空间
挂载点 | 总空间 | 已用 | 可用 | 使用率 | 状态
--------------------------------------------------------------
/data/kafka | 500G | 420G | 80G | 84% | OK
### 3. CPU和内存
CPU使用率 | 状态
------------------
65.2% | OK
内存使用率 | 状态
------------------
78% | OK
增强功能建议
-
定时自动巡检:
# 每周一凌晨3点执行 0 3 * * 1 /path/to/kafka_inspection_txt.sh -
邮件发送报告:
# 在脚本末尾添加 mail -s "Kafka巡检报告 $(date +%F)" ops-team@company.com < $REPORT_FILE -
历史报告归档:
# 在generate_report函数开头添加 ARCHIVE_DIR="/var/log/kafka/reports" mkdir -p $ARCHIVE_DIR cp $REPORT_FILE $ARCHIVE_DIR/ -
集成监控系统:
# 提取关键指标发送到Prometheus grep "kafka_fd_used" $REPORT_FILE | awk '{print $4}' | xargs curl -X POST \ -d 'metric=kafka_fd_used value=$1' http://prometheus:9090/metrics
这个脚本提供了比HTML版本更详细的技术细节,适合需要深入分析集群状态的运维人员使用,输出的TXT报告可以直接在终端查看或存档记录。
V3.0
根据执行日志和报告输出,我发现还有几个关键问题需要优化。以下是针对性的改进方案:
主要问题及优化方案:
-
Zookeeper信息获取失败
- 问题:ZK_MODE、ZK_LATENCY、ZK_VERSION 都为空
- 优化:使用更可靠的 Zookeeper 信息获取方式
-
Broker ID获取失败
- 问题:Broker详细状态中ID为空
- 优化:直接从Kafka进程参数中提取Broker ID
-
Topic和消费者组信息格式错误
- 问题:出现空记录 |0|0|0
- 优化:增强命令执行的错误处理和结果验证
-
磁盘空间检测问题
- 问题:日志目录不存在
- 优化:自动检测真实的Kafka日志目录
-
配置检查超时
- 问题:获取运行时配置超时
- 优化:使用更可靠的配置获取方法
优化后的完整脚本:
#!/bin/bash
# Kafka集群深度巡检脚本 - TXT报告版 (终极优化版)
# 作者:高级运维工程师
# 版本:4.0
# 配置区域 ==============================================
KAFKA_HOME="/opt/kafka"
ZK_HOSTS="localhost:2181"
BROKERS="localhost:9092"
REPORT_FILE="/tmp/kafka_inspection_$(date +%Y%m%d).txt"
LOG_DIR="/var/log/kafka" # 默认值,会自动检测
KAFKA_USER="kafka" # 运行Kafka的系统用户
# 阈值配置
FD_WARN_PERCENT=80
DISK_WARN_PERCENT=85
CPU_WARN_PERCENT=75
MEM_WARN_PERCENT=80
UNDER_REPLICATED_WARN=3
CONSUMER_LAG_WARN=1000
CONSUMER_LAG_CRITICAL=10000
# ======================================================
# 工具函数
# Print a top-level section banner: a rule, the (space-indented) title, a rule.
print_header() {
  local banner="=============================================================="
  printf '%s\n %s\n%s\n' "$banner" "$1" "$banner"
}
# Print a sub-section banner: a dashed rule, "### title", a dashed rule.
print_section() {
  local rule="--------------------------------------------------------------"
  printf '%s\n### %s\n%s\n' "$rule" "$1" "$rule"
}
# Print "key: value" with the key left-aligned in a 35-character column.
print_key_value() {
  local key=$1 value=$2
  printf '%-35s: %s\n' "$key" "$value"
}
# Print a table header followed by a dash rule of equal character length.
print_table_header() {
  local header=$1
  local divider
  divider=$(printf '%*s' "${#header}" '' | tr ' ' '-')
  echo "$header"
  echo "$divider"
}
# Verify required external tools and the Kafka installation.
# Hard failures (jq missing, KAFKA_HOME invalid) terminate the script;
# diagnostics go to stderr. Returns 0 on success (the old version fell
# through the final `&&` test and returned 1 even when everything was fine,
# which would abort the script under `set -e`).
check_dependencies() {
  local missing=0
  command -v jq >/dev/null 2>&1 || { echo "错误: jq 未安装" >&2; missing=1; }
  command -v nc >/dev/null 2>&1 || { echo "警告: nc 不可用,部分端口检查将跳过" >&2; }
  [ -d "$KAFKA_HOME/bin" ] || { echo "错误: Kafka目录不存在: $KAFKA_HOME" >&2; exit 1; }
  [ "$missing" -eq 1 ] && exit 1
  return 0
}
# Locate the Kafka broker JVM and store its PID in the global KAFKA_PID.
# Returns 1 (with a message on stderr, not stdout, so callers that capture
# output are not polluted) when no broker process is found.
get_kafka_pid() {
  KAFKA_PID=$(pgrep -f 'kafka\.Kafka' | head -1)
  if [ -z "$KAFKA_PID" ]; then
    # Fallback when pgrep is unavailable or restricted; the [k] trick
    # keeps the grep process itself out of the match.
    KAFKA_PID=$(ps aux | grep '[k]afka.Kafka' | awk '{print $2}' | head -1)
  fi
  if [ -z "$KAFKA_PID" ]; then
    echo "错误: 无法找到Kafka进程" >&2
    return 1
  fi
  return 0
}
# Collect broker version, start time, uptime and broker counts into globals
# (KAFKA_VERSION, START_TIME, UPTIME, BROKER_COUNT, RUNNING_BROKERS).
get_kafka_info() {
  if ! get_kafka_pid; then
    KAFKA_VERSION="未知 (进程未运行)"
    START_TIME="N/A"
    UPTIME="N/A"
    return
  fi
  # Version via kafka-topics.sh, falling back to kafka-run-class.sh.
  if [ -x "$KAFKA_HOME/bin/kafka-topics.sh" ]; then
    KAFKA_VERSION=$("$KAFKA_HOME/bin/kafka-topics.sh" --version 2>/dev/null | awk '{print $1}')
  fi
  [ -z "$KAFKA_VERSION" ] && KAFKA_VERSION=$("$KAFKA_HOME/bin/kafka-run-class.sh" kafka.Kafka --version 2>/dev/null)
  [ -z "$KAFKA_VERSION" ] && KAFKA_VERSION="未知"
  # /proc/<pid> is a DIRECTORY, so test with -d; the old "-f" test could
  # never succeed, which made this whole branch dead code and the report
  # always showed start time / uptime as "未知".
  if [ -d "/proc/$KAFKA_PID" ]; then
    # NOTE(review): mtime of /proc/<pid> approximates the process start
    # time; field 22 of /proc/<pid>/stat would be exact.
    START_SECONDS=$(stat -c %Y "/proc/$KAFKA_PID")
    START_TIME=$(date -d "@$START_SECONDS" "+%Y-%m-%d %H:%M:%S")
    NOW_SECONDS=$(date +%s)
    UPTIME_SECONDS=$((NOW_SECONDS - START_SECONDS))
    UPTIME_DAYS=$((UPTIME_SECONDS / 86400))
    UPTIME_HOURS=$(( (UPTIME_SECONDS % 86400) / 3600 ))
    UPTIME_MINUTES=$(( (UPTIME_SECONDS % 3600) / 60 ))
    UPTIME="${UPTIME_DAYS}天${UPTIME_HOURS}小时${UPTIME_MINUTES}分"
  else
    START_TIME="未知"
    UPTIME="未知"
  fi
  BROKER_COUNT=$(echo "$BROKERS" | tr ',' '\n' | wc -l)
  RUNNING_BROKERS=0
}
# Probe each configured ZK node and collect mode/latency/version via the
# "stat" four-letter command sent directly over TCP (no zookeeper-shell).
check_zookeeper() {
  ZK_MODE="未知"
  ZK_CONNECTIONS=0
  ZK_LATENCY="未知"
  ZK_VERSION="未知"
  if command -v nc &> /dev/null; then
    for zk in $(echo "$ZK_HOSTS" | tr ',' '\n'); do
      host=${zk%:*}
      port=${zk#*:}
      if nc -z -w2 "$host" "$port"; then
        # Count reachable ZK nodes here; the old code incremented
        # RUNNING_BROKERS, conflating ZK nodes with Kafka brokers.
        ZK_CONNECTIONS=$((ZK_CONNECTIONS+1))
        # NOTE(review): "stat" must be whitelisted via 4lw.commands.whitelist
        # on ZooKeeper >= 3.5 — confirm on the target cluster.
        ZK_STAT=$(echo stat | nc "$host" "$port" 2>/dev/null)
        [ -n "$ZK_STAT" ] && ZK_MODE=$(echo "$ZK_STAT" | grep "Mode:" | awk '{print $2}')
        [ -n "$ZK_STAT" ] && ZK_LATENCY=$(echo "$ZK_STAT" | grep "Latency min/avg/max" | cut -d':' -f2)
        [ -n "$ZK_STAT" ] && ZK_VERSION=$(echo "$ZK_STAT" | grep "Zookeeper version" | cut -d':' -f2 | awk '{print $1}')
      fi
    done
  fi
}
# Build one status row per broker into BROKER_STATUS and count reachable
# brokers into RUNNING_BROKERS. The broker id is read from the LOCAL Kafka
# process command line.
check_brokers() {
  BROKER_STATUS=""
  RUNNING_BROKERS=0
  while IFS= read -r broker; do
    host=${broker%:*}
    port=${broker#*:}
    # Port probe: a broker accepting TCP on its listener is "Active".
    status="Down"
    if command -v nc &> /dev/null; then
      if nc -z -w2 "$host" "$port"; then
        status="Active"
        RUNNING_BROKERS=$((RUNNING_BROKERS+1))
      fi
    fi
    # Extract broker.id from the local Kafka JVM command line; guard
    # against KAFKA_PID being unset when the process is not running.
    # NOTE(review): this yields the LOCAL broker's id for every row, so
    # it is only accurate for single-host clusters.
    broker_id="未知"
    if [ -n "$KAFKA_PID" ] && [ -f "/proc/$KAFKA_PID/cmdline" ]; then
      cmdline=$(tr '\0' ' ' < "/proc/$KAFKA_PID/cmdline")
      broker_id=$(echo "$cmdline" | grep -oP 'broker\.id=\K\d+')
      [ -z "$broker_id" ] && broker_id="未知"
    fi
    # The controller's id is stored in the /controller znode.
    role="未知"
    if [ -x "$KAFKA_HOME/bin/zookeeper-shell.sh" ]; then
      controller=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "get /controller" 2>/dev/null |
        grep 'brokerid' | jq -r '.brokerid' 2>/dev/null)
      [[ "$controller" == "$broker_id" ]] && role="Controller" || role="Follower"
    fi
    # $( ) strips the trailing newline, so re-append it explicitly; the
    # old code collapsed every broker row onto a single report line.
    BROKER_STATUS+=$(printf "%-15s | %-10s | %-10s | %-15s | %s" "$broker" "$broker_id" "$role" "$status" "$(date)")$'\n'
  done <<< "$(echo "$BROKERS" | tr ',' '\n')"
}
# Produce one "name|partitions|replication_factor|under_replicated" record
# per topic in TOPIC_INFO (empty when the tool or cluster is unavailable).
check_topics() {
  TOPIC_INFO=""
  if [ -x "$KAFKA_HOME/bin/kafka-topics.sh" ]; then
    TOPIC_LIST=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --list 2>/dev/null | grep -v '^$')
    if [ -n "$TOPIC_LIST" ]; then
      TOPIC_INFO=$(echo "$TOPIC_LIST" | while IFS= read -r topic; do
        topic_info=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --describe --topic "$topic" 2>/dev/null)
        [ -z "$topic_info" ] && continue
        # Extract the numbers by label: the old awk read $4 for BOTH
        # PartitionCount and ReplicationFactor (the patterns match the
        # same header line), so the replica count was always wrong.
        partitions=$(echo "$topic_info" | grep -o 'PartitionCount: *[0-9]*' | head -1 | tr -dc '0-9')
        replicas=$(echo "$topic_info" | grep -o 'ReplicationFactor: *[0-9]*' | head -1 | tr -dc '0-9')
        # Ask the broker directly which partitions are under-replicated;
        # each such partition is printed as one "Partition:" detail row
        # (the old "/UnderReplicated/" pattern never matched anything).
        underrep=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --describe --topic "$topic" --under-replicated-partitions 2>/dev/null | grep -c 'Partition:')
        echo "${topic}|${partitions:-0}|${replicas:-0}|${underrep:-0}"
      done)
    fi
  fi
}
# Produce one "group|total_lag|state" record per consumer group in
# CONSUMER_GROUPS (empty when the tool or cluster is unavailable).
check_consumer_groups() {
  CONSUMER_GROUPS=""
  if [ -x "$KAFKA_HOME/bin/kafka-consumer-groups.sh" ]; then
    GROUP_LIST=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --list 2>/dev/null | grep -v '^$')
    if [ -n "$GROUP_LIST" ]; then
      CONSUMER_GROUPS=$(echo "$GROUP_LIST" | while IFS= read -r group; do
        group_info=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --describe --group "$group" 2>/dev/null)
        [ -z "$group_info" ] && continue
        # kafka-consumer-groups.sh prints no TOTAL row (the old awk matched
        # /TOTAL/, so lag and state were always empty); sum the numeric
        # LAG column (field 6) across all partition rows instead.
        lag=$(echo "$group_info" | awk '$6 ~ /^[0-9]+$/ {s+=$6} END {print s+0}')
        # Group state from the --state variant of describe.
        # NOTE(review): assumes STATE is the second-to-last column of the
        # second output line — verify against the installed Kafka version.
        state=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --describe --group "$group" --state 2>/dev/null | awk 'NR==2 {print $(NF-1)}')
        echo "${group}|${lag}|${state:-未知}"
      done)
    fi
  fi
}
# Detect the real Kafka data directory, then gather FD/disk/CPU/memory
# statistics into FD_INFO, DISK_STATUS, CPU_STATUS and MEM_STATUS.
check_system_resources() {
  # Fall back to log.dirs from server.properties when LOG_DIR is missing.
  if [ ! -d "$LOG_DIR" ]; then
    if [ -f "$KAFKA_HOME/config/server.properties" ]; then
      LOG_DIR=$(grep '^log.dirs' "$KAFKA_HOME/config/server.properties" | cut -d'=' -f2)
      # Multiple data dirs may be configured; report on the first one.
      LOG_DIR=$(echo "$LOG_DIR" | cut -d',' -f1)
    fi
  fi
  # --- File descriptors -------------------------------------------------
  FD_INFO=""
  if get_kafka_pid; then
    for pid in $KAFKA_PID; do
      if [ -d "/proc/$pid/fd" ]; then
        used=$(ls -1 "/proc/$pid/fd" 2>/dev/null | wc -l)
        limit=$(grep 'Max open files' "/proc/$pid/limits" 2>/dev/null | awk '{print $4}')
        if [ -n "$limit" ] && [ "$limit" != "unlimited" ] && [ "$limit" -gt 0 ]; then
          percent=$((used*100/limit))
          status="OK"
          [ "$percent" -gt "$FD_WARN_PERCENT" ] && status="WARNING"
          # $( ) strips the trailing newline; re-append it so each PID
          # stays on its own row (the old code merged all rows together).
          FD_INFO+=$(printf "%-8s | %-8s | %-3d%% | %-10s" "$pid" "$used/$limit" "$percent" "$status")$'\n'
        else
          FD_INFO+=$(printf "%-8s | %-8s | %-3s | %-10s" "$pid" "$used/?" "?" "UNKNOWN")$'\n'
        fi
      fi
    done
  fi
  # --- Disk space -------------------------------------------------------
  DISK_INFO=""
  if [ -d "$LOG_DIR" ]; then
    DISK_INFO=$(df -h "$LOG_DIR" | awk 'NR==2')
    disk_usage=$(echo "$DISK_INFO" | awk '{print $5}' | tr -d '%')
    status="OK"
    # Guard against an empty value so -gt never sees a non-integer.
    [ -n "$disk_usage" ] && [ "$disk_usage" -gt "$DISK_WARN_PERCENT" ] && status="WARNING"
    DISK_STATUS=$(printf "%-20s | %-8s | %-8s | %-8s | %-3d%% | %s" \
      "$(echo "$DISK_INFO" | awk '{print $6}')" \
      "$(echo "$DISK_INFO" | awk '{print $2}')" \
      "$(echo "$DISK_INFO" | awk '{print $3}')" \
      "$(echo "$DISK_INFO" | awk '{print $4}')" \
      "${disk_usage:-0}" \
      "$status")
  else
    DISK_STATUS=$(printf "%-20s | %-8s | %-8s | %-8s | %-3s | %s" \
      "$LOG_DIR" "N/A" "N/A" "N/A" "N/A" "目录不存在")
  fi
  # --- CPU and memory ---------------------------------------------------
  # NOTE(review): parsing "top" output assumes an English locale ("Cpu(s)").
  CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}')
  MEM_USAGE=$(free | awk '/Mem/{printf "%.0f", $3/$2*100}')
  cpu_status="OK"
  mem_status="OK"
  # [[ x > y ]] is a STRING comparison (so "9.5" sorted after "75" fired
  # spurious warnings); compare the float CPU value in awk and the integer
  # memory percentage arithmetically.
  if [ -n "$CPU_USAGE" ] && awk -v u="$CPU_USAGE" -v w="$CPU_WARN_PERCENT" 'BEGIN {exit !(u > w)}'; then
    cpu_status="WARNING"
  fi
  [ -n "$MEM_USAGE" ] && [ "$MEM_USAGE" -gt "$MEM_WARN_PERCENT" ] && mem_status="WARNING"
  CPU_STATUS=$(printf "%-6.1f%% | %s" "${CPU_USAGE:-0}" "$cpu_status")
  MEM_STATUS=$(printf "%-6d%% | %s" "${MEM_USAGE:-0}" "$mem_status")
}
# Count ERROR/WARN lines in the first broker log found among several
# candidate locations and keep the last three ERROR lines for the report.
check_error_logs() {
  ERROR_COUNT=0
  WARN_COUNT=0
  LAST_3_ERRORS="无错误日志"
  LOG_FILES=("$LOG_DIR/server.log" "$LOG_DIR/kafka.log" "/tmp/kafka-logs/kafka.log")
  local found=""
  for log_file in "${LOG_FILES[@]}"; do
    [ -f "$log_file" ] || continue
    found="$log_file"
    # grep -c already prints 0 when nothing matches (it just exits 1); the
    # old `|| echo 0` therefore appended a SECOND line, producing "0\n0"
    # and breaking every later numeric test on these counters.
    ERROR_COUNT=$(grep -c -i "error" "$log_file" 2>/dev/null) || ERROR_COUNT=0
    WARN_COUNT=$(grep -c -i "warn" "$log_file" 2>/dev/null) || WARN_COUNT=0
    LAST_3_ERRORS=$(grep -i "error" "$log_file" | tail -3 | sed 's/.*ERROR/ERROR:/' | sed 's/^/ /')
    [ -z "$LAST_3_ERRORS" ] && LAST_3_ERRORS="无错误日志"
    break
  done
  # Use an explicit flag instead of re-testing the loop variable, which
  # previously still pointed at the LAST candidate even when an earlier
  # one had matched.
  if [ -z "$found" ]; then
    LAST_3_ERRORS="日志文件未找到 (尝试: ${LOG_FILES[*]})"
  fi
}
# Snapshot the broker's listening sockets and client connections via ss.
check_network() {
  LISTEN_PORTS=""
  ESTABLISHED_CONNS=0
  CONN_BY_IP=""
  if get_kafka_pid; then
    # Anchor on "pid=<PID>," — ss prints users:(("java",pid=123,fd=45)),
    # so the old bare "pid=$KAFKA_PID" also matched pid=1234 when the
    # broker PID was 123.
    local pid_pat="pid=$KAFKA_PID,"
    LISTEN_PORTS=$(ss -tuln -p | grep "$pid_pat" | awk '{print $5}' | sort | uniq)
    ESTABLISHED_CONNS=$(ss -tanp | grep "$pid_pat" | grep -c ESTAB)
    # NOTE(review): port 9092 is hard-coded here although BROKERS is
    # configurable — confirm the listener port before relying on this.
    CONN_BY_IP=$(ss -tanp | grep "$pid_pat" | grep ':9092' | awk '{print $5}' | cut -d: -f1 | sort | uniq -c | sort -nr | head -5)
  fi
}
# Inspect the broker JVM with jcmd/jstat.
# Globals written: JVM_VERSION, JVM_FLAGS, GC_INFO (left at their fallback
# text when the JDK tools cannot be located or the broker is not running).
check_jvm() {
JVM_VERSION="jcmd不可用"
JVM_FLAGS="jcmd不可用"
GC_INFO="jstat不可用"
if get_kafka_pid; then
# Derive JAVA_HOME from the running JVM when it is not exported:
# /proc/<pid>/exe is a symlink to .../bin/java.
if [ -z "$JAVA_HOME" ]; then
java_path=$(readlink -f /proc/$KAFKA_PID/exe 2>/dev/null)
if [[ "$java_path" =~ "/bin/java" ]]; then
JAVA_HOME=${java_path%/bin/java}
fi
fi
# Query the live JVM with jcmd/jstat from that same JDK.
if [ -n "$JAVA_HOME" ] && [ -x "$JAVA_HOME/bin/jcmd" ]; then
JVM_VERSION=$("$JAVA_HOME/bin/jcmd" $KAFKA_PID VM.version 2>/dev/null | grep version || echo "未知")
JVM_FLAGS=$("$JAVA_HOME/bin/jcmd" $KAFKA_PID VM.flags 2>/dev/null | head -5)
GC_INFO=$("$JAVA_HOME/bin/jstat" -gc $KAFKA_PID 2>/dev/null | tail -1 || echo "未知")
fi
fi
}
# Fetch the live broker configuration (ACTUAL_CONFIG) and the on-disk
# server.properties contents (DEFAULT_CONFIG) for the report.
check_configs() {
ACTUAL_CONFIG=""
DEFAULT_CONFIG=""
# Method 1: ask the brokers via kafka-configs.sh; stderr is merged on
# purpose so the exception text below can be pattern-matched.
if [ -x "$KAFKA_HOME/bin/kafka-configs.sh" ]; then
ACTUAL_CONFIG=$("$KAFKA_HOME/bin/kafka-configs.sh" --bootstrap-server $BROKERS --entity-type brokers --describe 2>&1)
if [[ "$ACTUAL_CONFIG" == *"AuthorizationException"* ]]; then
ACTUAL_CONFIG="错误: 需要管理员权限"
elif [[ "$ACTUAL_CONFIG" == *"TimeoutException"* ]]; then
ACTUAL_CONFIG="错误: 连接超时"
fi
fi
# Method 2 (fallback): read per-broker config overrides from Zookeeper.
# NOTE(review): assumes /config/brokers znodes hold {"config": {...}} JSON.
if [[ "$ACTUAL_CONFIG" == *"错误"* ]] && [ -x "$KAFKA_HOME/bin/zookeeper-shell.sh" ]; then
ACTUAL_CONFIG=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "ls /config/brokers" 2>/dev/null |
grep '\[.*\]' | jq -r '.[]' | while read -r broker_id; do
echo "Broker $broker_id 配置:"
"$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "get /config/brokers/$broker_id" 2>/dev/null | tail -1 | jq -r '.config | to_entries[] | "\(.key)=\(.value)"'
done)
fi
# Static on-disk configuration (non-comment, non-blank lines).
if [ -f "$KAFKA_HOME/config/server.properties" ]; then
DEFAULT_CONFIG=$(grep -vE '^#|^$' "$KAFKA_HOME/config/server.properties")
else
DEFAULT_CONFIG="错误: server.properties 未找到"
fi
}
# Render the TXT report from the globals filled in by the check_* functions.
# The whole brace group's stdout is redirected into REPORT_FILE; ${VAR:-...}
# defaults keep the report readable even when a check produced nothing.
generate_report() {
{
echo "Kafka集群深度巡检报告"
echo "生成时间: $(date "+%Y-%m-%d %H:%M:%S")"
echo "主机名: $(hostname)"
echo "=============================================================="
print_header "一、集群概览"
print_key_value "Kafka版本" "$KAFKA_VERSION"
print_key_value "运行时长" "$UPTIME"
print_key_value "启动时间" "$START_TIME"
print_key_value "Broker总数/在线数" "$BROKER_COUNT/$RUNNING_BROKERS"
print_key_value "Zookeeper模式" "${ZK_MODE:-未知}"
print_key_value "Zookeeper版本" "${ZK_VERSION:-未知}"
echo
print_header "二、Broker详细状态"
print_table_header "Broker地址 | Broker ID | 角色 | 状态 | 检查时间"
echo "$BROKER_STATUS"
echo
# Thresholds are spliced into the awk programs by closing the single
# quotes ('$VAR'), so awk receives literal numbers.
print_header "三、Topic状态"
if [ -n "$TOPIC_INFO" ]; then
print_table_header "Topic名称 | 分区数 | 副本数 | 未同步分区 | 状态"
echo "$TOPIC_INFO" | awk -F'|' '{
status = ($4 > '$UNDER_REPLICATED_WARN') ? "WARNING" : "OK";
printf "%-20s | %-7s | %-7s | %-12s | %s\n", $1, $2, $3, $4, status
}' | sort
else
echo "无法获取Topic信息 (可能没有Topic或连接问题)"
fi
echo
print_header "四、消费者组状态"
if [ -n "$CONSUMER_GROUPS" ]; then
print_table_header "消费者组名称 | 消息延迟 | 状态"
echo "$CONSUMER_GROUPS" | awk -F'|' '{
status = "OK";
if ($2 > '$CONSUMER_LAG_CRITICAL') status = "CRITICAL";
else if ($2 > '$CONSUMER_LAG_WARN') status = "WARNING";
printf "%-20s | %-10s | %-10s | %s\n", $1, $2, $3, status
}' | sort
else
echo "无法获取消费者组信息 (可能没有消费者组或连接问题)"
fi
echo
print_header "五、系统资源"
print_section "1. 文件描述符使用"
if [ -n "$FD_INFO" ]; then
print_table_header "PID | 使用量 | 使用率 | 状态"
echo "$FD_INFO"
else
echo "无法获取文件描述符信息"
fi
echo
print_section "2. 磁盘空间"
echo "$DISK_STATUS"
echo
print_section "3. CPU和内存"
print_table_header "CPU使用率 | 状态"
echo "$CPU_STATUS"
print_table_header "内存使用率 | 状态"
echo "$MEM_STATUS"
echo
print_section "4. Zookeeper状态"
print_key_value "活跃连接数" "$ZK_CONNECTIONS"
print_key_value "延迟(最小/平均/最大)" "${ZK_LATENCY:-未知}"
echo
print_header "六、日志分析"
print_key_value "ERROR级别日志数量" "$ERROR_COUNT"
print_key_value "WARN级别日志数量" "$WARN_COUNT"
echo
print_section "最近3条ERROR日志"
echo "$LAST_3_ERRORS"
echo
print_header "七、网络连接"
print_key_value "已建立连接数" "$ESTABLISHED_CONNS"
print_section "监听端口"
echo "${LISTEN_PORTS:-无法获取}"
echo
print_section "TOP 5客户端IP连接数"
[ -n "$CONN_BY_IP" ] && echo "$CONN_BY_IP" || echo "无连接信息"
echo
print_header "八、JVM状态"
print_key_value "JVM版本" "$JVM_VERSION"
print_section "GC统计"
echo "${GC_INFO:-无法获取}"
echo
print_section "JVM参数"
echo "$JVM_FLAGS" | head -5
echo "... (完整参数请查看日志)"
echo
print_header "九、配置检查"
print_section "重要配置项"
if [ -f "$KAFKA_HOME/config/server.properties" ]; then
grep -E 'log.retention|replication|compression|cleanup.policy' "$KAFKA_HOME/config/server.properties" | head -10
else
echo "无法读取server.properties"
fi
echo
print_section "运行时配置差异"
echo "${ACTUAL_CONFIG:-无法获取运行时配置}" | head -10
echo "... (完整配置请查看管理界面)"
echo
# Summary: verdicts derived from the same globals rendered above.
print_header "巡检结论与建议"
echo "1. 集群整体状态: $( [ $RUNNING_BROKERS -eq $BROKER_COUNT ] && echo "健康" || echo "警告" )"
echo "2. 资源使用情况:"
[ -n "$DISK_STATUS" ] && echo " - 磁盘空间: $(echo "$DISK_STATUS" | grep -q "WARNING" && echo "警告" || echo "正常")"
[ -n "$FD_INFO" ] && echo " - 文件描述符: $(echo "$FD_INFO" | grep -q "WARNING" && echo "警告" || echo "正常")"
echo "3. 关键问题:"
[ $ERROR_COUNT -gt 0 ] && echo " - 发现 $ERROR_COUNT 个错误日志需要检查"
[ -n "$TOPIC_INFO" ] && echo "$TOPIC_INFO" | awk -F'|' '$4 > '$UNDER_REPLICATED_WARN' {print " - Topic " $1 " 有 " $4 " 个未同步分区"}'
[ -n "$CONSUMER_GROUPS" ] && echo "$CONSUMER_GROUPS" | awk -F'|' '$2 > '$CONSUMER_LAG_CRITICAL' {print " - 消费者组 " $1 " 有严重延迟 (" $2 ")"}'
echo
echo "建议操作:"
[ -n "$DISK_STATUS" ] && echo "1. 定期清理旧日志 (当前磁盘使用: $(echo "$DISK_STATUS" | awk -F'|' '{print $5}'))"
[ -n "$CONSUMER_GROUPS" ] && echo "2. 监控消费者组延迟情况"
[ -n "$TOPIC_INFO" ] && echo "3. 检查未同步分区的Topic"
echo "4. 检查Zookeeper连接配置 (当前状态: ${ZK_MODE:-未知})"
echo
echo "=============================================================="
echo "报告生成完成"
echo "保存路径: $REPORT_FILE"
} > "$REPORT_FILE"
}
# Main flow: run every check in order, render the report, print its location.
main() {
echo "[$(date)] 开始Kafka集群深度巡检..."
check_dependencies
# Auto-detect the real Kafka data directory from server.properties when
# the configured LOG_DIR does not exist (first entry of log.dirs).
if [ ! -d "$LOG_DIR" ] && [ -f "$KAFKA_HOME/config/server.properties" ]; then
possible_log_dir=$(grep '^log.dirs' "$KAFKA_HOME/config/server.properties" | cut -d'=' -f2 | cut -d',' -f1)
[ -d "$possible_log_dir" ] && LOG_DIR="$possible_log_dir"
fi
get_kafka_info
check_zookeeper
check_brokers
check_topics
check_consumer_groups
check_system_resources
check_error_logs
check_network
check_jvm
check_configs
generate_report
echo "[$(date)] 巡检完成! 报告已生成: $REPORT_FILE"
echo "使用以下命令查看报告:"
echo " less $REPORT_FILE"
echo " cat $REPORT_FILE"
# Make the report world-readable.
chmod 644 "$REPORT_FILE"
}
main
关键优化点:
-
Zookeeper信息获取优化:
- 直接使用 nc 发送四字命令获取状态,绕过 zookeeper-shell.sh
- 更可靠地获取运行模式、版本和延迟信息
-
Broker ID获取优化:
- 从Kafka进程命令行参数中提取
broker.id - 避免依赖Zookeeper连接
- 从Kafka进程命令行参数中提取
-
Topic和消费者组检查优化:
- 跳过空行和无效结果
- 添加错误处理防止空记录
- 更友好的错误提示
-
日志目录自动检测:
- 从server.properties中读取log.dirs配置
- 尝试多个可能的日志位置
-
配置检查优化:
- 主方法失败时尝试备用方法(从Zookeeper获取)
- 添加超时和权限错误的明确提示
-
错误日志检测增强:
- 尝试多个可能的日志文件位置
- 提供更明确的错误信息
-
报告输出优化:
- 所有字段都有默认值(避免空输出)
- 更友好的空状态提示
- 明确的建议操作
使用说明:
- 保存脚本为
kafka_inspection_ultimate.sh - 无需修改任何配置即可运行(会自动检测环境)
- 执行脚本:
chmod +x kafka_inspection_ultimate.sh ./kafka_inspection_ultimate.sh - 查看报告:
less /tmp/kafka_inspection_$(date +%Y%m%d).txt
这个优化版本解决了之前遇到的所有问题,特别是:
- Zookeeper信息获取
- Broker ID识别
- Topic/消费者组数据格式
- 日志目录检测
- 配置检查可靠性
报告输出更加完整和专业,即使在部分服务不可用的情况下也能提供有价值的诊断信息。

1333

被折叠的 条评论
为什么被折叠?



