Kafka巡检脚本,输出为结构化的TXT文本报告

V2.0

#!/bin/bash
# Kafka cluster deep-inspection script - TXT report edition (fixed build)
# Author: senior operations engineer
# Version: 3.0
#
# Produces a structured plain-text inspection report at $REPORT_FILE.

# Configuration ========================================
KAFKA_HOME="/opt/kafka"
ZK_HOSTS="localhost:2181"
BROKERS="localhost:9092"
REPORT_FILE="/tmp/kafka_inspection_$(date +%Y%m%d).txt"
LOG_DIR="/var/log/kafka"
KAFKA_USER="kafka"  # system user that runs the Kafka broker

# Alert thresholds (percentages unless noted otherwise)
FD_WARN_PERCENT=80
DISK_WARN_PERCENT=85
CPU_WARN_PERCENT=75
MEM_WARN_PERCENT=80
UNDER_REPLICATED_WARN=3        # max tolerated under-replicated partitions per topic
CONSUMER_LAG_WARN=1000         # consumer lag (messages) -> WARNING
CONSUMER_LAG_CRITICAL=10000    # consumer lag (messages) -> CRITICAL
# ======================================================

# 工具函数
print_header() {
    # Emit a top-level section banner: '=' rule, " title", '=' rule.
    local banner="=============================================================="
    printf '%s\n %s\n%s\n' "$banner" "$1" "$banner"
}

print_section() {
    # Emit a sub-section banner: dashed rule, "### title", dashed rule.
    local rule="--------------------------------------------------------------"
    printf '%s\n### %s\n%s\n' "$rule" "$1" "$rule"
}

print_key_value() {
    # Render "key : value" with the key left-padded to a 35-char column.
    local key=$1 value=$2
    printf '%-35s: %s\n' "$key" "$value"
}

print_table_header() {
    # Print a table header line followed by a dash divider of equal length.
    # $1 - header text
    local header=$1
    local divider
    # Pass an explicit empty string so printf pads it to ${#header} spaces;
    # the original relied on a missing argument, and the unquoted width
    # would word-split under a modified IFS. Splitting the declaration from
    # the assignment also avoids masking the substitution's exit status.
    divider=$(printf '%*s' "${#header}" '' | tr ' ' '-')
    echo "$header"
    echo "$divider"
}

# 检查依赖项
check_dependencies() {
    # Verify required external tools and the Kafka installation exist.
    # Exits 1 on a hard failure; nc is optional (port probes are skipped).
    # Diagnostics go to stderr so they never pollute captured output
    # (the original wrote them to stdout).
    local missing=0
    command -v jq >/dev/null 2>&1 || { echo "错误: jq 未安装" >&2; missing=1; }
    command -v nc >/dev/null 2>&1 || echo "警告: nc 不可用,部分端口检查将跳过" >&2
    [ -d "$KAFKA_HOME/bin" ] || { echo "错误: Kafka目录不存在: $KAFKA_HOME" >&2; exit 1; }
    if [ "$missing" -eq 1 ]; then
        exit 1
    fi
    # Explicit success so the function does not return 1 when missing=0
    # (the original's trailing "[ ... ] && exit 1" did).
    return 0
}

# 获取Kafka PID
get_kafka_pid() {
    # Locate the Kafka broker JVM and store its PID in the global KAFKA_PID.
    # Returns 0 on success, 1 (with a message on stderr) when not found.
    KAFKA_PID=$(pgrep -f 'kafka\.Kafka' | head -1)

    if [ -z "$KAFKA_PID" ]; then
        # Fallback for systems where pgrep is restricted; the [k] trick
        # keeps the grep process itself out of the match.
        KAFKA_PID=$(ps aux | grep '[k]afka.Kafka' | awk '{print $2}' | head -1)
    fi

    if [ -z "$KAFKA_PID" ]; then
        # stderr, not stdout: callers may capture this function's output.
        echo "错误: 无法找到Kafka进程" >&2
        return 1
    fi
    return 0
}

# 获取Kafka基本信息
get_kafka_info() {
    # Collect broker version, start time and uptime into globals:
    # KAFKA_VERSION, START_TIME, UPTIME, BROKER_COUNT, RUNNING_BROKERS.
    if ! get_kafka_pid; then
        KAFKA_VERSION="未知 (进程未运行)"
        START_TIME="N/A"
        UPTIME="N/A"
        return
    fi

    KAFKA_VERSION=$("$KAFKA_HOME/bin/kafka-topics.sh" --version 2>/dev/null | awk '{print $1}')
    [ -z "$KAFKA_VERSION" ] && KAFKA_VERSION=$("$KAFKA_HOME/bin/kafka-run-class.sh" kafka.Kafka --version 2>/dev/null)
    [ -z "$KAFKA_VERSION" ] && KAFKA_VERSION="未知"

    # /proc/<pid> is a directory, so test with -d: the original used -f,
    # which never matched, leaving the uptime branch unreachable.
    if [ -d "/proc/$KAFKA_PID" ]; then
        # mtime of /proc/<pid> approximates the process start time; stat
        # once and reuse the value (the original stat'ed twice).
        START_SECONDS=$(stat -c %Y "/proc/$KAFKA_PID")
        START_TIME=$(date -d "@$START_SECONDS" "+%Y-%m-%d %H:%M:%S")
        NOW_SECONDS=$(date +%s)
        UPTIME_SECONDS=$((NOW_SECONDS - START_SECONDS))
        UPTIME_DAYS=$((UPTIME_SECONDS / 86400))
        UPTIME_HOURS=$(( (UPTIME_SECONDS % 86400) / 3600 ))
        UPTIME_MINUTES=$(( (UPTIME_SECONDS % 3600) / 60 ))
        # The original omitted the "天" (days) unit label between the day
        # count and the hours.
        UPTIME="${UPTIME_DAYS}天${UPTIME_HOURS}小时${UPTIME_MINUTES}分"
    else
        START_TIME="未知"
        UPTIME="未知"
    fi

    BROKER_COUNT=$(echo "$BROKERS" | tr ',' '\n' | wc -l)
    RUNNING_BROKERS=0
}

# 检查Zookeeper状态
check_zookeeper() {
    # Probe the Zookeeper ensemble and populate ZK_MODE, ZK_CONNECTIONS,
    # ZK_LATENCY and ZK_VERSION. Broker liveness is counted in
    # check_brokers; the original wrongly bumped RUNNING_BROKERS for every
    # reachable ZK host, and never set ZK_CONNECTIONS at all.
    ZK_MODE="未知"
    ZK_CONNECTIONS=0
    ZK_LATENCY="未知"
    ZK_VERSION="未知"

    local zk host port stat_out
    if command -v nc &> /dev/null; then
        for zk in $(echo "$ZK_HOSTS" | tr ',' '\n'); do
            host=${zk%:*}
            port=${zk#*:}
            if nc -z -w2 "$host" "$port"; then
                ZK_CONNECTIONS=$((ZK_CONNECTIONS + 1))
            fi
        done
    fi

    # One zookeeper-shell round trip instead of three identical ones.
    if [ -x "$KAFKA_HOME/bin/zookeeper-shell.sh" ]; then
        stat_out=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "stat" 2>/dev/null)
        ZK_MODE=$(echo "$stat_out" | grep "Mode:" | cut -d' ' -f2)
        ZK_LATENCY=$(echo "$stat_out" | grep "Latency min/avg/max" | cut -d':' -f2)
        ZK_VERSION=$(echo "$stat_out" | grep "Zookeeper version" | cut -d':' -f2)
        # Keep the "未知" placeholders instead of blanking them on failure.
        [ -n "$ZK_MODE" ] || ZK_MODE="未知"
        [ -n "$ZK_LATENCY" ] || ZK_LATENCY="未知"
        [ -n "$ZK_VERSION" ] || ZK_VERSION="未知"
    fi
}

# 检查Broker状态
check_brokers() {
    # Build BROKER_STATUS (one table row per broker) and count reachable
    # brokers into RUNNING_BROKERS (initialised in get_kafka_info; the
    # original never incremented it anywhere for brokers).
    BROKER_STATUS=""
    local broker host port status broker_id role controller

    # The controller id is cluster-wide; fetch it once, not per broker.
    controller=""
    if [ -x "$KAFKA_HOME/bin/zookeeper-shell.sh" ]; then
        controller=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "get /controller" 2>/dev/null |
            grep 'brokerid' | jq -r '.brokerid' 2>/dev/null)
    fi

    while IFS= read -r broker; do
        [ -z "$broker" ] && continue
        host=${broker%:*}
        port=${broker#*:}

        status="Down"
        if command -v nc &> /dev/null && nc -z -w2 "$host" "$port"; then
            status="Active"
            RUNNING_BROKERS=$((RUNNING_BROKERS + 1))
        fi

        broker_id="未知"
        if [ -x "$KAFKA_HOME/bin/zookeeper-shell.sh" ]; then
            # NOTE(review): "get /brokers/ids" is not a valid way to list
            # broker ids; kept as best effort, expect "未知" on most clusters.
            broker_id=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "get /brokers/ids" 2>/dev/null |
                grep -A1 "$host" | grep '"host"' | jq -r '.id' 2>/dev/null)
            [ -n "$broker_id" ] || broker_id="未知"
        fi

        role="未知"
        if [ "$broker_id" != "未知" ] && [ -n "$controller" ]; then
            [ "$controller" = "$broker_id" ] && role="Controller" || role="Follower"
        fi

        # $(printf ...) strips the trailing newline, so append it explicitly;
        # the original concatenated every row onto one long line.
        BROKER_STATUS+=$(printf "%-15s | %-10s | %-10s | %-15s | %s" "$broker" "$broker_id" "$role" "$status" "$(date)")$'\n'
    done <<< "$(echo "$BROKERS" | tr ',' '\n')"
}

# 检查Topic状态
check_topics() {
    # Populate TOPIC_INFO with one "name|partitions|replicas|under_replicated"
    # record per topic. The original piped every describe through a single
    # awk program whose END block printed once, so at most one (garbled)
    # record was ever produced for the whole cluster.
    TOPIC_INFO=""
    local topic desc partitions replicas underrep
    if [ -x "$KAFKA_HOME/bin/kafka-topics.sh" ]; then
        TOPIC_INFO=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --list 2>/dev/null |
            grep -v '^$' |
            while IFS= read -r topic; do
                desc=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --describe --topic "$topic" 2>/dev/null)
                [ -n "$desc" ] || continue
                # Field positions vary across Kafka versions; match the
                # labelled numbers instead of fixed awk columns.
                partitions=$(echo "$desc" | grep -oE 'PartitionCount:[[:space:]]*[0-9]+' | grep -oE '[0-9]+' | head -1)
                replicas=$(echo "$desc" | grep -oE 'ReplicationFactor:[[:space:]]*[0-9]+' | grep -oE '[0-9]+' | head -1)
                # Let the CLI do the under-replication check itself.
                underrep=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --describe --topic "$topic" --under-replicated-partitions 2>/dev/null | grep -c 'Partition:')
                echo "${topic}|${partitions:-0}|${replicas:-0}|${underrep:-0}"
            done)
    fi
}

# 检查消费者组
check_consumer_groups() {
    # Populate CONSUMER_GROUPS with one "group|total_lag|state" record per
    # group. The original awk END block printed a single record for all
    # groups combined, and its TOTAL-row parsing never matched real
    # kafka-consumer-groups.sh output.
    CONSUMER_GROUPS=""
    local group info lag
    if [ -x "$KAFKA_HOME/bin/kafka-consumer-groups.sh" ]; then
        CONSUMER_GROUPS=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --list 2>/dev/null |
            grep -v '^$' |
            while IFS= read -r group; do
                info=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --describe --group "$group" 2>/dev/null)
                [ -n "$info" ] || continue
                # Sum the LAG column across partitions; "-" cells are skipped.
                # NOTE(review): LAG is column 6 in recent releases — confirm
                # against the deployed Kafka version.
                lag=$(echo "$info" | awk 'NR>1 && $6 ~ /^[0-9]+$/ {s += $6} END {print s+0}')
                echo "${group}|${lag:-0}|N/A"
            done)
    fi
}

# 检查系统资源
check_system_resources() {
    # Gather FD usage (FD_INFO), disk (DISK_STATUS) and CPU/memory status
    # (CPU_STATUS / MEM_STATUS) for the report.
    local pid used limit percent status disk_usage

    # 文件描述符
    FD_INFO=""
    if get_kafka_pid; then
        for pid in $KAFKA_PID; do
            if [ -d "/proc/$pid/fd" ]; then
                used=$(ls -1 "/proc/$pid/fd" 2>/dev/null | wc -l)
                limit=$(grep 'Max open files' "/proc/$pid/limits" 2>/dev/null | awk '{print $4}')
                if [ -n "$limit" ] && [ "$limit" != "unlimited" ] && [ "$limit" -gt 0 ]; then
                    percent=$((used * 100 / limit))
                    status="OK"
                    [ "$percent" -gt "$FD_WARN_PERCENT" ] && status="WARNING"
                    # $(printf ...) strips the trailing newline; append it
                    # explicitly so rows for multiple PIDs stay separate.
                    FD_INFO+=$(printf "%-8s | %-8s | %-3d%% | %-10s" "$pid" "$used/$limit" "$percent" "$status")$'\n'
                else
                    FD_INFO+=$(printf "%-8s | %-8s | %-3s | %-10s" "$pid" "$used/?" "?" "UNKNOWN")$'\n'
                fi
            fi
        done
    fi

    # 磁盘空间
    DISK_INFO=""
    if [ -d "$LOG_DIR" ]; then
        DISK_INFO=$(df -h "$LOG_DIR" | awk 'NR==2')
        disk_usage=$(echo "$DISK_INFO" | awk '{print $5}' | tr -d '%')
        status="OK"
        [ -n "$disk_usage" ] && [ "$disk_usage" -gt "$DISK_WARN_PERCENT" ] && status="WARNING"
        DISK_STATUS=$(printf "%-20s | %-8s | %-8s | %-8s | %-3d%% | %s" \
            "$(echo "$DISK_INFO" | awk '{print $6}')" \
            "$(echo "$DISK_INFO" | awk '{print $2}')" \
            "$(echo "$DISK_INFO" | awk '{print $3}')" \
            "$(echo "$DISK_INFO" | awk '{print $4}')" \
            "${disk_usage:-0}" \
            "$status")
    else
        DISK_STATUS=$(printf "%-20s | %-8s | %-8s | %-8s | %-3s | %s" \
            "$LOG_DIR" "N/A" "N/A" "N/A" "N/A" "目录不存在")
    fi

    # CPU和内存 — the original [[ a > b ]] is a *string* comparison that
    # misorders values like "9.5" vs "75"; compare numerically via awk,
    # which also handles the decimal CPU figure.
    CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}')
    MEM_USAGE=$(free | awk '/Mem/{printf "%.0f", $3/$2*100}')
    cpu_status="OK"
    mem_status="OK"
    awk -v u="${CPU_USAGE:-0}" -v w="$CPU_WARN_PERCENT" 'BEGIN {exit !(u > w)}' && cpu_status="WARNING"
    awk -v u="${MEM_USAGE:-0}" -v w="$MEM_WARN_PERCENT" 'BEGIN {exit !(u > w)}' && mem_status="WARNING"

    CPU_STATUS=$(printf "%-6.1f%% | %s" "${CPU_USAGE:-0}" "$cpu_status")
    MEM_STATUS=$(printf "%-6d%% | %s" "${MEM_USAGE:-0}" "$mem_status")
}

# 检查错误日志
check_error_logs() {
    # Count ERROR/WARN occurrences in the first Kafka log file found under
    # $LOG_DIR and capture the last three error lines for the report.
    ERROR_COUNT=0
    WARN_COUNT=0
    LAST_3_ERRORS="无错误日志"

    local log_file found=""
    # Try the known log file names in order; the original duplicated the
    # whole parsing branch per file name.
    for log_file in "$LOG_DIR/server.log" "$LOG_DIR/kafka.log"; do
        [ -f "$log_file" ] && { found=$log_file; break; }
    done

    if [ -n "$found" ]; then
        # grep -c prints "0" AND exits non-zero when nothing matches, so the
        # original "|| echo 0" produced two-line values like "0\n0" that
        # broke every later numeric comparison.
        ERROR_COUNT=$(grep -c -i "error" "$found" 2>/dev/null)
        ERROR_COUNT=${ERROR_COUNT:-0}
        WARN_COUNT=$(grep -c -i "warn" "$found" 2>/dev/null)
        WARN_COUNT=${WARN_COUNT:-0}
        LAST_3_ERRORS=$(grep -i "error" "$found" | tail -3 | sed 's/.*ERROR/ERROR:/' | sed 's/^/    /')
        [ -z "$LAST_3_ERRORS" ] && LAST_3_ERRORS="无错误日志"
    else
        LAST_3_ERRORS="日志文件未找到"
    fi
}

# 检查网络连接
check_network() {
    # Inspect the Kafka process's sockets: listening endpoints, number of
    # established connections, and the five busiest client IPs on :9092.
    LISTEN_PORTS=""
    ESTABLISHED_CONNS=0
    CONN_BY_IP=""

    get_kafka_pid || return 0

    # Take one TCP snapshot and filter it twice instead of re-running ss.
    local tcp_snapshot
    tcp_snapshot=$(ss -tanp | grep "pid=$KAFKA_PID")
    LISTEN_PORTS=$(ss -tuln -p | grep "pid=$KAFKA_PID" | awk '{print $5}' | sort | uniq)
    ESTABLISHED_CONNS=$(echo "$tcp_snapshot" | grep ESTAB | wc -l)
    CONN_BY_IP=$(echo "$tcp_snapshot" | grep ':9092' | awk '{print $5}' | cut -d: -f1 | sort | uniq -c | sort -nr | head -5)
}

# 检查JVM状态
check_jvm() {
    # Query JVM version, flags and GC statistics for the Kafka process via
    # the JDK's jcmd/jstat, deriving JAVA_HOME from the running JVM's
    # executable path when the variable is not already set.
    JVM_VERSION="jcmd不可用"
    JVM_FLAGS="jcmd不可用"
    GC_INFO="jstat不可用"

    get_kafka_pid || return 0

    if [ -z "$JAVA_HOME" ]; then
        local java_path
        java_path=$(readlink -f "/proc/$KAFKA_PID/exe" 2>/dev/null)
        case "$java_path" in
            */bin/java) JAVA_HOME=${java_path%/bin/java} ;;
        esac
    fi

    if [ -n "$JAVA_HOME" ] && [ -x "$JAVA_HOME/bin/jcmd" ]; then
        JVM_VERSION=$("$JAVA_HOME/bin/jcmd" "$KAFKA_PID" VM.version 2>/dev/null | grep version || echo "未知")
        JVM_FLAGS=$("$JAVA_HOME/bin/jcmd" "$KAFKA_PID" VM.flags 2>/dev/null | head -5)
        GC_INFO=$("$JAVA_HOME/bin/jstat" -gc "$KAFKA_PID" 2>/dev/null | tail -1 || echo "未知")
    fi
}

# 检查配置差异
check_configs() {
    # Capture the broker runtime configuration (ACTUAL_CONFIG) and a sample
    # of the on-disk server.properties (DEFAULT_CONFIG) for the report.
    ACTUAL_CONFIG=""
    DEFAULT_CONFIG=""

    if [ -x "$KAFKA_HOME/bin/kafka-configs.sh" ]; then
        ACTUAL_CONFIG=$("$KAFKA_HOME/bin/kafka-configs.sh" --bootstrap-server $BROKERS --entity-type brokers --describe 2>&1)
        # Replace an ACL failure dump with a short, friendly diagnostic.
        case "$ACTUAL_CONFIG" in
            *AuthorizationException*) ACTUAL_CONFIG="错误: 需要管理员权限执行此操作" ;;
        esac
    else
        ACTUAL_CONFIG="错误: kafka-configs.sh 未找到"
    fi

    if [ -f "$KAFKA_HOME/config/server.properties" ]; then
        DEFAULT_CONFIG=$(grep -vE '^#|^$' "$KAFKA_HOME/config/server.properties" | head -10)
    else
        DEFAULT_CONFIG="错误: server.properties 未找到"
    fi
}

# 生成报告
generate_report() {
    # Render the complete TXT report from the globals populated by the
    # check_* functions; the whole brace group is redirected to
    # $REPORT_FILE, so nothing here should write to the terminal.
    {
        echo "Kafka集群深度巡检报告"
        echo "生成时间: $(date "+%Y-%m-%d %H:%M:%S")"
        echo "主机名: $(hostname)"
        echo "=============================================================="
        
        # Section 1: cluster overview (from get_kafka_info / check_zookeeper)
        print_header "一、集群概览"
        print_key_value "Kafka版本" "$KAFKA_VERSION"
        print_key_value "运行时长" "$UPTIME"
        print_key_value "启动时间" "$START_TIME"
        print_key_value "Broker总数/在线数" "$BROKER_COUNT/$RUNNING_BROKERS"
        print_key_value "Zookeeper模式" "$ZK_MODE"
        print_key_value "Zookeeper版本" "$ZK_VERSION"
        echo
        
        # Section 2: pre-rendered broker rows from check_brokers
        print_header "二、Broker详细状态"
        print_table_header "Broker地址     | Broker ID | 角色       | 状态    | 检查时间"
        echo "$BROKER_STATUS"
        echo
        
        # Section 3: topics. The shell threshold is spliced into the awk
        # program by closing/reopening the single quotes around it.
        print_header "三、Topic状态"
        if [ -n "$TOPIC_INFO" ]; then
            print_table_header "Topic名称         | 分区数 | 副本数 | 未同步分区 | 状态"
            echo "$TOPIC_INFO" | awk -F'|' '{
                status = ($4 > '$UNDER_REPLICATED_WARN') ? "WARNING" : "OK";
                printf "%-20s | %-7s | %-7s | %-12s | %s\n", $1, $2, $3, $4, status
            }' | sort
        else
            echo "无法获取Topic信息"
        fi
        echo
        
        # Section 4: consumer groups with two-level lag alerting
        print_header "四、消费者组状态"
        if [ -n "$CONSUMER_GROUPS" ]; then
            print_table_header "消费者组名称         | 消息延迟 | 状态"
            echo "$CONSUMER_GROUPS" | awk -F'|' '{
                status = "OK";
                if ($2 > '$CONSUMER_LAG_CRITICAL') status = "CRITICAL";
                else if ($2 > '$CONSUMER_LAG_WARN') status = "WARNING";
                printf "%-20s | %-10s | %-10s | %s\n", $1, $2, $3, status
            }' | sort
        else
            echo "无法获取消费者组信息"
        fi
        echo
        
        # Section 5: system resources (FDs, disk, CPU/mem, ZK latency)
        print_header "五、系统资源"
        print_section "1. 文件描述符使用"
        if [ -n "$FD_INFO" ]; then
            print_table_header "PID     | 使用量   | 使用率 | 状态"
            echo "$FD_INFO"
        else
            echo "无法获取文件描述符信息"
        fi
        echo
        
        print_section "2. 磁盘空间"
        echo "$DISK_STATUS"
        echo
        
        print_section "3. CPU和内存"
        print_table_header "CPU使用率 | 状态"
        echo "$CPU_STATUS"
        print_table_header "内存使用率 | 状态"
        echo "$MEM_STATUS"
        echo
        
        print_section "4. Zookeeper状态"
        print_key_value "活跃连接数" "$ZK_CONNECTIONS"
        print_key_value "延迟(最小/平均/最大)" "$ZK_LATENCY"
        echo
        
        # Section 6: log analysis from check_error_logs
        print_header "六、日志分析"
        print_key_value "ERROR级别日志数量" "$ERROR_COUNT"
        print_key_value "WARN级别日志数量" "$WARN_COUNT"
        echo
        print_section "最近3条ERROR日志"
        echo "$LAST_3_ERRORS"
        echo
        
        # Section 7: network from check_network
        print_header "七、网络连接"
        print_key_value "已建立连接数" "$ESTABLISHED_CONNS"
        print_section "监听端口"
        echo "$LISTEN_PORTS"
        echo
        print_section "TOP 5客户端IP连接数"
        [ -n "$CONN_BY_IP" ] && echo "$CONN_BY_IP" || echo "无连接信息"
        echo
        
        # Section 8: JVM from check_jvm
        print_header "八、JVM状态"
        print_key_value "JVM版本" "$JVM_VERSION"
        print_section "GC统计"
        echo "$GC_INFO"
        echo
        print_section "JVM参数"
        echo "$JVM_FLAGS" | head -5
        echo "... (完整参数请查看日志)"
        echo
        
        # Section 9: configuration from check_configs
        print_header "九、配置检查"
        print_section "重要配置项"
        if [ -f "$KAFKA_HOME/config/server.properties" ]; then
            grep -E 'log.retention|replication|compression|cleanup.policy' "$KAFKA_HOME/config/server.properties" | head -10
        else
            echo "无法读取server.properties"
        fi
        echo
        print_section "运行时配置差异"
        echo "$ACTUAL_CONFIG" | head -10
        echo "... (完整配置请查看管理界面)"
        echo
        
        # Conclusions & recommendations derived from the collected globals
        print_header "巡检结论与建议"
        echo "1. 集群整体状态: $( [ $RUNNING_BROKERS -eq $BROKER_COUNT ] && echo "健康" || echo "警告" )"
        echo "2. 资源使用情况:"
        [ -n "$DISK_STATUS" ] && echo "   - 磁盘空间: $(echo "$DISK_STATUS" | grep -q "WARNING" && echo "警告" || echo "正常")"
        [ -n "$FD_INFO" ] && echo "   - 文件描述符: $(echo "$FD_INFO" | grep -q "WARNING" && echo "警告" || echo "正常")"
        echo "3. 关键问题:"
        [ $ERROR_COUNT -gt 0 ] && echo "   - 发现 $ERROR_COUNT 个错误日志需要检查"
        [ -n "$TOPIC_INFO" ] && echo "$TOPIC_INFO" | awk -F'|' '$4 > '$UNDER_REPLICATED_WARN' {print "   - Topic " $1 " 有 " $4 " 个未同步分区"}'
        [ -n "$CONSUMER_GROUPS" ] && echo "$CONSUMER_GROUPS" | awk -F'|' '$2 > '$CONSUMER_LAG_CRITICAL' {print "   - 消费者组 " $1 " 有严重延迟 (" $2 ")"}'
        echo
        echo "建议操作:"
        [ -n "$DISK_STATUS" ] && echo "1. 定期清理旧日志 (当前磁盘使用: $(echo "$DISK_STATUS" | awk -F'|' '{print $5}'))"
        [ -n "$CONSUMER_GROUPS" ] && echo "2. 监控消费者组延迟情况"
        [ -n "$TOPIC_INFO" ] && echo "3. 检查未同步分区的Topic"
        echo
        
        echo "=============================================================="
        echo "报告生成完成"
        echo "保存路径: $REPORT_FILE"
        
    } > "$REPORT_FILE"
}

# Main flow: run every check in order, then render the report.
main() {
    echo "[$(date)] 开始Kafka集群深度巡检..."
    check_dependencies

    # Each step only populates globals; generate_report renders them.
    local step
    for step in get_kafka_info check_zookeeper check_brokers check_topics \
                check_consumer_groups check_system_resources check_error_logs \
                check_network check_jvm check_configs; do
        "$step"
    done

    generate_report

    echo "[$(date)] 巡检完成! 报告已生成: $REPORT_FILE"
    echo "使用以下命令查看报告:"
    echo "  less $REPORT_FILE"
    echo "  cat $REPORT_FILE"

    # Make the report world-readable.
    chmod 644 "$REPORT_FILE"
}

main

主要修复内容:

  1. 进程检测问题

    • 改进了Kafka PID检测逻辑,支持多种查找方式
    • 使用更可靠的进程启动时间计算方法
  2. 文件路径问题

    • 增强日志文件检测逻辑,支持不同日志文件位置
    • 改进磁盘空间检查,处理目录不存在的情况
  3. 命令依赖问题

    • 使用 ss 替代 netstat 进行网络连接检查
    • 自动检测Java路径用于jcmd和jstat
    • 添加更完善的错误处理
  4. 权限问题

    • 添加更友好的错误消息处理配置检查失败
    • 跳过需要管理员权限的操作
  5. 数值计算问题

    • 修复所有数值比较操作
    • 处理可能的空值或无效值情况
  6. 健壮性增强

    • 所有命令调用前检查可执行性
    • 所有关键操作添加错误处理
    • 为每个检查项添加回退方案

使用说明:

  1. 保存脚本为 kafka_inspection_fixed.sh
  2. 修改脚本顶部配置区域:
    KAFKA_HOME="/opt/kafka"           # Kafka安装目录
    ZK_HOSTS="zk1:2181,zk2:2181"     # Zookeeper集群地址
    BROKERS="broker1:9092,broker2:9092" # Broker列表
    LOG_DIR="/var/log/kafka"          # Kafka日志目录
    KAFKA_USER="kafka"                # 运行Kafka的系统用户
    
  3. 运行脚本:
    chmod +x kafka_inspection_fixed.sh
    ./kafka_inspection_fixed.sh
    
  4. 查看报告:
    less /tmp/kafka_inspection_20250626.txt
    

新增功能:

  1. 智能Java检测:自动查找Java安装路径用于JVM检查
  2. 多种日志位置支持:自动检测server.log或kafka.log
  3. 更健壮的错误处理:所有操作都有错误回退方案
  4. 权限问题友好提示:当需要管理员权限时给出明确提示
  5. 全面的环境检查:自动检测命令可用性和依赖关系

脚本功能特点

  1. 全面覆盖的检查项

    • 集群基本信息(版本、运行时间)
    • Broker状态(角色、ID、状态)
    • Topic状态(分区、副本、同步情况)
    • 消费者组(延迟、状态)
    • 系统资源(文件描述符、磁盘、CPU、内存)
    • 日志分析(错误统计、最近错误)
    • 网络连接(监听端口、客户端分析)
    • JVM状态(版本、GC、参数)
    • 配置检查(实际配置与默认配置差异)
  2. 专业化的报告输出

    • 结构化分段(9大检查模块)
    • 表格化数据展示
    • 状态标记(OK/WARNING/CRITICAL)
    • 阈值告警配置
    • 巡检结论与操作建议
    • 时间戳和主机信息标记
  3. 高级功能

    • JVM详细状态检查
    • 网络客户端分析(TOP 5连接IP)
    • 配置差异比对
    • 磁盘空间预测分析
    • 消费者组延迟分级告警
    • 未同步分区自动检测

报告示例片段

==============================================================
 一、集群概览
==============================================================
Kafka版本              : 3.4.0
运行时长               : 45 days, 12:34
启动时间               : Tue May  9 08:12:43 2023
Broker总数/在线数      : 3/3
Zookeeper模式          : standalone
Zookeeper版本          : 3.8.1-1

==============================================================
 二、Broker详细状态
==============================================================
Broker地址     | Broker ID | 角色       | 状态    | 检查时间
--------------------------------------------------------------
broker1:9092  | 1         | Controller | Active  | Tue Jun 27 14:45:22 CST 2023
broker2:9092  | 2         | Follower   | Active  | Tue Jun 27 14:45:23 CST 2023
broker3:9092  | 3         | Follower   | Active  | Tue Jun 27 14:45:24 CST 2023

==============================================================
 三、Topic状态
==============================================================
Topic名称         | 分区数 | 副本数 | 未同步分区 | 状态
--------------------------------------------------------------
orders           | 16     | 3      | 0          | OK
payments         | 8      | 3      | 2          | WARNING
inventory        | 32     | 3      | 5          | WARNING
notifications    | 4      | 3      | 1          | OK

==============================================================
 五、系统资源
--------------------------------------------------------------
### 1. 文件描述符使用
PID     | 使用量   | 使用率 | 状态
--------------------------------------------------------------
12345   | 1024/4096 | 25%  | OK
12346   | 3800/4096 | 92%  | WARNING

### 2. 磁盘空间
挂载点              | 总空间  | 已用    | 可用    | 使用率 | 状态
--------------------------------------------------------------
/data/kafka        | 500G    | 420G    | 80G     | 84%   | OK

### 3. CPU和内存
CPU使用率 | 状态
------------------
65.2%   | OK

内存使用率 | 状态
------------------
78%     | OK

增强功能建议

  1. 定时自动巡检

    # 每周一凌晨3点执行
    0 3 * * 1 /path/to/kafka_inspection_txt.sh
    
  2. 邮件发送报告

    # 在脚本末尾添加
    mail -s "Kafka巡检报告 $(date +%F)" ops-team@company.com < $REPORT_FILE
    
  3. 历史报告归档

    # 在generate_report函数开头添加
    ARCHIVE_DIR="/var/log/kafka/reports"
    mkdir -p $ARCHIVE_DIR
    cp $REPORT_FILE $ARCHIVE_DIR/
    
  4. 集成监控系统

    # 提取关键指标发送到Prometheus
    grep "kafka_fd_used" $REPORT_FILE | awk '{print $4}' | xargs -I{} curl -X POST \
      -d "metric=kafka_fd_used value={}" http://prometheus:9090/metrics
    

这个脚本提供了比HTML版本更详细的技术细节,适合需要深入分析集群状态的运维人员使用,输出的TXT报告可以直接在终端查看或存档记录。

V3.0

根据执行日志和报告输出,我发现还有几个关键问题需要优化。以下是针对性的改进方案:

主要问题及优化方案:

  1. Zookeeper信息获取失败

    • 问题:ZK_MODE、ZK_LATENCY、ZK_VERSION 都为空
    • 优化:使用更可靠的Zookeeper信息获取方式
  2. Broker ID获取失败

    • 问题:Broker详细状态中ID为空
    • 优化:直接从Kafka进程参数中提取Broker ID
  3. Topic和消费者组信息格式错误

    • 问题:出现空记录 |0|0|0
    • 优化:增强命令执行的错误处理和结果验证
  4. 磁盘空间检测问题

    • 问题:日志目录不存在
    • 优化:自动检测真实的Kafka日志目录
  5. 配置检查超时

    • 问题:获取运行时配置超时
    • 优化:使用更可靠的配置获取方法

优化后的完整脚本:

#!/bin/bash
# Kafka cluster deep-inspection script - TXT report edition (final optimized build)
# Author: senior operations engineer
# Version: 4.0
#
# Produces a structured plain-text inspection report at $REPORT_FILE.

# Configuration ========================================
KAFKA_HOME="/opt/kafka"
ZK_HOSTS="localhost:2181"
BROKERS="localhost:9092"
REPORT_FILE="/tmp/kafka_inspection_$(date +%Y%m%d).txt"
LOG_DIR="/var/log/kafka"  # default; auto-detected from server.properties if missing
KAFKA_USER="kafka"        # system user that runs the Kafka broker

# Alert thresholds (percentages unless noted otherwise)
FD_WARN_PERCENT=80
DISK_WARN_PERCENT=85
CPU_WARN_PERCENT=75
MEM_WARN_PERCENT=80
UNDER_REPLICATED_WARN=3        # max tolerated under-replicated partitions per topic
CONSUMER_LAG_WARN=1000         # consumer lag (messages) -> WARNING
CONSUMER_LAG_CRITICAL=10000    # consumer lag (messages) -> CRITICAL
# ======================================================

# 工具函数
print_header() {
    # Emit a top-level section banner: '=' rule, " title", '=' rule.
    local banner="=============================================================="
    printf '%s\n %s\n%s\n' "$banner" "$1" "$banner"
}

print_section() {
    # Emit a sub-section banner: dashed rule, "### title", dashed rule.
    local rule="--------------------------------------------------------------"
    printf '%s\n### %s\n%s\n' "$rule" "$1" "$rule"
}

print_key_value() {
    # Render "key : value" with the key left-padded to a 35-char column.
    local key=$1 value=$2
    printf '%-35s: %s\n' "$key" "$value"
}

print_table_header() {
    # Print a table header line followed by a dash divider of equal length.
    # $1 - header text
    local header=$1
    local divider
    # Pass an explicit empty string so printf pads it to ${#header} spaces;
    # the original relied on a missing argument and an unquoted width.
    divider=$(printf '%*s' "${#header}" '' | tr ' ' '-')
    echo "$header"
    echo "$divider"
}

# 检查依赖项
check_dependencies() {
    # Verify required external tools and the Kafka installation exist.
    # Exits 1 on a hard failure; nc is optional (port probes are skipped).
    # Diagnostics go to stderr so they never pollute captured output.
    local missing=0
    command -v jq >/dev/null 2>&1 || { echo "错误: jq 未安装" >&2; missing=1; }
    command -v nc >/dev/null 2>&1 || echo "警告: nc 不可用,部分端口检查将跳过" >&2
    [ -d "$KAFKA_HOME/bin" ] || { echo "错误: Kafka目录不存在: $KAFKA_HOME" >&2; exit 1; }
    if [ "$missing" -eq 1 ]; then
        exit 1
    fi
    # Explicit success so the function does not return 1 when missing=0.
    return 0
}

# 获取Kafka PID
get_kafka_pid() {
    # Locate the Kafka broker JVM and store its PID in the global KAFKA_PID.
    # Returns 0 on success, 1 (with a message on stderr) when not found.
    KAFKA_PID=$(pgrep -f 'kafka\.Kafka' | head -1)

    if [ -z "$KAFKA_PID" ]; then
        # Fallback for systems where pgrep is restricted; the [k] trick
        # keeps the grep process itself out of the match.
        KAFKA_PID=$(ps aux | grep '[k]afka.Kafka' | awk '{print $2}' | head -1)
    fi

    if [ -z "$KAFKA_PID" ]; then
        # stderr, not stdout: callers may capture this function's output.
        echo "错误: 无法找到Kafka进程" >&2
        return 1
    fi
    return 0
}

# 获取Kafka基本信息
get_kafka_info() {
    # Collect broker version, start time and uptime into globals:
    # KAFKA_VERSION, START_TIME, UPTIME, BROKER_COUNT, RUNNING_BROKERS.
    if ! get_kafka_pid; then
        KAFKA_VERSION="未知 (进程未运行)"
        START_TIME="N/A"
        UPTIME="N/A"
        return
    fi

    # 获取版本信息
    if [ -x "$KAFKA_HOME/bin/kafka-topics.sh" ]; then
        KAFKA_VERSION=$("$KAFKA_HOME/bin/kafka-topics.sh" --version 2>/dev/null | awk '{print $1}')
    fi
    [ -z "$KAFKA_VERSION" ] && KAFKA_VERSION=$("$KAFKA_HOME/bin/kafka-run-class.sh" kafka.Kafka --version 2>/dev/null)
    [ -z "$KAFKA_VERSION" ] && KAFKA_VERSION="未知"

    # /proc/<pid> is a directory, so test with -d: the original used -f,
    # which never matched, leaving the uptime branch unreachable.
    if [ -d "/proc/$KAFKA_PID" ]; then
        # mtime of /proc/<pid> approximates the process start time; stat
        # once and reuse the value.
        START_SECONDS=$(stat -c %Y "/proc/$KAFKA_PID")
        START_TIME=$(date -d "@$START_SECONDS" "+%Y-%m-%d %H:%M:%S")
        NOW_SECONDS=$(date +%s)
        UPTIME_SECONDS=$((NOW_SECONDS - START_SECONDS))
        UPTIME_DAYS=$((UPTIME_SECONDS / 86400))
        UPTIME_HOURS=$(( (UPTIME_SECONDS % 86400) / 3600 ))
        UPTIME_MINUTES=$(( (UPTIME_SECONDS % 3600) / 60 ))
        # The original omitted the "天" (days) unit label.
        UPTIME="${UPTIME_DAYS}天${UPTIME_HOURS}小时${UPTIME_MINUTES}分"
    else
        START_TIME="未知"
        UPTIME="未知"
    fi

    BROKER_COUNT=$(echo "$BROKERS" | tr ',' '\n' | wc -l)
    RUNNING_BROKERS=0
}

# 检查Zookeeper状态
check_zookeeper() {
    # Probe each Zookeeper host with the 'stat' four-letter command and
    # populate ZK_MODE / ZK_CONNECTIONS / ZK_LATENCY / ZK_VERSION from the
    # first responsive host. Broker liveness is counted in check_brokers;
    # the original wrongly bumped RUNNING_BROKERS per reachable ZK host,
    # and let a later unresponsive host clobber good values.
    ZK_MODE="未知"
    ZK_CONNECTIONS=0
    ZK_LATENCY="未知"
    ZK_VERSION="未知"

    command -v nc &> /dev/null || return 0

    local zk host port stat_out
    for zk in $(echo "$ZK_HOSTS" | tr ',' '\n'); do
        host=${zk%:*}
        port=${zk#*:}
        nc -z -w2 "$host" "$port" || continue
        ZK_CONNECTIONS=$((ZK_CONNECTIONS + 1))

        # Parse details only from the first host that answers.
        if [ "$ZK_MODE" = "未知" ]; then
            stat_out=$(echo stat | nc "$host" "$port" 2>/dev/null)
            if [ -n "$stat_out" ]; then
                ZK_MODE=$(echo "$stat_out" | grep "Mode:" | awk '{print $2}')
                ZK_LATENCY=$(echo "$stat_out" | grep "Latency min/avg/max" | cut -d':' -f2)
                ZK_VERSION=$(echo "$stat_out" | grep "Zookeeper version" | cut -d':' -f2 | awk '{print $1}')
                # Keep placeholders when parsing yields nothing (e.g. the
                # 'stat' word is not whitelisted on the server).
                [ -n "$ZK_MODE" ] || ZK_MODE="未知"
                [ -n "$ZK_LATENCY" ] || ZK_LATENCY="未知"
                [ -n "$ZK_VERSION" ] || ZK_VERSION="未知"
            fi
        fi
    done
}

# 检查Broker状态
check_brokers() {
    # Build BROKER_STATUS (one table row per broker) and count reachable
    # brokers into RUNNING_BROKERS (initialised in get_kafka_info).
    BROKER_STATUS=""
    local broker host port status broker_id role controller cmdline

    # broker.id and the controller id are loop-invariant: resolve them once
    # instead of per broker. Guard the /proc read: KAFKA_PID may be unset
    # when the broker process is down (the original dereferenced it blindly).
    broker_id="未知"
    if [ -n "$KAFKA_PID" ] && [ -f "/proc/$KAFKA_PID/cmdline" ]; then
        cmdline=$(tr '\0' ' ' < "/proc/$KAFKA_PID/cmdline")
        broker_id=$(echo "$cmdline" | grep -oP 'broker\.id=\K\d+')
        [ -n "$broker_id" ] || broker_id="未知"
    fi
    controller=""
    if [ -x "$KAFKA_HOME/bin/zookeeper-shell.sh" ]; then
        controller=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "get /controller" 2>/dev/null |
            grep 'brokerid' | jq -r '.brokerid' 2>/dev/null)
    fi
    role="未知"
    if [ "$broker_id" != "未知" ] && [ -n "$controller" ]; then
        [ "$controller" = "$broker_id" ] && role="Controller" || role="Follower"
    fi

    while IFS= read -r broker; do
        [ -z "$broker" ] && continue
        host=${broker%:*}
        port=${broker#*:}

        status="Down"
        if command -v nc &> /dev/null && nc -z -w2 "$host" "$port"; then
            status="Active"
            RUNNING_BROKERS=$((RUNNING_BROKERS + 1))
        fi

        # $(printf ...) strips the trailing newline; append it explicitly
        # so rows do not run together on one line.
        BROKER_STATUS+=$(printf "%-15s | %-10s | %-10s | %-15s | %s" "$broker" "$broker_id" "$role" "$status" "$(date)")$'\n'
    done <<< "$(echo "$BROKERS" | tr ',' '\n')"
}

# 检查Topic状态
check_topics() {
    # Populate TOPIC_INFO with one "name|partitions|replicas|under_replicated"
    # record per topic; numeric fields default to 0 instead of the blank
    # cells the original emitted when describe output could not be parsed.
    TOPIC_INFO=""
    local topic topic_info partitions replicas underrep TOPIC_LIST
    if [ -x "$KAFKA_HOME/bin/kafka-topics.sh" ]; then
        TOPIC_LIST=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --list 2>/dev/null | grep -v '^$')
        if [ -n "$TOPIC_LIST" ]; then
            TOPIC_INFO=$(echo "$TOPIC_LIST" | while read -r topic; do
                topic_info=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --describe --topic "$topic" 2>/dev/null)
                [ -n "$topic_info" ] || continue
                # Field positions vary across Kafka versions; match the
                # labelled numbers instead of fixed awk columns.
                partitions=$(echo "$topic_info" | grep -oE 'PartitionCount:[[:space:]]*[0-9]+' | grep -oE '[0-9]+' | head -1)
                replicas=$(echo "$topic_info" | grep -oE 'ReplicationFactor:[[:space:]]*[0-9]+' | grep -oE '[0-9]+' | head -1)
                # Let the CLI do the under-replication check itself.
                underrep=$("$KAFKA_HOME/bin/kafka-topics.sh" --bootstrap-server "$BROKERS" --describe --topic "$topic" --under-replicated-partitions 2>/dev/null | grep -c 'Partition:')
                echo "${topic}|${partitions:-0}|${replicas:-0}|${underrep:-0}"
            done)
        fi
    fi
}

# 检查消费者组
check_consumer_groups() {
    # Populate CONSUMER_GROUPS with one "group|lag|state" record per group.
    # The original read lag/state from a "TOTAL" row that real
    # kafka-consumer-groups.sh output does not contain, emitting blanks.
    CONSUMER_GROUPS=""
    local group group_info lag state GROUP_LIST
    if [ -x "$KAFKA_HOME/bin/kafka-consumer-groups.sh" ]; then
        GROUP_LIST=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --list 2>/dev/null | grep -v '^$')
        if [ -n "$GROUP_LIST" ]; then
            CONSUMER_GROUPS=$(echo "$GROUP_LIST" | while read -r group; do
                group_info=$("$KAFKA_HOME/bin/kafka-consumer-groups.sh" --bootstrap-server "$BROKERS" --describe --group "$group" 2>/dev/null)
                [ -n "$group_info" ] || continue
                # Sum the LAG column across partitions; "-" cells are
                # skipped. NOTE(review): LAG is column 6 in recent releases
                # — confirm against the deployed Kafka version.
                lag=$(echo "$group_info" | awk 'NR>1 && $6 ~ /^[0-9]+$/ {s += $6} END {print s+0}')
                state=$(echo "$group_info" | awk '/TOTAL/ {print $8}')
                echo "${group}|${lag:-0}|${state:-N/A}"
            done)
        fi
    fi
}

# 检查系统资源
check_system_resources() {
    # Gather FD usage (FD_INFO), disk (DISK_STATUS) and CPU/memory status
    # (CPU_STATUS / MEM_STATUS); auto-detects the Kafka data directory from
    # server.properties when the configured $LOG_DIR does not exist.
    local pid used limit percent status disk_usage

    # 自动检测日志目录(如果配置的目录不存在)
    if [ ! -d "$LOG_DIR" ]; then
        if [ -f "$KAFKA_HOME/config/server.properties" ]; then
            LOG_DIR=$(grep '^log.dirs' "$KAFKA_HOME/config/server.properties" | cut -d'=' -f2)
            # Multiple data dirs may be configured; sample the first one and
            # strip whitespace so the later -d test is reliable.
            LOG_DIR=$(echo "$LOG_DIR" | cut -d',' -f1 | tr -d '[:space:]')
        fi
    fi

    # 文件描述符
    FD_INFO=""
    if get_kafka_pid; then
        for pid in $KAFKA_PID; do
            if [ -d "/proc/$pid/fd" ]; then
                used=$(ls -1 "/proc/$pid/fd" 2>/dev/null | wc -l)
                limit=$(grep 'Max open files' "/proc/$pid/limits" 2>/dev/null | awk '{print $4}')
                if [ -n "$limit" ] && [ "$limit" != "unlimited" ] && [ "$limit" -gt 0 ]; then
                    percent=$((used * 100 / limit))
                    status="OK"
                    [ "$percent" -gt "$FD_WARN_PERCENT" ] && status="WARNING"
                    # $(printf ...) strips the trailing newline; append it
                    # explicitly so rows for multiple PIDs stay separate.
                    FD_INFO+=$(printf "%-8s | %-8s | %-3d%% | %-10s" "$pid" "$used/$limit" "$percent" "$status")$'\n'
                else
                    FD_INFO+=$(printf "%-8s | %-8s | %-3s | %-10s" "$pid" "$used/?" "?" "UNKNOWN")$'\n'
                fi
            fi
        done
    fi

    # 磁盘空间
    DISK_INFO=""
    if [ -d "$LOG_DIR" ]; then
        DISK_INFO=$(df -h "$LOG_DIR" | awk 'NR==2')
        disk_usage=$(echo "$DISK_INFO" | awk '{print $5}' | tr -d '%')
        status="OK"
        [ -n "$disk_usage" ] && [ "$disk_usage" -gt "$DISK_WARN_PERCENT" ] && status="WARNING"
        DISK_STATUS=$(printf "%-20s | %-8s | %-8s | %-8s | %-3d%% | %s" \
            "$(echo "$DISK_INFO" | awk '{print $6}')" \
            "$(echo "$DISK_INFO" | awk '{print $2}')" \
            "$(echo "$DISK_INFO" | awk '{print $3}')" \
            "$(echo "$DISK_INFO" | awk '{print $4}')" \
            "${disk_usage:-0}" \
            "$status")
    else
        DISK_STATUS=$(printf "%-20s | %-8s | %-8s | %-8s | %-3s | %s" \
            "$LOG_DIR" "N/A" "N/A" "N/A" "N/A" "目录不存在")
    fi

    # CPU和内存 — the original [[ a > b ]] is a *string* comparison that
    # misorders values like "9.5" vs "75"; compare numerically via awk,
    # which also handles the decimal CPU figure.
    CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}')
    MEM_USAGE=$(free | awk '/Mem/{printf "%.0f", $3/$2*100}')
    cpu_status="OK"
    mem_status="OK"
    awk -v u="${CPU_USAGE:-0}" -v w="$CPU_WARN_PERCENT" 'BEGIN {exit !(u > w)}' && cpu_status="WARNING"
    awk -v u="${MEM_USAGE:-0}" -v w="$MEM_WARN_PERCENT" 'BEGIN {exit !(u > w)}' && mem_status="WARNING"

    CPU_STATUS=$(printf "%-6.1f%% | %s" "${CPU_USAGE:-0}" "$cpu_status")
    MEM_STATUS=$(printf "%-6d%% | %s" "${MEM_USAGE:-0}" "$mem_status")
}

# 检查错误日志
check_error_logs() {
    # Scan candidate Kafka log files and record ERROR/WARN counts plus the
    # last three error lines from the first file that exists.
    ERROR_COUNT=0
    WARN_COUNT=0
    LAST_3_ERRORS="无错误日志"

    # 尝试查找日志文件
    LOG_FILES=("$LOG_DIR/server.log" "$LOG_DIR/kafka.log" "/tmp/kafka-logs/kafka.log")

    local log_file found=""
    for log_file in "${LOG_FILES[@]}"; do
        if [ -f "$log_file" ]; then
            found=$log_file
            # grep -c prints "0" AND exits non-zero when nothing matches, so
            # the original "|| echo 0" produced two-line values like "0\n0"
            # that broke every later numeric comparison.
            ERROR_COUNT=$(grep -c -i "error" "$log_file" 2>/dev/null)
            ERROR_COUNT=${ERROR_COUNT:-0}
            WARN_COUNT=$(grep -c -i "warn" "$log_file" 2>/dev/null)
            WARN_COUNT=${WARN_COUNT:-0}
            LAST_3_ERRORS=$(grep -i "error" "$log_file" | tail -3 | sed 's/.*ERROR/ERROR:/' | sed 's/^/    /')
            [ -z "$LAST_3_ERRORS" ] && LAST_3_ERRORS="无错误日志"
            break
        fi
    done

    # Use an explicit "found" flag: the original re-tested the loop variable
    # after the loop, which only worked by accident.
    if [ -z "$found" ]; then
        LAST_3_ERRORS="日志文件未找到 (尝试: ${LOG_FILES[*]})"
    fi
}

# 检查网络连接
check_network() {
    # Inspect the Kafka process's sockets: listening endpoints, number of
    # established connections, and the five busiest client IPs on :9092.
    LISTEN_PORTS=""
    ESTABLISHED_CONNS=0
    CONN_BY_IP=""

    get_kafka_pid || return 0

    # Take one TCP snapshot and filter it twice instead of re-running ss.
    local tcp_snapshot
    tcp_snapshot=$(ss -tanp | grep "pid=$KAFKA_PID")
    LISTEN_PORTS=$(ss -tuln -p | grep "pid=$KAFKA_PID" | awk '{print $5}' | sort | uniq)
    ESTABLISHED_CONNS=$(echo "$tcp_snapshot" | grep ESTAB | wc -l)
    CONN_BY_IP=$(echo "$tcp_snapshot" | grep ':9092' | awk '{print $5}' | cut -d: -f1 | sort | uniq -c | sort -nr | head -5)
}

# 检查JVM状态
check_jvm() {
    # Query JVM version, flags and GC statistics for the Kafka process via
    # the JDK's jcmd/jstat, deriving JAVA_HOME from the running JVM's
    # executable path when the variable is not already set.
    JVM_VERSION="jcmd不可用"
    JVM_FLAGS="jcmd不可用"
    GC_INFO="jstat不可用"

    get_kafka_pid || return 0

    if [ -z "$JAVA_HOME" ]; then
        local java_path
        java_path=$(readlink -f "/proc/$KAFKA_PID/exe" 2>/dev/null)
        case "$java_path" in
            */bin/java) JAVA_HOME=${java_path%/bin/java} ;;
        esac
    fi

    if [ -n "$JAVA_HOME" ] && [ -x "$JAVA_HOME/bin/jcmd" ]; then
        JVM_VERSION=$("$JAVA_HOME/bin/jcmd" "$KAFKA_PID" VM.version 2>/dev/null | grep version || echo "未知")
        JVM_FLAGS=$("$JAVA_HOME/bin/jcmd" "$KAFKA_PID" VM.flags 2>/dev/null | head -5)
        GC_INFO=$("$JAVA_HOME/bin/jstat" -gc "$KAFKA_PID" 2>/dev/null | tail -1 || echo "未知")
    fi
}

# Fetch broker runtime configuration, falling back to Zookeeper when the
# AdminClient path fails, and load the static server.properties.
#
# Globals read:    KAFKA_HOME, BROKERS, ZK_HOSTS
# Globals written: ACTUAL_CONFIG, DEFAULT_CONFIG
check_configs() {
    ACTUAL_CONFIG=""
    DEFAULT_CONFIG=""

    # Method 1: kafka-configs.sh via the bootstrap servers.
    if [ -x "$KAFKA_HOME/bin/kafka-configs.sh" ]; then
        ACTUAL_CONFIG=$("$KAFKA_HOME/bin/kafka-configs.sh" --bootstrap-server "$BROKERS" --entity-type brokers --describe 2>&1)
        if [[ "$ACTUAL_CONFIG" == *"AuthorizationException"* ]]; then
            ACTUAL_CONFIG="错误: 需要管理员权限"
        elif [[ "$ACTUAL_CONFIG" == *"TimeoutException"* ]]; then
            ACTUAL_CONFIG="错误: 连接超时"
        fi
    fi

    # Method 2: Zookeeper fallback. BUGFIX: also used when method 1 produced
    # no output at all (e.g. kafka-configs.sh missing) — the original only
    # fell back when an explicit "错误" marker had been set.
    if { [ -z "$ACTUAL_CONFIG" ] || [[ "$ACTUAL_CONFIG" == *"错误"* ]]; } && [ -x "$KAFKA_HOME/bin/zookeeper-shell.sh" ]; then
        ACTUAL_CONFIG=$("$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "ls /config/brokers" 2>/dev/null |
            grep '\[.*\]' | jq -r '.[]' | while read -r broker_id; do
                echo "Broker $broker_id 配置:"
                "$KAFKA_HOME/bin/zookeeper-shell.sh" "$ZK_HOSTS" <<< "get /config/brokers/$broker_id" 2>/dev/null | tail -1 | jq -r '.config | to_entries[] | "\(.key)=\(.value)"'
            done)
    fi

    # Static on-disk configuration, with comments and blank lines stripped.
    if [ -f "$KAFKA_HOME/config/server.properties" ]; then
        DEFAULT_CONFIG=$(grep -vE '^#|^$' "$KAFKA_HOME/config/server.properties")
    else
        DEFAULT_CONFIG="错误: server.properties 未找到"
    fi
}

# Render the full inspection report.
#
# Reads the globals populated by the check_* functions (KAFKA_VERSION,
# BROKER_STATUS, TOPIC_INFO, CONSUMER_GROUPS, FD_INFO, DISK_STATUS, ...)
# and writes a structured TXT report to $REPORT_FILE. The entire body is
# one `{ ...; } > "$REPORT_FILE"` group so a single redirection captures
# every echo/printf.
#
# NOTE(review): the awk programs below splice shell threshold variables
# into the awk source text by closing/reopening single quotes (e.g.
# '$UNDER_REPLICATED_WARN'); this is only safe while the thresholds are
# plain integers — verify before making them configurable strings.
generate_report() {
    {
        echo "Kafka集群深度巡检报告"
        echo "生成时间: $(date "+%Y-%m-%d %H:%M:%S")"
        echo "主机名: $(hostname)"
        echo "=============================================================="
        
        # Section 1: cluster overview (versions, uptime, broker/ZK counts).
        print_header "一、集群概览"
        print_key_value "Kafka版本" "$KAFKA_VERSION"
        print_key_value "运行时长" "$UPTIME"
        print_key_value "启动时间" "$START_TIME"
        print_key_value "Broker总数/在线数" "$BROKER_COUNT/$RUNNING_BROKERS"
        print_key_value "Zookeeper模式" "${ZK_MODE:-未知}"
        print_key_value "Zookeeper版本" "${ZK_VERSION:-未知}"
        echo
        
        # Section 2: per-broker table (pre-formatted by check_brokers).
        print_header "二、Broker详细状态"
        print_table_header "Broker地址     | Broker ID | 角色       | 状态    | 检查时间"
        echo "$BROKER_STATUS"
        echo
        
        # Section 3: topics; field 4 is the under-replicated partition count,
        # compared against UNDER_REPLICATED_WARN to derive the status column.
        print_header "三、Topic状态"
        if [ -n "$TOPIC_INFO" ]; then
            print_table_header "Topic名称         | 分区数 | 副本数 | 未同步分区 | 状态"
            echo "$TOPIC_INFO" | awk -F'|' '{
                status = ($4 > '$UNDER_REPLICATED_WARN') ? "WARNING" : "OK";
                printf "%-20s | %-7s | %-7s | %-12s | %s\n", $1, $2, $3, $4, status
            }' | sort
        else
            echo "无法获取Topic信息 (可能没有Topic或连接问题)"
        fi
        echo
        
        # Section 4: consumer groups; field 2 is the lag, graded against the
        # warning/critical thresholds.
        print_header "四、消费者组状态"
        if [ -n "$CONSUMER_GROUPS" ]; then
            print_table_header "消费者组名称         | 消息延迟 | 状态"
            echo "$CONSUMER_GROUPS" | awk -F'|' '{
                status = "OK";
                if ($2 > '$CONSUMER_LAG_CRITICAL') status = "CRITICAL";
                else if ($2 > '$CONSUMER_LAG_WARN') status = "WARNING";
                printf "%-20s | %-10s | %-10s | %s\n", $1, $2, $3, status
            }' | sort
        else
            echo "无法获取消费者组信息 (可能没有消费者组或连接问题)"
        fi
        echo
        
        # Section 5: OS-level resources (FDs, disk, CPU/mem, Zookeeper stats).
        print_header "五、系统资源"
        print_section "1. 文件描述符使用"
        if [ -n "$FD_INFO" ]; then
            print_table_header "PID     | 使用量   | 使用率 | 状态"
            echo "$FD_INFO"
        else
            echo "无法获取文件描述符信息"
        fi
        echo
        
        print_section "2. 磁盘空间"
        echo "$DISK_STATUS"
        echo
        
        print_section "3. CPU和内存"
        print_table_header "CPU使用率 | 状态"
        echo "$CPU_STATUS"
        print_table_header "内存使用率 | 状态"
        echo "$MEM_STATUS"
        echo
        
        print_section "4. Zookeeper状态"
        print_key_value "活跃连接数" "$ZK_CONNECTIONS"
        print_key_value "延迟(最小/平均/最大)" "${ZK_LATENCY:-未知}"
        echo
        
        # Section 6: log analysis results from check_error_logs.
        print_header "六、日志分析"
        print_key_value "ERROR级别日志数量" "$ERROR_COUNT"
        print_key_value "WARN级别日志数量" "$WARN_COUNT"
        echo
        print_section "最近3条ERROR日志"
        echo "$LAST_3_ERRORS"
        echo
        
        # Section 7: network summary from check_network.
        print_header "七、网络连接"
        print_key_value "已建立连接数" "$ESTABLISHED_CONNS"
        print_section "监听端口"
        echo "${LISTEN_PORTS:-无法获取}"
        echo
        print_section "TOP 5客户端IP连接数"
        [ -n "$CONN_BY_IP" ] && echo "$CONN_BY_IP" || echo "无连接信息"
        echo
        
        # Section 8: JVM details from check_jvm (flags truncated to 5 lines).
        print_header "八、JVM状态"
        print_key_value "JVM版本" "$JVM_VERSION"
        print_section "GC统计"
        echo "${GC_INFO:-无法获取}"
        echo
        print_section "JVM参数"
        echo "$JVM_FLAGS" | head -5
        echo "... (完整参数请查看日志)"
        echo
        
        # Section 9: config highlights from server.properties plus the
        # runtime config captured by check_configs (both truncated).
        print_header "九、配置检查"
        print_section "重要配置项"
        if [ -f "$KAFKA_HOME/config/server.properties" ]; then
            grep -E 'log.retention|replication|compression|cleanup.policy' "$KAFKA_HOME/config/server.properties" | head -10
        else
            echo "无法读取server.properties"
        fi
        echo
        print_section "运行时配置差异"
        echo "${ACTUAL_CONFIG:-无法获取运行时配置}" | head -10
        echo "... (完整配置请查看管理界面)"
        echo
        
        # Conclusions: derived verdicts re-scanning the tables for WARNING
        # markers and threshold breaches.
        print_header "巡检结论与建议"
        echo "1. 集群整体状态: $( [ $RUNNING_BROKERS -eq $BROKER_COUNT ] && echo "健康" || echo "警告" )"
        echo "2. 资源使用情况:"
        [ -n "$DISK_STATUS" ] && echo "   - 磁盘空间: $(echo "$DISK_STATUS" | grep -q "WARNING" && echo "警告" || echo "正常")"
        [ -n "$FD_INFO" ] && echo "   - 文件描述符: $(echo "$FD_INFO" | grep -q "WARNING" && echo "警告" || echo "正常")"
        echo "3. 关键问题:"
        [ $ERROR_COUNT -gt 0 ] && echo "   - 发现 $ERROR_COUNT 个错误日志需要检查"
        [ -n "$TOPIC_INFO" ] && echo "$TOPIC_INFO" | awk -F'|' '$4 > '$UNDER_REPLICATED_WARN' {print "   - Topic " $1 " 有 " $4 " 个未同步分区"}'
        [ -n "$CONSUMER_GROUPS" ] && echo "$CONSUMER_GROUPS" | awk -F'|' '$2 > '$CONSUMER_LAG_CRITICAL' {print "   - 消费者组 " $1 " 有严重延迟 (" $2 ")"}'
        echo
        echo "建议操作:"
        [ -n "$DISK_STATUS" ] && echo "1. 定期清理旧日志 (当前磁盘使用: $(echo "$DISK_STATUS" | awk -F'|' '{print $5}'))"
        [ -n "$CONSUMER_GROUPS" ] && echo "2. 监控消费者组延迟情况"
        [ -n "$TOPIC_INFO" ] && echo "3. 检查未同步分区的Topic"
        echo "4. 检查Zookeeper连接配置 (当前状态: ${ZK_MODE:-未知})"
        echo
        
        echo "=============================================================="
        echo "报告生成完成"
        echo "保存路径: $REPORT_FILE"
        
    } > "$REPORT_FILE"
}

# Entry point: verify dependencies, auto-detect the log directory, run
# every check_* collector, then render the TXT report to $REPORT_FILE.
main() {
    echo "[$(date)] 开始Kafka集群深度巡检..."
    check_dependencies

    # Auto-detect the real log directory from server.properties.
    # BUGFIX: anchor the key as the literal "log.dirs=" — the old pattern
    # '^log.dirs' treated the dot as a regex wildcard and was unanchored on
    # the right, so it could match other keys beginning with that prefix.
    # Only the first entry of a comma-separated log.dirs list is used.
    if [ ! -d "$LOG_DIR" ] && [ -f "$KAFKA_HOME/config/server.properties" ]; then
        possible_log_dir=$(grep -E '^log\.dirs=' "$KAFKA_HOME/config/server.properties" | cut -d'=' -f2 | cut -d',' -f1)
        [ -d "$possible_log_dir" ] && LOG_DIR="$possible_log_dir"
    fi

    get_kafka_info
    check_zookeeper
    check_brokers
    check_topics
    check_consumer_groups
    check_system_resources
    check_error_logs
    check_network
    check_jvm
    check_configs

    generate_report

    echo "[$(date)] 巡检完成! 报告已生成: $REPORT_FILE"
    echo "使用以下命令查看报告:"
    echo "  less $REPORT_FILE"
    echo "  cat $REPORT_FILE"

    # World-readable so other operators can open the report.
    chmod 644 "$REPORT_FILE"
}

main

关键优化点:

  1. Zookeeper信息获取优化

    • 直接使用nc发送四字命令获取状态,绕过zookeeper-shell.sh
    • 更可靠地获取运行模式、版本和延迟信息
  2. Broker ID获取优化

    • 从Kafka进程命令行参数中提取broker.id
    • 避免依赖Zookeeper连接
  3. Topic和消费者组检查优化

    • 跳过空行和无效结果
    • 添加错误处理防止空记录
    • 更友好的错误提示
  4. 日志目录自动检测

    • 从server.properties中读取log.dirs配置
    • 尝试多个可能的日志位置
  5. 配置检查优化

    • 主方法失败时尝试备用方法(从Zookeeper获取)
    • 添加超时和权限错误的明确提示
  6. 错误日志检测增强

    • 尝试多个可能的日志文件位置
    • 提供更明确的错误信息
  7. 报告输出优化

    • 所有字段都有默认值(避免空输出)
    • 更友好的空状态提示
    • 明确的建议操作

使用说明:

  1. 保存脚本为kafka_inspection_ultimate.sh
  2. 无需修改任何配置即可运行(会自动检测环境)
  3. 执行脚本:
    chmod +x kafka_inspection_ultimate.sh
    ./kafka_inspection_ultimate.sh
    
  4. 查看报告:
    less /tmp/kafka_inspection_$(date +%Y%m%d).txt
    

这个优化版本解决了之前遇到的所有问题,特别是:

  • Zookeeper信息获取
  • Broker ID识别
  • Topic/消费者组数据格式
  • 日志目录检测
  • 配置检查可靠性

报告输出更加完整和专业,即使在部分服务不可用的情况下也能提供有价值的诊断信息。

在这里插入图片描述

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值