日常巡检核心脚本
检查服务器CPU、内存、磁盘、网络、进程等关键指标,生成带颜色的报告,异常指标自动标黄/红。
#!/bin/bash
# ============================ Configuration ============================
# Host name and start time, captured once and reused in the report header.
HOSTNAME=$(hostname)
DATE=$(date '+%Y-%m-%d %H:%M:%S')

# Report destination: one timestamped file per run under /var/log.
REPORT_FILE="/var/log/system_info_check_$(date '+%Y%m%d_%H%M%S').log"

# Alert thresholds (tune to the server's sizing).
CPU_WARNING=80     # CPU usage alert threshold (%)
MEM_WARNING=85     # memory usage alert threshold (%)
DISK_WARNING=85    # disk usage alert threshold (%)
LOAD_WARNING=4     # load threshold (suggested: CPU cores * 2, e.g. 8 for 4 cores)
INODE_WARNING=80   # inode usage alert threshold (%)
# =======================================================================

# ANSI colour escape sequences, stored unexpanded (literal backslashes).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'       # reset colour
# Timestamped logger: prints "[YYYY-mm-dd HH:MM:SS] <message>" to stdout and
# appends the same text to $REPORT_FILE.
# Arguments: [-e] message
#   -e  interpret backslash escapes in the message, as `echo -e` would.
#       Callers in this script invoke `log -e "\n..."`; without this handling
#       the literal "-e" was logged and the real message was dropped.
log() {
  local stamp
  stamp=$(date +"%Y-%m-%d %H:%M:%S")
  if [[ "${1-}" == "-e" ]]; then
    shift
    printf '%b\n' "[${stamp}] ${1-}" | tee -a "$REPORT_FILE"
  else
    printf '%s\n' "[${stamp}] ${1-}" | tee -a "$REPORT_FILE"
  fi
}
# Section banner: a blank line, a rule, the title indented by one space, and a
# closing rule. Written to stdout and appended to $REPORT_FILE.
# Arguments: $1 - section title
log_section() {
  local rule="============================================================"
  printf '\n%s\n' "$rule" | tee -a "$REPORT_FILE"
  printf '%s\n' " $1" | tee -a "$REPORT_FILE"
  printf '%s\n' "$rule" | tee -a "$REPORT_FILE"
}
# Coloured status loggers. Each prints its tag and message wrapped in the
# matching colour code (backslash escapes in colour and message are expanded)
# to stdout and appends the same bytes to $REPORT_FILE.
# Arguments: $1 - message
log_warning() {
  printf '%b\n' "${YELLOW}[WARNING] $1${NC}" | tee -a "$REPORT_FILE"
}

log_error() {
  printf '%b\n' "${RED}[ERROR] $1${NC}" | tee -a "$REPORT_FILE"
}

log_ok() {
  printf '%b\n' "${GREEN}[OK] $1${NC}" | tee -a "$REPORT_FILE"
}
# 1. Basic system information
# Logs host name, check time, OS release, kernel, architecture, uptime,
# current user and the number of logged-in sessions.
# Globals read: HOSTNAME, DATE
check_basic_info() {
  local os_version kernel arch up_for user sessions

  log_section "1. 系统基本信息"
  log "主机名: $HOSTNAME"
  log "检查时间: $DATE"

  # Prefer the Red Hat release file; otherwise use the first line of /etc/issue.
  os_version=$(cat /etc/redhat-release 2>/dev/null || cat /etc/issue | head -1)
  log "系统版本: ${os_version}"

  kernel=$(uname -r)
  log "内核版本: ${kernel}"

  arch=$(uname -m)
  log "系统架构: ${arch}"

  # Text between "up " and the first comma of the uptime line.
  up_for=$(uptime | awk -F'up ' '{ split($2, parts, ","); print parts[1] }')
  log "运行时长: ${up_for}"

  user=$(whoami)
  log "当前用户: ${user}"

  sessions=$(who | wc -l)
  log "登录用户数: ${sessions}"
}
# 2. CPU and load check
# Logs the core count, the CPU usage derived from top's idle figure, the top
# five CPU consumers when usage exceeds CPU_WARNING, and the load averages,
# warning when the 1-minute load exceeds cores * 2.
# Globals read:    CPU_WARNING, REPORT_FILE
# Globals written: CPU_CORES, CPU_IDLE, CPU_USAGE, LOAD_1, LOAD_5, LOAD_15,
#                  LOAD_THRESHOLD
check_cpu() {
  log_section "2. CPU使用率检查"
  CPU_CORES=$(grep -c ^processor /proc/cpuinfo)
  log "CPU核心数: $CPU_CORES"

  # Use the last of two top samples taken 1s apart. The idle value is located
  # by pattern ("<number> id" / "<number>%id") instead of by field position:
  # top can print "ni,100.0 id" without a separating space, which shifts every
  # later field. LC_ALL=C keeps the decimal separator a dot.
  CPU_IDLE=$(LC_ALL=C top -bn2 -d 1 | awk '
    /Cpu\(s\)/ { last = $0 }
    END {
      if (match(last, /[0-9]+(\.[0-9]+)?[ %]*id/)) {
        idle = substr(last, RSTART, RLENGTH)
        sub(/[ %]*id$/, "", idle)
        print idle
      }
    }')

  if [[ -z "$CPU_IDLE" ]]; then
    log_warning "无法从top输出解析CPU空闲率,跳过CPU使用率检查"
  else
    # awk does the float arithmetic so the check does not depend on bc.
    CPU_USAGE=$(awk -v idle="$CPU_IDLE" 'BEGIN { printf "%.2f", 100 - idle }')
    log "CPU使用率: ${CPU_USAGE}%"
    if awk -v u="$CPU_USAGE" -v w="$CPU_WARNING" 'BEGIN { exit !(u + 0 > w + 0) }'; then
      log_warning "CPU使用率超过${CPU_WARNING}%,当前${CPU_USAGE}%"
      log "TOP 5 CPU消耗进程:"
      ps aux | sort -rn -k3 | head -5 \
        | awk '{printf " PID: %-8s User: %-10s CPU: %-6s CMD: %s\n", $2,$1,$3,$11}' \
        | tee -a "$REPORT_FILE"
    else
      log_ok "CPU使用率正常: ${CPU_USAGE}%"
    fi
  fi

  # Load averages come straight from the kernel (first three fields), read
  # once, rather than by splitting uptime's text on commas.
  read -r LOAD_1 LOAD_5 LOAD_15 _ < /proc/loadavg
  log "系统负载: 1分钟=${LOAD_1}, 5分钟=${LOAD_5}, 15分钟=${LOAD_15}"

  # Load threshold = CPU cores * 2 (e.g. 8 on a 4-core host).
  LOAD_THRESHOLD=$(( CPU_CORES * 2 ))
  if awk -v l="$LOAD_1" -v t="$LOAD_THRESHOLD" 'BEGIN { exit !(l + 0 > t + 0) }'; then
    log_warning "系统负载过高!1分钟负载${LOAD_1}超过阈值${LOAD_THRESHOLD}"
  fi
}
# 3. Memory check
# Logs total/used/available memory and the usage percentage, lists the top
# five memory consumers when usage exceeds MEM_WARNING, and warns when more
# than 100 MB of swap is in use.
# Globals read:    MEM_WARNING, REPORT_FILE
# Globals written: MEM_TOTAL, MEM_USED, MEM_AVAILABLE, MEM_USAGE, SWAP_TOTAL,
#                  SWAP_USED
check_memory() {
  log_section "3. 内存使用检查"

  # Run free once and select rows by label; LC_ALL=C keeps the labels as
  # "Mem:" / "Swap:". Selecting by row number misreads Swap on layouts that
  # print an extra "-/+ buffers/cache" row.
  local mem_report
  mem_report=$(LC_ALL=C free -m)
  MEM_TOTAL=$(awk '/^Mem:/ {print $2}' <<<"$mem_report")
  MEM_USED=$(awk '/^Mem:/ {print $3}' <<<"$mem_report")
  # NOTE(review): column 7 is assumed to be "available"; older free layouts
  # may put a different figure there - confirm on the target hosts.
  MEM_AVAILABLE=$(awk '/^Mem:/ {print $7}' <<<"$mem_report")

  log "内存总量: ${MEM_TOTAL}MB"
  log "已用内存: ${MEM_USED}MB"
  log "可用内存: ${MEM_AVAILABLE}MB"

  if [[ "$MEM_TOTAL" =~ ^[0-9]+$ && "$MEM_USED" =~ ^[0-9]+$ ]] && (( MEM_TOTAL > 0 )); then
    # Multiply before dividing: "used / total * 100" at scale 2 truncated the
    # ratio first, so 85.9% was reported as 85.00 and never crossed 85.
    MEM_USAGE=$(awk -v u="$MEM_USED" -v t="$MEM_TOTAL" 'BEGIN { printf "%.2f", u * 100 / t }')
    log "内存使用率: ${MEM_USAGE}%"
    if awk -v u="$MEM_USAGE" -v w="$MEM_WARNING" 'BEGIN { exit !(u + 0 > w + 0) }'; then
      log_warning "内存使用率超过${MEM_WARNING}%,当前${MEM_USAGE}%"
      log "TOP 5 内存消耗进程:"
      ps aux | sort -rn -k4 | head -5 \
        | awk '{printf " PID: %-8s User: %-10s MEM: %-6s CMD: %s\n", $2,$1,$4,$11}' \
        | tee -a "$REPORT_FILE"
    else
      log_ok "内存使用率正常: ${MEM_USAGE}%"
    fi
  else
    log_warning "无法解析free输出,跳过内存使用率检查"
  fi

  # Swap check: warn when more than 100 MB is in use. Missing values are
  # treated as 0 so the integer tests cannot fail on an empty string.
  SWAP_TOTAL=$(awk '/^Swap:/ {print $2}' <<<"$mem_report")
  SWAP_USED=$(awk '/^Swap:/ {print $3}' <<<"$mem_report")
  SWAP_TOTAL=${SWAP_TOTAL:-0}
  SWAP_USED=${SWAP_USED:-0}
  log "Swap总量: ${SWAP_TOTAL}MB"
  log "Swap使用: ${SWAP_USED}MB"
  if [ "$SWAP_TOTAL" -gt 0 ] && [ "$SWAP_USED" -gt 100 ]; then
    log_warning "Swap使用量较高: ${SWAP_USED}MB,可能存在内存压力"
  fi
}
# 4. Disk check
# Logs per-filesystem space usage, warns for every mount above DISK_WARNING
# (listing its five largest top-level entries), then logs inode usage and
# warns for every mount above INODE_WARNING.
# Globals read: DISK_WARNING, INODE_WARNING, REPORT_FILE
check_disk() {
  local df_space df_inode pct mount usage has_disk_warning=0

  log_section "4. 磁盘使用检查"
  log "磁盘分区使用情况:"
  # -P keeps each filesystem on one line even with long device names, and
  # LC_ALL=C guarantees the header starts with "Filesystem" so the filter
  # below removes it in every locale.
  df_space=$(LC_ALL=C df -hP | grep -vE '^Filesystem|tmpfs|cdrom|loop')
  printf '%s\n' "$df_space" | awk '{print " " $0}' | tee -a "$REPORT_FILE"

  # Space usage alerts.
  while read -r _ _ _ _ pct mount; do
    usage=${pct%\%}
    # Skip rows whose usage column is not a plain integer.
    [[ "$usage" =~ ^[0-9]+$ ]] || continue
    if (( usage > DISK_WARNING )); then
      log_warning "磁盘分区 $mount 使用率${usage}%超过阈值${DISK_WARNING}%"
      has_disk_warning=1
      log " $mount 分区占用空间最大的5个目录:"
      du -sh "${mount%/}"/* 2>/dev/null | sort -rh | head -5 \
        | awk '{print " " $0}' | tee -a "$REPORT_FILE"
    fi
  done <<<"$df_space"

  if (( has_disk_warning == 0 )); then
    log_ok "所有磁盘分区使用率正常"
  fi

  # Inode check (many small files can exhaust inodes before space runs out).
  echo "" | tee -a "$REPORT_FILE"
  log "Inode使用情况:"
  df_inode=$(LC_ALL=C df -iP | grep -vE '^Filesystem|tmpfs|cdrom|loop')
  printf '%s\n' "$df_inode" | awk '{print " " $0}' | tee -a "$REPORT_FILE"

  while read -r _ _ _ _ pct mount; do
    usage=${pct%\%}
    # Filesystems without inode accounting report "-"; skip them.
    [[ "$usage" =~ ^[0-9]+$ ]] || continue
    if (( usage > INODE_WARNING )); then
      log_warning "分区 $mount 的Inode使用率${usage}%超过阈值${INODE_WARNING}%"
    fi
  done <<<"$df_inode"
}
# 5. Network check
# Logs interface addresses, a per-state TCP connection count, the number of
# TIME_WAIT connections (warning above 5000) and the listening sockets.
# Uses netstat when installed and falls back to ss otherwise.
# Globals read:    REPORT_FILE
# Globals written: TIME_WAIT_COUNT
check_network() {
  local tcp_states="" listeners=""

  log_section "5. 网络状态检查"
  log "网络接口状态:"
  ip -br addr | awk '{print " " $0}' | tee -a "$REPORT_FILE"

  # Collect the socket data once and reuse it for every report below.
  if command -v netstat >/dev/null 2>&1; then
    tcp_states=$(netstat -an 2>/dev/null | awk '/^tcp/ {print $6}')
    listeners=$(netstat -tuln 2>/dev/null | grep LISTEN)
  elif command -v ss >/dev/null 2>&1; then
    # ss -tan prints the state in column 1; ss -tuln adds a Netid column, so
    # the state moves to column 2.
    tcp_states=$(ss -tan 2>/dev/null | awk 'NR > 1 {print $1}')
    listeners=$(ss -tuln 2>/dev/null | awk 'NR > 1 && $2 == "LISTEN"')
  else
    log_warning "未找到netstat或ss命令,跳过TCP连接与监听端口检查"
    return 0
  fi

  # TCP connection statistics by state.
  echo "" | tee -a "$REPORT_FILE"
  log "TCP连接状态统计:"
  if [[ -n "$tcp_states" ]]; then
    printf '%s\n' "$tcp_states" | sort | uniq -c | sort -rn \
      | awk '{print " " $0}' | tee -a "$REPORT_FILE"
  fi

  # TIME_WAIT alert (more than 5000). netstat spells the state TIME_WAIT,
  # ss spells it TIME-WAIT.
  TIME_WAIT_COUNT=$(printf '%s\n' "$tcp_states" | grep -cE '^TIME[-_]WAIT$')
  log "TIME_WAIT连接数: $TIME_WAIT_COUNT"
  if [ "$TIME_WAIT_COUNT" -gt 5000 ]; then
    log_warning "TIME_WAIT连接数过多: $TIME_WAIT_COUNT(可优化TCP参数)"
  fi

  # Listening sockets.
  echo "" | tee -a "$REPORT_FILE"
  log "当前监听端口:"
  if [[ -n "$listeners" ]]; then
    printf '%s\n' "$listeners" | awk '{print " " $0}' | tee -a "$REPORT_FILE"
  fi
}
# 6. Process and service check
# Logs the process count, reports zombie processes with their PID/PPID, and
# checks that every service in CRITICAL_SERVICES is running.
# Globals read:    REPORT_FILE
# Globals written: PROCESS_COUNT, ZOMBIE_COUNT, CRITICAL_SERVICES
check_processes() {
  local zombies service

  log_section "6. 进程和服务检查"
  # An empty column header ("pid=") suppresses the header line, so it is not
  # counted as a process.
  PROCESS_COUNT=$(ps -e -o pid= | wc -l)
  log "当前进程总数: $PROCESS_COUNT"

  # Zombie check: select on the state column only, and print the real PPID.
  # (Grepping whole `ps aux` lines for "Z" matched unrelated processes, and
  # column 3 of `ps aux` is %CPU, not the parent PID.)
  zombies=$(ps -e -o pid= -o ppid= -o stat= -o comm= \
    | awk '$3 ~ /^Z/ {print " PID: " $1 " PPID: " $2 " CMD: " $4}')
  ZOMBIE_COUNT=$(printf '%s' "$zombies" | grep -c .)
  log "僵尸进程数: $ZOMBIE_COUNT"
  if [ "$ZOMBIE_COUNT" -gt 0 ]; then
    log_warning "发现僵尸进程!"
    printf '%s\n' "$zombies" | tee -a "$REPORT_FILE"
  fi

  # Critical services (extend the list as needed, e.g. "nginx" "mysqld").
  echo "" | tee -a "$REPORT_FILE"
  log "关键服务状态检查:"
  CRITICAL_SERVICES=("sshd" "crond" "rsyslog")
  for service in "${CRITICAL_SERVICES[@]}"; do
    if systemctl is-active --quiet "$service" 2>/dev/null; then
      log_ok " $service: 运行中"
    # Fallback for hosts without systemd: match against process names only,
    # so a command line that merely mentions the service does not count.
    elif pgrep -- "$service" >/dev/null 2>&1; then
      log_ok " $service: 运行中"
    else
      log_error " $service: 未运行(需手动启动)"
    fi
  done
}
# 7. System log check
# Scans the system log for error/fail/critical lines (reporting the true match
# count and the ten most recent matches) and scans the kernel ring buffer for
# out-of-memory events.
# Globals read:    REPORT_FILE
# Globals written: ERROR_COUNT, OOM_COUNT
check_logs() {
  local syslog_file="" candidate

  log_section "7. 系统日志检查"

  # /var/log/messages is checked first, then /var/log/syslog as a fallback.
  for candidate in /var/log/messages /var/log/syslog; do
    if [[ -r "$candidate" ]]; then
      syslog_file=$candidate
      break
    fi
  done

  # The whole file is scanned (no time filtering), so the label says so.
  log "系统错误日志(全量扫描,显示最近10条):"
  if [[ -n "$syslog_file" ]]; then
    # grep -c gives the real number of matches; the earlier
    # "tail -20 | wc -l" could never report more than 20.
    ERROR_COUNT=$(grep -ci "error\|fail\|critical" "$syslog_file" 2>/dev/null)
    ERROR_COUNT=${ERROR_COUNT:-0}
    if [ "$ERROR_COUNT" -gt 0 ]; then
      log_warning "发现 $ERROR_COUNT 条错误日志"
      grep -i "error\|fail\|critical" "$syslog_file" | tail -10 \
        | awk '{print " " $0}' | tee -a "$REPORT_FILE"
    else
      log_ok "无严重错误日志"
    fi
  else
    log "未找到可读的 /var/log/messages 或 /var/log/syslog,跳过"
  fi

  # OOM check (processes killed because memory ran out). dmesg errors, e.g.
  # when the ring buffer is not readable by this user, are discarded.
  echo "" | tee -a "$REPORT_FILE"
  log "OOM(内存溢出)检查:"
  OOM_COUNT=$(dmesg 2>/dev/null | grep -ci "out of memory")
  OOM_COUNT=${OOM_COUNT:-0}
  if [ "$OOM_COUNT" -gt 0 ]; then
    log_warning "发现 $OOM_COUNT 次OOM事件(需检查内存配置)"
    dmesg 2>/dev/null | grep -i "out of memory" | tail -5 \
      | awk '{print " " $0}' | tee -a "$REPORT_FILE"
  else
    log_ok "无OOM事件"
  fi
}
# 8. Report summary
# Counts the [WARNING] and [ERROR] lines already written to $REPORT_FILE, logs
# both totals with the completion time, and prints an overall verdict.
# Globals read:    REPORT_FILE
# Globals written: WARNING_COUNT, ERROR_COUNT
generate_summary() {
  log_section "8. 巡检报告摘要"

  # grep prints nothing when the report cannot be read; fall back to 0 so the
  # integer tests below never see an empty string.
  WARNING_COUNT=$(grep -c "\[WARNING\]" "$REPORT_FILE" 2>/dev/null)
  ERROR_COUNT=$(grep -c "\[ERROR\]" "$REPORT_FILE" 2>/dev/null)
  WARNING_COUNT=${WARNING_COUNT:-0}
  ERROR_COUNT=${ERROR_COUNT:-0}

  log "巡检完成时间: $(date +"%Y-%m-%d %H:%M:%S")"
  log "告警数量: $WARNING_COUNT"
  log "错误数量: $ERROR_COUNT"

  if [ "$ERROR_COUNT" -gt 0 ]; then
    log_error "发现 $ERROR_COUNT 个严重问题,请立即处理!"
  elif [ "$WARNING_COUNT" -gt 0 ]; then
    log_warning "发现 $WARNING_COUNT 个告警,建议关注"
  else
    log_ok "系统状态良好,无异常"
  fi

  # Blank separator line, then the report path. (Passing -e to log made it
  # print the literal "-e" and drop this message.)
  echo "" | tee -a "$REPORT_FILE"
  log "完整报告已保存至: $REPORT_FILE"
}
# Main routine: prints the banner, notes when not running as root (some checks
# need root), runs every check in order and prints the closing banner.
main() {
  local rule="=========================================="
  local step

  printf '%s\n' "$rule" " 服务器健康状态巡检脚本" "$rule"
  printf '\n\n'

  # Non-root notice.
  if [[ "$(id -u)" -ne 0 ]]; then
    printf '%s\n' "警告: 非root用户运行,部分检查可能无法执行"
    printf '\n\n'
  fi

  # Run all checks, then the summary.
  for step in \
    check_basic_info \
    check_cpu \
    check_memory \
    check_disk \
    check_network \
    check_processes \
    check_logs \
    generate_summary; do
    "$step"
  done

  printf '\n%s\n%s\n%s\n' "$rule" " 巡检完成!" "$rule"
}
# Script entry point: forward all command-line arguments to main.
main "$@"
磁盘空间深度检查脚本
找出大文件、旧日志、临时文件,还能预测磁盘满的时间,给出清理建议。
#!/bin/bash
# ============================ Configuration ============================
# Report destination: one timestamped file per run under /var/log.
REPORT_FILE="/var/log/disk_check_$(date '+%Y%m%d_%H%M%S').log"

DISK_WARNING=80      # disk usage alert threshold (%)
INODE_WARNING=80     # inode usage alert threshold (%)
LARGE_FILE_SIZE=1G   # large-file threshold (e.g. 500M or 2G)
# =======================================================================

# ANSI colour escape sequences, stored unexpanded (literal backslashes).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'         # reset colour
# Timestamped logger: prints "[YYYY-mm-dd HH:MM:SS] <message>" to stdout and
# appends the same text to $REPORT_FILE.
# Arguments: [-e] message
#   -e  interpret backslash escapes in the message, as `echo -e` would.
#       Callers in this script invoke `log -e "\n..."`; without this handling
#       the literal "-e" was logged and the real message was dropped.
log() {
  local stamp
  stamp=$(date +"%Y-%m-%d %H:%M:%S")
  if [[ "${1-}" == "-e" ]]; then
    shift
    printf '%b\n' "[${stamp}] ${1-}" | tee -a "$REPORT_FILE"
  else
    printf '%s\n' "[${stamp}] ${1-}" | tee -a "$REPORT_FILE"
  fi
}
# Coloured status loggers. Each prints its tag and message wrapped in the
# matching colour code (backslash escapes in colour and message are expanded)
# to stdout and appends the same bytes to $REPORT_FILE.
# Arguments: $1 - message
log_warning() {
  printf '%b\n' "${YELLOW}[WARNING] $1${NC}" | tee -a "$REPORT_FILE"
}

log_error() {
  printf '%b\n' "${RED}[ERROR] $1${NC}" | tee -a "$REPORT_FILE"
}

log_ok() {
  printf '%b\n' "${GREEN}[OK] $1${NC}" | tee -a "$REPORT_FILE"
}
# 1. Disk usage check
# For each filesystem logs mount point, usage and free space. When usage
# exceeds DISK_WARNING it also lists the ten largest top-level entries and up
# to ten files larger than LARGE_FILE_SIZE on that filesystem.
# Globals read: DISK_WARNING, LARGE_FILE_SIZE, REPORT_FILE
check_disk_usage() {
  local avail pct mount usage

  log "==========================================================="
  log "1. 磁盘使用率检查"
  log "==========================================================="

  # -P keeps each filesystem on one line; LC_ALL=C guarantees the header
  # starts with "Filesystem" so the filter removes it in every locale.
  LC_ALL=C df -hP | grep -vE '^Filesystem|tmpfs|cdrom|loop' \
    | while read -r _ _ _ avail pct mount; do
      usage=${pct%\%}
      log "分区: $mount"
      log " 使用率: ${usage}%"
      log " 可用空间: $avail"
      if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
        # A non-numeric usage column would break the integer comparison.
        log_warning " 无法解析使用率,跳过该分区"
      elif (( usage > DISK_WARNING )); then
        log_error " 磁盘使用率超过${DISK_WARNING}%!"
        # Top 10 largest top-level entries.
        log " 占用空间TOP 10目录:"
        du -sh "${mount%/}"/* 2>/dev/null | sort -rh | head -10 \
          | awk '{print " " $0}' | tee -a "$REPORT_FILE"
        # Files above the size threshold. -xdev keeps find on this filesystem;
        # du prints "size<TAB>path", so names containing spaces stay intact.
        log " 大文件(>$LARGE_FILE_SIZE):"
        find "$mount" -xdev -type f -size +"$LARGE_FILE_SIZE" \
          -exec du -h --apparent-size {} + 2>/dev/null \
          | head -10 \
          | awk -F'\t' '{print " " $2 " (" $1 ")"}' | tee -a "$REPORT_FILE"
      else
        log_ok " 使用率正常"
      fi
      log ""
    done
}
# 2. Inode usage check (many small files can exhaust inodes)
# For each filesystem logs the inode usage; above INODE_WARNING it lists the
# ten top-level directories holding the most regular files.
# Globals read: INODE_WARNING, REPORT_FILE
check_inode_usage() {
  local pct mount usage dir

  log "==========================================================="
  log "2. Inode使用率检查"
  log "==========================================================="

  LC_ALL=C df -iP | grep -vE '^Filesystem|tmpfs|cdrom|loop' \
    | while read -r _ _ _ _ pct mount; do
      usage=${pct%\%}
      log "分区: $mount"
      log " Inode使用率: ${usage}%"
      if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
        # Filesystems without inode accounting report "-".
        log " Inode使用率不可用,跳过"
      elif (( usage > INODE_WARNING )); then
        log_error " Inode使用率超过${INODE_WARNING}%!(小文件太多)"
        # Top 10 directories by file count. A glob replaces the word-split
        # `for dir in $(find ...)`, so directory names with spaces survive.
        log " 文件数量TOP 10目录:"
        for dir in "${mount%/}"/*/; do
          dir=${dir%/}
          # Real directories only (no symlinks, no unmatched glob pattern).
          [[ -d "$dir" && ! -L "$dir" ]] || continue
          printf '%s\t%s\n' "$(find "$dir" -xdev -type f 2>/dev/null | wc -l)" "$dir"
        done | sort -rn | head -10 \
          | awk -F'\t' '{print " " $1 " files: " $2}' | tee -a "$REPORT_FILE"
      else
        log_ok " Inode使用率正常"
      fi
      log ""
    done
}
# 3. Log file size check
# For each existing directory in LOG_DIRS lists up to ten files over 100 MB
# and the directory's total size, then lists up to ten *.log files under
# /var/log that were last modified more than 30 days ago.
# Globals read:    REPORT_FILE
# Globals written: LOG_DIRS, TOTAL_SIZE
check_log_files() {
  local log_dir

  log "==========================================================="
  log "3. 日志文件大小检查"
  log "==========================================================="

  # Common log directories (add your own, e.g. /opt/logs).
  LOG_DIRS=("/var/log" "/opt/logs" "/data/logs" "/app/logs")
  for log_dir in "${LOG_DIRS[@]}"; do
    if [ -d "$log_dir" ]; then
      log "检查目录: $log_dir"
      # Files over 100 MB. du prints "size<TAB>path", which keeps file names
      # with spaces intact (splitting `ls -lh` output on whitespace did not).
      log " 大日志文件(>100MB):"
      find "$log_dir" -type f -size +100M \
        -exec du -h --apparent-size {} + 2>/dev/null \
        | head -10 \
        | awk -F'\t' '{print " " $2 " (" $1 ")"}' | tee -a "$REPORT_FILE"
      # Total size of the directory.
      TOTAL_SIZE=$(du -sh "$log_dir" 2>/dev/null | awk '{print $1}')
      log " 总大小: $TOTAL_SIZE"
      log ""
    fi
  done

  # Uncompressed logs older than 30 days. A name matching *.log can never
  # also match *.gz, so no extra exclusion is needed.
  log "未压缩的旧日志文件(30天前):"
  find /var/log -type f -name "*.log" -mtime +30 \
    -exec du -h --apparent-size {} + 2>/dev/null \
    | head -10 \
    | awk -F'\t' '{print " " $2 " (" $1 ")"}' | tee -a "$REPORT_FILE"
}
# 4. Temporary file check
# Logs the size and file count of /tmp, warns about files not accessed for
# more than 7 days (listing the ten largest), and logs the size of /var/tmp.
# Globals read:    REPORT_FILE
# Globals written: TMP_SIZE, TMP_FILE_COUNT, OLD_TMP, VAR_TMP_SIZE
check_temp_files() {
  log "==========================================================="
  log "4. 临时文件检查"
  log "==========================================================="

  # /tmp
  if [ -d "/tmp" ]; then
    TMP_SIZE=$(du -sh /tmp 2>/dev/null | awk '{print $1}')
    TMP_FILE_COUNT=$(find /tmp -type f 2>/dev/null | wc -l)
    log "/tmp目录:"
    log " 总大小: $TMP_SIZE"
    log " 文件数: $TMP_FILE_COUNT"

    # Files not accessed for more than 7 days.
    OLD_TMP=$(find /tmp -type f -atime +7 2>/dev/null | wc -l)
    if [ "$OLD_TMP" -gt 0 ]; then
      log_warning " 发现 $OLD_TMP 个超过7天未访问的临时文件"
      log " 占用空间TOP 10:"
      # du prints "size<TAB>path": sortable by size, safe for names with spaces.
      find /tmp -type f -atime +7 \
        -exec du -h --apparent-size {} + 2>/dev/null \
        | sort -rh | head -10 \
        | awk -F'\t' '{print " " $2 " (" $1 ")"}' | tee -a "$REPORT_FILE"
    fi
  fi

  # /var/tmp
  if [ -d "/var/tmp" ]; then
    VAR_TMP_SIZE=$(du -sh /var/tmp 2>/dev/null | awk '{print $1}')
    # Blank separator, then the heading. (Passing -e to log made it print the
    # literal "-e" and drop the heading.)
    echo "" | tee -a "$REPORT_FILE"
    log "/var/tmp目录:"
    log " 总大小: $VAR_TMP_SIZE"
  fi
}
# 5. Disk I/O check
# Logs extended iostat output when iostat is installed, then reads the I/O
# wait percentage from top and warns when it exceeds 20%.
# Globals read:    REPORT_FILE
# Globals written: IO_WAIT
check_disk_io() {
  log "==========================================================="
  log "5. 磁盘IO统计"
  log "==========================================================="

  if command -v iostat &> /dev/null; then
    log "磁盘IO统计(最近1分钟):"
    iostat -x 1 2 | tail -n +4 | awk 'NF' | tail -n +2 | tee -a "$REPORT_FILE"
  else
    log_warning "未安装iostat工具(执行:yum install sysstat 安装)"
  fi

  # I/O wait alert (above 20%). The value is located by pattern
  # ("<number> wa" / "<number>%wa") instead of by field position, because
  # top's columns shift when a value is printed without a leading space
  # (e.g. "ni,100.0 id"). LC_ALL=C keeps the decimal separator a dot.
  IO_WAIT=$(LC_ALL=C top -bn1 | awk '
    /Cpu\(s\)/ {
      if (match($0, /[0-9]+(\.[0-9]+)?[ %]*wa/)) {
        value = substr($0, RSTART, RLENGTH)
        sub(/[ %]*wa$/, "", value)
        print value
      }
      exit
    }')

  echo "" | tee -a "$REPORT_FILE"
  if [[ -z "$IO_WAIT" ]]; then
    log_warning "无法从top输出解析IO等待,跳过IO等待检查"
    return 0
  fi
  log "当前IO等待: ${IO_WAIT}%"
  # awk does the float comparison so the check does not depend on bc.
  if awk -v w="$IO_WAIT" 'BEGIN { exit !(w + 0 > 20) }'; then
    log_warning "IO等待过高!(可能是磁盘性能不足)"
  fi
}
# 6. Cleanup suggestions
# Logs five suggested clean-up commands as text. Nothing is executed here.
# Headings are separated by a blank line; passing -e to log made it print the
# literal "-e" and drop the heading, so a plain blank line is written instead.
# Globals read: REPORT_FILE
generate_cleanup_suggestions() {
  log "==========================================================="
  log "6. 磁盘清理建议(新手复制命令执行)"
  log "==========================================================="

  log "1. 压缩30天前的旧日志:"
  log " find /var/log -type f -name '*.log' -mtime +30 -exec gzip {} \;"

  echo "" | tee -a "$REPORT_FILE"
  log "2. 清理7天前的临时文件:"
  log " find /tmp -type f -atime +7 -delete"

  echo "" | tee -a "$REPORT_FILE"
  log "3. 清理yum缓存(CentOS):"
  log " yum clean all"

  echo "" | tee -a "$REPORT_FILE"
  log "4. 清理Docker未使用资源(如有Docker):"
  log " docker system prune -a"

  echo "" | tee -a "$REPORT_FILE"
  log "5. 清理journal日志(系统日志):"
  log " journalctl --vacuum-time=7d"
}
# Main routine: prints the banner, runs every disk check in order, then logs
# the closing banner with the report path.
# Globals read: REPORT_FILE
main() {
  echo "=========================================="
  echo " 磁盘空间深度检查脚本 "
  echo "=========================================="
  echo -e "\n"

  check_disk_usage
  check_inode_usage
  check_log_files
  check_temp_files
  check_disk_io
  generate_cleanup_suggestions

  # Blank separator, then the closing rule. (Passing -e to log made it print
  # the literal "-e" and drop the rule.)
  echo "" | tee -a "$REPORT_FILE"
  log "==========================================================="
  log "检查完成!"
  log "报告保存至: $REPORT_FILE"
  log "==========================================================="
}
# Script entry point: forward all command-line arguments to main.
main "$@"
网络连接检查脚本
检查端口监听、TCP连接状态(比如TIME_WAIT过多)、网络连通性(DNS/网关/外网)。
#!/bin/bash
# ============================ Configuration ============================
# Report destination: one timestamped file per run under /var/log.
REPORT_FILE="/var/log/network_check_$(date '+%Y%m%d_%H%M%S').log"

TIME_WAIT_WARNING=5000       # warn above 5000 TIME_WAIT connections
ESTABLISHED_WARNING=10000    # warn above 10000 active connections
# Ports to check (e.g. MySQL: 3306, Redis: 6379).
CRITICAL_PORTS=(22 80 443 3306 6379 8080)
# =======================================================================

# ANSI colour escape sequences, stored unexpanded (literal backslashes).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'                 # reset colour
# Timestamped logger: prints "[YYYY-mm-dd HH:MM:SS] <message>" to stdout and
# appends the same text to $REPORT_FILE.
# Arguments: [-e] message
#   -e  interpret backslash escapes in the message, as `echo -e` would.
#       Callers in this script invoke `log -e "\n..."`; without this handling
#       the literal "-e" was logged and the real message was dropped.
log() {
  local stamp
  stamp=$(date +"%Y-%m-%d %H:%M:%S")
  if [[ "${1-}" == "-e" ]]; then
    shift
    printf '%b\n' "[${stamp}] ${1-}" | tee -a "$REPORT_FILE"
  else
    printf '%s\n' "[${stamp}] ${1-}" | tee -a "$REPORT_FILE"
  fi
}
# Coloured status loggers. Each prints its tag and message wrapped in the
# matching colour code (backslash escapes in colour and message are expanded)
# to stdout and appends the same bytes to $REPORT_FILE.
# Arguments: $1 - message
log_warning() {
  printf '%b\n' "${YELLOW}[WARNING] $1${NC}" | tee -a "$REPORT_FILE"
}

log_ok() {
  printf '%b\n' "${GREEN}[OK] $1${NC}" | tee -a "$REPORT_FILE"
}
# Critical port check
# Logs all listening sockets, then reports for every port in CRITICAL_PORTS
# whether something is bound to it and, when netstat reveals it, the owning
# process name and PID.
# Globals read: CRITICAL_PORTS, REPORT_FILE
check_listening_ports() {
  local listeners port owner pid proc_name

  log "==========================================================="
  log "3. 关键端口检查"
  log "==========================================================="
  log "当前监听端口:"
  # Query netstat once and reuse the snapshot for every port below.
  listeners=$(netstat -tulnp 2>/dev/null)
  printf '%s\n' "$listeners" | grep LISTEN | tee -a "$REPORT_FILE"

  # Blank separator, then the heading. (Passing -e to log made it print the
  # literal "-e" and drop the heading.)
  echo "" | tee -a "$REPORT_FILE"
  log "关键端口状态:"
  for port in "${CRITICAL_PORTS[@]}"; do
    # Match the port at the end of the local-address column and stop at the
    # first hit, so a port bound on both IPv4 and IPv6 yields one PID/name
    # instead of a multi-line value. The owner is found by its "PID/name"
    # shape because UDP rows have no State column. "-/-" means the socket
    # exists but the owner is not shown.
    owner=$(awk -v suffix=":${port}" '
      {
        start = length($4) - length(suffix) + 1
        if (start < 1 || substr($4, start) != suffix) next
        found = "-/-"
        for (i = 5; i <= NF; i++) {
          if ($i ~ /^[0-9]+\//) { found = $i; break }
        }
        print found
        exit
      }' <<<"$listeners")

    if [[ -n "$owner" ]]; then
      pid=${owner%%/*}
      proc_name=${owner#*/}
      log_ok "端口 $port: 监听中(进程:${proc_name} PID:${pid})"
    else
      log_warning "端口 $port: 未监听(服务可能未启动)"
    fi
  done
}
# Main routine: prints the banner, runs the network checks in order and logs
# the completion line with the report path.
# NOTE(review): check_network_interfaces, check_tcp_connections and
# check_network_connectivity are not defined in the visible part of this
# script - confirm they exist before running.
# Globals read: REPORT_FILE
main() {
  echo "=========================================="
  echo " 网络状态检查脚本 "
  echo "=========================================="

  check_network_interfaces
  check_tcp_connections
  check_listening_ports
  check_network_connectivity

  # Blank separator, then the completion line. (Passing -e to log made it
  # print the literal "-e" and drop the message.)
  echo "" | tee -a "$REPORT_FILE"
  log "检查完成! 报告: $REPORT_FILE"
}
# Script entry point: forward all command-line arguments to main.
main "$@"
1786

被折叠的 条评论
为什么被折叠?



