实现功能:1.监控sparkstreaming任务进程是否存在,不存在调起任务
2.监控日志文件:监控脚本通过Crontab每分钟调度一次,每次执行截取最近DATE_INTERVAL分钟(脚本中当前设为3分钟)内的日志信息,匹配ERROR或EXCEPTION关键字,命中则发邮件告警
config.txt文件(里面有任务名称,任务执行脚本,任务日志路径)
###############################################################################
# Function : 实时任务监控任务配置文件
# Author : ***
# Created Time : 2019-05-09 14:00:00
#
# Format : task_name script_name log_file
# Comment1 :
###############################################################################
#
***.jar ***.sh ***.log
任务调度脚本(监控任务是否存在,不存在则调起任务执行脚本,并监测ERROR信息).sh
###############################################################################
# Function : 实时任务监控(服务与日志)
# Author : ***
# Mail : ***
# Created Time : 2019-05-09 14:00:00
#
# Params :
###############################################################################
#! /bin/bash
source /etc/profile
current_dir=$(cd $(dirname $0); pwd)
cd ${current_dir}
readonly SOURCE_FILE="${current_dir}/config.txt"
readonly MAIL_RECEIVER="***"
# 防止按小时或者按天切割的日志错过获取机会
TIME_STAMP=$(date -d "-5 sec" +%s)
# 实时任务的日志路径
LOG_DIR="/data/realtime_compute_streaming/logs"
if [ ! -d ${LOG_DIR} ]; then
mkdir -p ${LOG_DIR}
fi
# 截取日志的关键字
ERROR="ERROR|EXCEPTION"
# 截取后的日志存放在这
ERR_LOG="${LOG_DIR}/spark_streaming_error_${TIME_STAMP}.log"
TMP_LOG="${LOG_DIR}/spark_streaming_tmp_${TIME_STAMP}.log"
# 截取的时间范围数字 单位分钟
DATE_INTERVAL=3
# 获取当前时间变量
CUR_TIME=$(date "+%Y-%m-%d %H:%M:%S")
NEW_TIME=$(date "+%y\/%m\/%d\ %H\:%M")
LAST_TIME=$(date -d "-${DATE_INTERVAL} min" "+%Y-%m-%d %H:%M:%S")
OLD_TIME=$(date -d "-${DATE_INTERVAL} min" "+%y\/%m\/%d\ %H\:%M")
#######################################
# Count the `ps aux` lines that contain the given text.
# Arguments: $1 - text to look for (matched literally, not as a regex)
# Outputs:   the number of matching lines, excluding lines that contain "grep"
#######################################
function _count_task_processes()
{
  ps aux | grep -F -- "$1" | grep -vc grep
}

#######################################
# Check every task listed in the config file: restart it when no process is
# found, and run log_monitor on its log when exactly one process is found.
# Globals:
#   SOURCE_FILE   (read) config file, one "task_name script_name log_file"
#                 entry per line; '#' lines and blank lines are ignored
#   LOG_DIR       (read) directory of the task logs and the restart report
#   MAIL_RECEIVER (read) recipient handed to ../tools/sendmail_alarm.py
# Arguments: none
# Outputs:   progress messages on stdout, warnings on stderr
#######################################
function service_monitor()
{
  local line trimmed var_task var_runsh var_log
  local is_task_exist job_schedu_status strDate strStart strRun
  local service_log="${LOG_DIR}/spark_streaming_service_log.txt"

  echo "--> start realtime_monitor"

  # The config is read on fd 3 so that commands started inside the loop cannot
  # consume it through stdin; `|| [[ -n ... ]]` keeps a last line that has no
  # trailing newline.
  while IFS= read -r -u 3 line || [[ -n "${line}" ]]; do
    # Skip blank lines and comments (leading whitespace is tolerated).
    trimmed="${line#"${line%%[![:space:]]*}"}"
    if [[ -z "${trimmed}" || "${trimmed}" == \#* ]]; then
      continue
    fi

    # Split the entry into its three whitespace-separated fields.
    var_task="" var_runsh="" var_log=""
    read -r var_task var_runsh var_log _ <<< "${trimmed}"
    if [[ -z "${var_task}" || -z "${var_runsh}" || -z "${var_log}" ]]; then
      echo "--> skip malformed config line: ${line}" >&2
      continue
    fi
    echo "${var_runsh}"

    # Number of processes whose command line mentions the task name.
    is_task_exist=$(_count_task_processes "${var_task}")
    echo "is_task_exist: ${is_task_exist}"

    if (( is_task_exist == 0 )); then
      echo "--> Process ${var_task} is down"
      echo "--> Bring ${var_task} up"
      strDate=$(date +"%Y-%m-%d-%H-%M-%S")
      strStart="${var_task} failed, start at ${strDate}"

      # Append, never truncate: the restart marker and the output of the
      # crashed run must survive the relaunch.
      echo "-!-> ${strStart}" >> "${LOG_DIR}/${var_log}"
      nohup sh "/data/realtime_compute_streaming/submit_job/job_cmd/${var_runsh}" \
        >> "${LOG_DIR}/${var_log}" 2>&1 3<&- &
      # Give the job a moment to appear in the process table.
      sleep 5

      job_schedu_status=$(_count_task_processes "${var_task}")
      echo "${job_schedu_status}"

      # The restart counts as successful only when exactly one process exists.
      if (( job_schedu_status == 1 )); then
        echo "Bring task success!"
        echo "Bring ${var_task} at ${strDate} success!!!" > "${service_log}"
        python ../tools/sendmail_alarm.py \
          "--> ${var_task} failed, Bring success" \
          "${service_log}" "${MAIL_RECEIVER}"
      else
        echo "Bring task failed"
        echo "Bring ${var_task} at ${strDate} failed,please remind related worker schedule the task " > "${service_log}"
        python ../tools/sendmail_alarm.py \
          "--> ${var_task} failed,can't bring up" \
          "${service_log}" "${MAIL_RECEIVER}"
      fi
      echo "--> Bring ${var_task} finished"
    else
      strDate=$(date +"%Y-%m-%d %H:%M:%S")
      strRun="${var_task} running ${strDate}"
      echo "--> ${strRun}"
    fi

    # Scan the log only for a task that was already running when checked.
    # NOTE(review): a count above 1 triggers neither a restart nor a log scan;
    # confirm that "exactly one process" is the intended contract.
    if (( is_task_exist == 1 )); then
      echo "--> whiteBoard task, run log_monitor"
      log_monitor "${var_log}" "${var_task}"
    fi
  done 3< "${SOURCE_FILE}"
}
#######################################
# Whiteboard-task log monitoring: extract the most recent window of a task's
# log and mail an alarm when the window is empty or contains error keywords.
# Globals:
#   LOG_DIR            (read) directory containing the task log
#   OLD_TIME, NEW_TIME (read) window bounds, "yy/MM/dd HH:mm" with '/', ' '
#                      and ':' backslash-escaped
#   LAST_TIME, CUR_TIME (read) the same bounds in readable form, for messages
#   ERROR              (read) extended regex of error keywords
#   TMP_LOG, ERR_LOG   (read) scratch file paths; both are removed on return
#   MAIL_RECEIVER      (read) recipient handed to ../tools/sendmail_alarm.py
# Arguments:
#   $1 - log file name, relative to LOG_DIR
#   $2 - task name, used in messages only
# Outputs:   status messages on stdout
#######################################
function log_monitor()
{
  local log_file=$1
  local task_name=$2
  local log_path="${LOG_DIR}/${log_file}"
  # Plain "yy/MM/dd HH:mm" bounds: drop the backslash escapes.
  local window_start="${OLD_TIME//\\/}"
  local window_end="${NEW_TIME//\\/}"

  if [[ ! -f "${log_path}" ]]; then
    echo "--> there is no log file"
    echo "--> there is no log file" > "${TMP_LOG}"
    python ../tools/sendmail_alarm.py "--> error: there is no log file++" \
      "${TMP_LOG}" "${MAIL_RECEIVER}"
    # Remove the scratch file here too, otherwise one is left behind per run.
    rm -f -- "${TMP_LOG}"
    return
  fi

  echo "${log_path}"
  echo "extract lines stamped ${window_start} .. ${window_end} from ${log_path} > ${TMP_LOG}"

  # Keep every line whose "yy/MM/dd HH:mm:ss" stamp falls inside the window
  # (compared as strings, to the minute). Lines without a stamp, such as stack
  # traces, follow the decision made for the last stamped line. Unlike a
  # start/end pattern range, this still works when no line carries the exact
  # starting minute.
  awk -v from="${window_start}" -v to="${window_end}" '
    match($0, /[0-9][0-9]\/[0-9][0-9]\/[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]/) {
      stamp = substr($0, RSTART, 14)
      keep = (stamp >= from && stamp <= to)
    }
    keep
  ' "${log_path}" > "${TMP_LOG}"

  if [[ ! -s "${TMP_LOG}" ]]; then
    echo "--> error: ${task_name} does not generate log from ${LAST_TIME} to ${CUR_TIME}"
    python ../tools/sendmail_alarm.py \
      "--> error: ${task_name} does not generate log near ${CUR_TIME}" \
      "${TMP_LOG}" "${MAIL_RECEIVER}"
  else
    # Matched lines plus 15 lines of context on each side.
    grep -i -E -A15 -B15 -- "${ERROR}" "${TMP_LOG}" > "${ERR_LOG}"
    if [[ -s "${ERR_LOG}" ]]; then
      echo "--> error: ${task_name} occurs an error/exception near ${CUR_TIME}"
      python ../tools/sendmail_alarm.py \
        "--> error: ${task_name} occurs an error/exception" \
        "${ERR_LOG}" "${MAIL_RECEIVER}"
    else
      echo "--> success: ${task_name} is running perfect!"
    fi
  fi

  rm -f -- "${TMP_LOG}" "${ERR_LOG}"
}
# Entry point: run one monitoring pass over every task listed in SOURCE_FILE.
service_monitor