shell脚本:sparkstreaming local模式下的任务调度和error信息邮件预警

本文介绍了一个用于监控SparkStreaming任务运行状态和日志的脚本,该脚本能够检测任务进程是否存在,若不存在则自动启动,并定时检查日志以发现错误信息,通过邮件通知管理员。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

实现功能:1.监控sparkstreaming任务进程是否存在,不存在调起任务

                  2.监控日志文件,监控脚本通过Crontab一分钟调度一次,每次执行获取最近 DATE_INTERVAL(脚本中默认为3)分钟内的日志信息,匹配ERROR,发邮件

config.txt文件(里面有任务名称,任务执行脚本,任务日志路径)

###############################################################################
# Function : 实时任务监控任务配置文件
# Author : ***
# Created Time : 2019-05-09 14:00:00
#
# Format : task_name script_name log_file
# Comment1 : 
###############################################################################
#
***.jar ***.sh ***.log

任务调度脚本(监控任务是否存在,不存在则调用任务执行脚本,并监测ERROR信息).sh

###############################################################################
# Function : 实时任务监控(服务与日志)
# Author : ***
# Mail : ***
# Created Time : 2019-05-09 14:00:00
#
# Params : 
###############################################################################
#! /bin/bash

source /etc/profile

current_dir=$(cd $(dirname $0); pwd)
cd ${current_dir}

readonly SOURCE_FILE="${current_dir}/config.txt"
readonly MAIL_RECEIVER="***"

# 防止按小时或者按天切割的日志错过获取机会
TIME_STAMP=$(date -d "-5 sec" +%s)

# 实时任务的日志路径
LOG_DIR="/data/realtime_compute_streaming/logs"
if [ ! -d ${LOG_DIR} ]; then
    mkdir -p ${LOG_DIR}
fi

# 截取日志的关键字
 ERROR="ERROR|EXCEPTION"

# 截取后的日志存放在这
 ERR_LOG="${LOG_DIR}/spark_streaming_error_${TIME_STAMP}.log"
 TMP_LOG="${LOG_DIR}/spark_streaming_tmp_${TIME_STAMP}.log"

# 截取的时间范围数字  单位分钟
 DATE_INTERVAL=3

# 获取当前时间变量
CUR_TIME=$(date "+%Y-%m-%d %H:%M:%S")
NEW_TIME=$(date "+%y\/%m\/%d\ %H\:%M")
LAST_TIME=$(date -d "-${DATE_INTERVAL} min" "+%Y-%m-%d %H:%M:%S")
OLD_TIME=$(date -d "-${DATE_INTERVAL} min" "+%y\/%m\/%d\ %H\:%M")

# Read config.txt (task_name script_name log_file per line), restart any task
# whose process is missing, and run log_monitor for tasks that are up.
# Globals read: SOURCE_FILE, LOG_DIR, MAIL_RECEIVER
function service_monitor()
{
    echo "--> start realtime_monitor"
    # `|| [ -n "$line" ]` also processes a final line lacking a newline,
    # which the original loop silently skipped.
    while read -r line || [ -n "$line" ]
    do
        # Skip comment lines and blank lines in the config file.
        if [[ ${line:0:1} == "#" || -z ${line} ]]; then
            continue
        fi

        # Parse the three whitespace-separated config fields.
        var_task=$(echo "${line}" | awk '{print $1}')
        var_runsh=$(echo "${line}" | awk '{print $2}')
        var_log=$(echo "${line}" | awk '{print $3}')

        echo "${var_runsh}"

        # Count matching processes; exactly one is expected when the job is
        # up (grep -v grep drops our own grep from the ps output).
        is_task_exist=$(ps aux | grep "${var_task}" | grep -v grep | wc -l)
        echo "is_task_exist: ${is_task_exist}"
        if [ "${is_task_exist}" -eq 0 ]; then
            echo "--> Process ${var_task} is down"
            echo "--> Bring ${var_task} up"

            strDate=$(date +"%Y-%m-%d-%H-%M-%S")
            strStart="${var_task} failed, start at ${strDate}"

            # Make sure both the service log and the task log exist.
            if [[ ! -f "${LOG_DIR}/spark_streaming_service_log.txt" || ! -f "${LOG_DIR}/${var_log}" ]]; then
                touch "${LOG_DIR}/spark_streaming_service_log.txt"
                touch "${LOG_DIR}/${var_log}"
            fi

            echo "-!-> ${strStart}" >> "${LOG_DIR}/${var_log}"

            # BUG FIX: the original redirected with `>`, truncating the task
            # log and wiping the "-!->" restart marker appended just above;
            # append instead so the restart history survives.
            nohup sh "/data/realtime_compute_streaming/submit_job/job_cmd/${var_runsh}" >> "${LOG_DIR}/${var_log}" 2>&1 &

            # Give the submit script a moment to spawn the real process.
            sleep 5

            job_schedu_status=$(ps aux | grep "${var_task}" | grep -v grep | wc -l)
            echo "${job_schedu_status}"

            # Report (by mail, either way) whether the restart took effect.
            if [[ ${job_schedu_status} -eq 1 ]]; then
                echo "Bring task success!"
                echo "Bring ${var_task} at ${strDate} success!!!" > "${LOG_DIR}/spark_streaming_service_log.txt"
                python ../tools/sendmail_alarm.py "--> ${var_task} failed, Bring success" "${LOG_DIR}/spark_streaming_service_log.txt" "${MAIL_RECEIVER}"
            else
                echo "Bring task failed"
                echo "Bring ${var_task} at ${strDate} failed,please remind related worker schedule the task " > "${LOG_DIR}/spark_streaming_service_log.txt"
                python ../tools/sendmail_alarm.py "--> ${var_task} failed,can't bring up" "${LOG_DIR}/spark_streaming_service_log.txt" "${MAIL_RECEIVER}"
            fi

            echo "--> Bring ${var_task} finished"
        else
            strDate=$(date +"%Y-%m-%d %H:%M:%S")
            strRun="${var_task} running ${strDate}"
            echo "--> ${strRun}"
        fi

        # Scan the recent log window only when the task was already up
        # before this pass (a freshly restarted task has no window yet).
        if [[ ${is_task_exist} -eq 1 ]]; then
            echo "--> whiteBoard task, run log_monitor"
            log_monitor "${var_log}" "${var_task}"
        fi

    done < "${SOURCE_FILE}"
}

# Scan one task's log for ERROR/EXCEPTION lines inside the recent time
# window and mail an alert when something is found — or when the task
# produced no log output at all in that window.
# Arguments: $1 - log file name (relative to LOG_DIR)
#            $2 - task name (used in messages)
# Globals read: LOG_DIR, TMP_LOG, ERR_LOG, ERROR, OLD_TIME, NEW_TIME,
#               CUR_TIME, LAST_TIME, MAIL_RECEIVER
function log_monitor()
{
    local log_file=$1
    local task_name=$2

    if [ -f "${LOG_DIR}/${log_file}" ]
    then
        echo "${LOG_DIR}/${log_file}"
        # BUG FIX: the original debug echo had unescaped inner quotes, so the
        # printed command did not match the one actually executed below.
        echo "sed -n \"/${OLD_TIME}:[0-9][0-9]/,/${NEW_TIME}:[0-9][0-9]/p\" ${LOG_DIR}/${log_file} > ${TMP_LOG}"
        # Extract the lines between OLD_TIME and NEW_TIME (minute-level
        # timestamps; [0-9][0-9] matches the seconds field).
        sed -n "/${OLD_TIME}:[0-9][0-9]/,/${NEW_TIME}:[0-9][0-9]/p" "${LOG_DIR}/${log_file}" > "${TMP_LOG}"
        if [ ! -s "${TMP_LOG}" ]; then
            # Nothing logged in the whole window — the job may be hung.
            echo "--> error: ${task_name} does not generate log from ${LAST_TIME} to ${CUR_TIME}"
            python ../tools/sendmail_alarm.py "--> error: ${task_name} does not generate log near ${CUR_TIME}" "${TMP_LOG}" "${MAIL_RECEIVER}"
        else
            # Keep 15 lines of context around every match for the mail body.
            grep -i -E -A15 -B15 "${ERROR}" "${TMP_LOG}" > "${ERR_LOG}"
            if [ -s "${ERR_LOG}" ]; then
                echo "--> error: ${task_name} occurs an error/exception near ${CUR_TIME}"
                python ../tools/sendmail_alarm.py "--> error: ${task_name} occurs an error/exception" "${ERR_LOG}" "${MAIL_RECEIVER}"
            else
                echo "--> success: ${task_name} is running perfect!"
            fi
        fi
        rm -f "${TMP_LOG}"
        rm -f "${ERR_LOG}"
    else
        echo "--> there is no log file"
        echo "--> there is no log file" > "${TMP_LOG}"
        python ../tools/sendmail_alarm.py "--> error: there is no log file++" "${TMP_LOG}" "${MAIL_RECEIVER}"
        # BUG FIX: the original leaked TMP_LOG in this branch.
        rm -f "${TMP_LOG}"
    fi
}

# Entry point: run one monitoring pass over every task listed in config.txt
# (intended to be invoked once a minute from crontab).
service_monitor

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值