Nimbus和Supervisor都是快速失败,无状态的进程,Nimbus的单点问题一直没有很好的解决办法,所以我们可以对相关进程进行监控,在其挂掉时尝试重启。
在之前的项目里,比较常用的方式是通过monit对相关进程进行监控,通过monit监控需要对每台机器进行配置,可以通过Fabric(http://www.fabfile.org)进行统一配置,这里不详细介绍monit监控的方式。
也可以通过Shell脚本在Nimbus节点上监控整个集群,前提是Nimbus节点能够通过SSH免密码登录各Supervisor节点。具体内容如下:
main.sh
#!/bin/bash
# main.sh - watchdog loop meant to run on the Storm Nimbus node.
#
# Every 20 seconds it:
#   1. starts `storm nimbus` if `jps -l` lists no line containing "nimbus";
#   2. starts `storm ui` if `jps -l` lists no "backtype.storm.ui.core" line;
#   3. runs storm_manager.sh (located next to this script) with "start".
#
# nimbus.log and ui.log are written to the current working directory.

# Directory containing this script; used to locate storm_manager.sh.
dir=$(dirname "$0")

while true; do
  echo "========== $(date) ==============="

  nimbus_pid=$(jps -l | grep 'nimbus' | awk '{print $1}')
  if [ -z "$nimbus_pid" ]; then
    echo 'storm nimbus is dead!'
    echo 'trying to start nimbus...'
    nohup storm nimbus >nimbus.log &
    echo 'finish starting!'
  else
    echo "storm nimbus id: $nimbus_pid"
  fi

  ui_pid=$(jps -l | grep 'backtype.storm.ui.core' | awk '{print $1}')
  # Test the UI pid (not the nimbus pid) so that a dead UI is restarted
  # even while nimbus itself is still running.
  if [ -z "$ui_pid" ]; then
    echo 'storm ui process is dead!'
    echo 'trying to start storm ui'
    nohup storm ui >ui.log &
    echo 'finish starting storm ui!'
  else
    echo "storm ui id: $ui_pid"
  fi

  sh "$dir/storm_manager.sh" start
  echo "sleeping 20s..."
  sleep 20
done
storm_manager.sh
#!/bin/bash
# storm_manager.sh - manage the Storm supervisor processes on the slave
# nodes over SSH.
#
# Usage: storm_manager.sh {start|stop|sync|check}
#   Any other (or no) argument runs `ls` on every node as a connectivity test.

# Space-separated host names of the supervisor nodes. The functions below
# iterate over it with word-splitting, so names must not contain whitespace.
slaves="cdn36 cdn37 cdn39 cdn21 cdn22 cdn23"

# Storm directory on the supervisor nodes; start_supervisor removes its
# "supervisor" subdirectory before relaunching a supervisor.
storm_dir='/data/tmp/storm'
# Report, for every node in $slaves, whether a supervisor process is running.
# Globals:  slaves (read)
# Outputs:  whatever the remote script prints (relayed by ssh)
#
# The here-document is unquoted on purpose: $node is filled in locally,
# while every \$ is passed through and evaluated by the remote shell.
check_supervisors(){
  for node in $slaves; do
    ssh "$node" <<REMOTE
source /etc/profile
source ~/.bash_profile
echo "=== check supervisor on $node..."
pid=\$(jps | grep supervisor | awk '{print \$1}')
if [ -n "\$pid" ]; then
  echo "supervisor process id: \$pid"
else
  echo "supervisor is dead!"
fi
echo "finishing checking $node's supervisor"
echo
REMOTE
  done
}
# Kill the supervisor process(es) on every node in $slaves.
# Globals:  slaves (read)
# Outputs:  progress messages printed by the remote script (relayed by ssh)
#
# The here-document is unquoted on purpose: $node is filled in locally,
# while every \$ is passed through and evaluated by the remote shell.
stop_supervisor(){
  for node in $slaves; do
    ssh "$node" <<REMOTE
source /etc/profile
source ~/.bash_profile
echo "=== killing supervisor on $node..."
pids=\$(jps | grep 'supervisor' | awk '{print \$1}')
# Only call kill when something was found; an empty pid list would make
# kill fail with a usage error.
if [ -n "\$pids" ]; then
  # Unquoted on purpose: several pids are passed as separate arguments.
  kill \$pids
else
  echo "no supervisor process found"
fi
echo "finishing killing $node's supervisor"
REMOTE
  done
}
# Start a supervisor on every node in $slaves that does not already run one.
# Globals:  slaves (read), storm_dir (read)
# Outputs:  progress messages printed by the remote script (relayed by ssh),
#           followed by a blank line per node
# Returns:  1 without contacting any node when storm_dir is empty
#
# The here-document is unquoted on purpose: $node and $storm_dir are filled
# in locally, while every \$ is passed through to the remote shell.
start_supervisor(){
  # $storm_dir is expanded locally into an "rm -fr" command line; an empty
  # value would turn it into "rm -fr /supervisor" on the remote host.
  if [ -z "$storm_dir" ]; then
    echo "start_supervisor: storm_dir is not set" >&2
    return 1
  fi

  for node in $slaves; do
    ssh "$node" <<REMOTE
source /etc/profile
source ~/.bash_profile
sid=\$(jps | grep supervisor | awk '{print \$1}')
echo "=== starting supervisor on $node..."
if [ -z "\$sid" ]; then
  echo "supervisor is dead!"
  mkdir -p ~/rzx
  rm -fr "$storm_dir/supervisor"
  cd ~/rzx || exit 1
  # Detach stdin/stdout/stderr from the ssh session; a background process
  # that keeps any of them open can prevent ssh from returning.
  nohup storm supervisor >supervisor.log 2>&1 </dev/null &
  echo "finishing starting $node's supervisor"
else
  echo "supervisor process id: \$sid"
fi
REMOTE
    echo
  done
}
# Synchronise the configuration file: copy the local storm.yaml into the
# same conf directory on every node in $slaves (as root, via scp).
# Globals:  slaves (read); conf_dir, sync_failed (written - the script is
#           run with plain sh, so no `local` is used)
# Outputs:  "finishing sync" per node on success; an error line on stderr
#           for each node whose copy failed
# Returns:  0 if every copy succeeded, 1 if at least one failed
sync_config(){
  conf_dir=/opt/package/apache-storm-0.9.2-incubating/conf
  sync_failed=0
  for node in $slaves; do
    # Keep going after a failure so the remaining nodes are still updated.
    if scp "$conf_dir/storm.yaml" "root@$node:$conf_dir/"; then
      echo "finishing sync $node config!"
    else
      echo "failed to sync $node config!" >&2
      sync_failed=1
    fi
  done
  return "$sync_failed"
}
# Connectivity smoke test: feed the single command "ls" to a shell on every
# node in $slaves via ssh's standard input.
# Globals:  slaves (read)
mytest(){
  for node in $slaves; do
    printf 'ls\n' | ssh "$node"
  done
}
# Dispatch on the first command-line argument. Anything unrecognised,
# including no argument at all, falls through to the mytest check.
case "$1" in
  stop)  stop_supervisor ;;
  start) start_supervisor ;;
  sync)  sync_config ;;
  check) check_supervisors ;;
  *)     mytest ;;
esac