hive monitor开发经验

本文介绍了一个使用Python实现的HiveServer监控脚本,该脚本能定期检查HiveServer的状态,并在服务不可用时自动重启。此外,还讨论了避免频繁连接导致的问题。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

由于 Hive Server 集群的稳定性非常重要,我们采用了 HAProxy 来保障高可用(HA);同时,对每一台 Hive Server 也需要单独进行监控。

代码如下(Python):

#!/usr/bin/env python
import sys, os, time, atexit, string,socket
from signal import SIGTERM 
from hive_service import ThriftHive
from hive_service.ttypes import HiveServerException
from thrift import Thrift
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol


class Daemon:
    """Watchdog daemon that keeps one HiveServer instance running.

    The daemon detaches from the terminal, records its PID in ``pidfile`` and
    then loops forever: when nothing listens on ``HIVE_PORT`` it launches
    HiveServer; otherwise it waits ``CHECK_INTERVAL_SECONDS``, runs a Thrift
    probe query, and kills and relaunches HiveServer if the probe fails.

    Parameters:
        pidfile: path of the file that stores the daemon's PID.
        stdin, stdout, stderr: paths the daemon's standard streams are
            redirected to after daemonizing (default ``/dev/null``).
    """

    # Endpoint and probe settings of the watched HiveServer.
    HIVE_HOST = '192.168.1.45'
    HIVE_PORT = 10001
    PROBE_SQL = 'show databases'
    # The probe succeeds only if this database name is among the result rows.
    PROBE_DATABASE = 'yclick'
    PROBE_TIMEOUT_MS = 60000

    # Hard-learned lesson: do not connect to HiveServer too often, so the
    # logical probe runs only once per interval.
    CHECK_INTERVAL_SECONDS = 300
    # Time given to a freshly launched HiveServer before the next check, so
    # the loop does not launch a second instance while the first is starting.
    RESTART_GRACE_SECONDS = 30

    # Kills every java process whose command line mentions "hive".
    KILL_HIVE_JAVA_CMD = ("kill -9 `ps -ef | grep java | grep hive "
                          "| grep -v 'grep' | awk '{print $2}'`")

    def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'):
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.pidfile = pidfile

    @staticmethod
    def _shell_output(cmd):
        """Run ``cmd`` through the shell and return its standard output."""
        pipe = os.popen(cmd)
        try:
            return pipe.read()
        finally:
            pipe.close()

    @staticmethod
    def _parse_listeners(netstat_output, port):
        """Extract the TCP sockets listening on ``port`` from netstat output.

        Expects the column layout printed by ``netstat -nlp`` / ``-anp``
        (Proto, Recv-Q, Send-Q, Local Address, Foreign Address, State,
        PID/Program name). Matching is done on the local-address column, so
        PIDs, inodes or foreign ports that merely contain the port digits are
        not counted.

        Returns:
            A list with one ``(pid, program)`` tuple per listening socket.
            ``pid`` is a string of digits; both items are ``None`` when
            netstat did not report the owning process.
        """
        suffix = ':%d' % port
        listeners = []
        for line in netstat_output.splitlines():
            fields = line.split()
            if len(fields) < 6 or not fields[0].startswith('tcp'):
                continue
            if fields[5] != 'LISTEN' or not fields[3].endswith(suffix):
                continue
            pid, program = None, None
            if len(fields) > 6:
                head, _, tail = fields[6].partition('/')
                if head.isdigit():
                    pid, program = head, tail
            listeners.append((pid, program))
        return listeners

    def _listener_count(self):
        """Return how many TCP sockets currently listen on ``HIVE_PORT``."""
        output = self._shell_output('netstat -anp')
        return len(self._parse_listeners(output, self.HIVE_PORT))

    def _read_pidfile(self):
        """Return the stripped pidfile content, or None if it cannot be opened."""
        try:
            handle = open(self.pidfile, 'r')
        except IOError:
            return None
        try:
            return handle.read().strip()
        finally:
            handle.close()

    def hive_alive(self):
        """Probe HiveServer over Thrift to check that it answers queries.

        Connects to ``HIVE_HOST:HIVE_PORT``, executes ``PROBE_SQL`` and looks
        for ``PROBE_DATABASE`` in the fetched rows.

        Returns:
            1 if the probe database is listed, 0 otherwise (including any
            connection, timeout or Thrift error).
        """
        transport = None
        try:
            sock = TSocket.TSocket(self.HIVE_HOST, self.HIVE_PORT)
            sock.setTimeout(self.PROBE_TIMEOUT_MS)
            transport = TTransport.TBufferedTransport(sock)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = ThriftHive.Client(protocol)
            transport.open()
            client.execute(self.PROBE_SQL)
            rows = client.fetchAll()
            if self.PROBE_DATABASE in rows:
                return 1
            return 0
        except Exception as exc:
            # Any failure means "not healthy"; the caller restarts the server.
            sys.stderr.write('Hive probe failed: %r\n' % (exc,))
            return 0
        finally:
            # Always release the connection, on success as well as on error.
            if transport is not None:
                try:
                    transport.close()
                except Exception:
                    # Best-effort cleanup; the probe result is already decided.
                    pass

    def kill_hive(self):
        """Force-kill HiveServer.

        First kills every java process whose command line mentions "hive",
        then kills any java process netstat still reports as listening on
        ``HIVE_PORT``.
        """
        os.system(self.KILL_HIVE_JAVA_CMD)
        output = self._shell_output('netstat -lnp')
        pids = []
        for pid, program in self._parse_listeners(output, self.HIVE_PORT):
            if pid is not None and 'java' in program and pid not in pids:
                pids.append(pid)
        for pid in pids:
            cmd = 'kill -9 ' + pid
            sys.stdout.write('kill cmd %s\n' % cmd)
            os.system(cmd)

    def run_hive(self):
        """Launch HiveServer in the background on ``HIVE_PORT``."""
        os.system('export HADOOP_HOME=/xxxx/hadoop-1.0.x; '
                  'export JAVA_HOME=/usr/java/jdk1.6.0_21; '
                  '/opt/modules/hive/hive-0.10.0/bin/hive '
                  '--service hiveserver %d &' % self.HIVE_PORT)
        time.sleep(2)

    def _daemonize(self):
        """Detach from the terminal with a double fork and write the pidfile.

        The parent of each fork exits with status 0; a failed fork exits with
        status 1. The surviving process redirects its standard streams to the
        configured paths and registers ``delpid`` to run at interpreter exit.
        """
        try:
            if os.fork() > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' % (e.errno, e.strerror))
            sys.exit(1)

        # Leave the starting directory and become a session leader.
        os.chdir("/")
        os.setsid()
        os.umask(0)

        try:
            if os.fork() > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' % (e.errno, e.strerror))
            sys.exit(1)

        sys.stdout.flush()
        sys.stderr.flush()
        # Only the descriptors of these files are used (duplicated onto 0/1/2).
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        se = open(self.stderr, 'a+')
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())

        atexit.register(self.delpid)
        with open(self.pidfile, 'w+') as handle:
            handle.write('%s\n' % os.getpid())

    def delpid(self):
        """Remove the pidfile if it still exists."""
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def start(self):
        """Daemonize and enter the monitoring loop.

        If a pidfile already exists and something is listening on
        ``HIVE_PORT``, reports that and exits with status 1 instead.
        """
        alive = False
        if self._read_pidfile() is not None:
            sys.stderr.write('cmd %s\n' % 'netstat -anp')
            count = self._listener_count()
            sys.stderr.write('response %s\n\n' % count)
            # One server usually shows up as several listeners (IPv4/IPv6),
            # so any non-zero count means the port is taken.
            if count >= 1:
                alive = True
                sys.stderr.write('Hive process is alive !!!\n')

        if alive:
            message = 'pidfile (%s) already exist. Daemon already running?\n\n'
            sys.stderr.write(message % self.pidfile)
            sys.exit(1)

        sys.stderr.write('starting hive server')
        self._daemonize()
        self._run()

    def stop(self):
        """Terminate the daemon recorded in the pidfile and kill HiveServer.

        Sends SIGTERM to the recorded PID repeatedly, issuing the HiveServer
        kill command after each signal, until the PID no longer exists; then
        removes the pidfile. Returns without doing anything when the pidfile
        is missing or does not hold a positive integer. Exits with status 1
        if signalling fails for any other reason.
        """
        import errno

        raw = self._read_pidfile()
        try:
            pid = int(raw) if raw else None
        except ValueError:
            pid = None

        # PIDs <= 0 address whole process groups in kill(2); never signal them.
        if pid is None or pid <= 0:
            message = 'pidfile %s does not exist. Daemon not running?\n'
            sys.stderr.write(message % self.pidfile)
            return

        try:
            while True:
                os.kill(pid, SIGTERM)
                time.sleep(0.1)
                os.system(self.KILL_HIVE_JAVA_CMD)
        except OSError as err:
            if err.errno == errno.ESRCH:
                # The daemon is gone; clean up its pidfile.
                self.delpid()
            else:
                sys.stderr.write('%s\n' % err)
                sys.exit(1)

    def restart(self):
        """Stop the running daemon (if any) and start a new one."""
        self.stop()
        self.start()

    def monitor(self):
        """Run the monitoring loop in the foreground, without daemonizing."""
        self._run()

    def _run(self):
        """Monitoring loop; never returns.

        Each pass logs the recorded PID and the listener count, then either
        launches HiveServer (nothing listening) or, after the check interval,
        probes it and restarts it when the probe fails.
        """
        while True:
            sys.stderr.write('pid %s\n\n' % self._read_pidfile())
            count = self._listener_count()
            sys.stderr.write('response %s\n\n' % count)
            if count == 0:
                sys.stderr.write('Hive process is dead!!!\n')
                self.run_hive()
                time.sleep(self.RESTART_GRACE_SECONDS)
            else:
                # Hard-learned lesson: do not connect too frequently.
                time.sleep(self.CHECK_INTERVAL_SECONDS)
                if self.hive_alive() == 0:
                    self.kill_hive()
                    sys.stderr.write('Hive process killed,restart!!!\n')
                    self.run_hive()
                    time.sleep(self.RESTART_GRACE_SECONDS)
if __name__ == '__main__':
    # Command-line entry point: dispatch the single argument to the daemon.
    # Exit status: 0 after the command ran, 2 on a usage error.
    daemon = Daemon('/xxxx/hive.pid')
    commands = {
        'start': daemon.start,
        'stop': daemon.stop,
        'restart': daemon.restart,
        'monitor': daemon.monitor,
    }
    if len(sys.argv) != 2:
        # List every supported command, including "monitor".
        print('usage: %s start|stop|restart|monitor' % sys.argv[0])
        sys.exit(2)
    handler = commands.get(sys.argv[1])
    if handler is None:
        print('Unknown command')
        sys.exit(2)
    handler()
    sys.exit(0)

惨痛教训是:

不要频繁连接 Hive Server。每次连接都会在 /tmp/ 下生成临时文件,连接过于频繁很容易导致 Hive 无法再创建文件而查询失败,进而使监控探测失败、触发重启,陷入恶性循环。

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值