由于 Hive Server 集群的稳定性非常重要,我们采用 HAProxy 来保障高可用(HA),同时也需要对单台 Hive Server 进行监控。
监控脚本代码如下(Python):
#!/usr/bin/env python
import sys, os, time, atexit, string,socket
from signal import SIGTERM
from hive_service import ThriftHive
from hive_service.ttypes import HiveServerException
from thrift import Thrift
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol
class Daemon(object):
    """Watchdog daemon that keeps a single Hive server instance running.

    The daemon periodically checks whether something is listening on the
    Hive port and whether the server still answers a probe query over
    Thrift.  When either check fails the Hive server is (re)started.

    Public commands: ``start``, ``stop``, ``restart`` and ``monitor``.

    The code sticks to syntax that is valid on both Python 2.6+ and
    Python 3.
    """

    # Address of the monitored Hive server.
    HIVE_HOST = '192.168.1.45'
    HIVE_PORT = 10001

    # Probe statement and the database name that must appear in its result
    # for the server to be considered healthy.
    PROBE_SQL = "show databases"
    PROBE_DATABASE = "yclick"
    # Value handed to TSocket.setTimeout (milliseconds in the Thrift API).
    PROBE_TIMEOUT_MS = 60000

    # Shell command that force-kills every java process mentioning "hive".
    KILL_HIVE_CMD = ("kill -9 `ps -ef | grep java | grep hive | "
                     "grep -v 'grep' | awk '{print $2}'`")
    # Shell command that launches the Hive server in the background.
    START_HIVE_CMD = ('export HADOOP_HOME=/xxxx/hadoop-1.0.x; '
                      'export JAVA_HOME=/usr/java/jdk1.6.0_21; '
                      '/opt/modules/hive/hive-0.10.0/bin/hive '
                      '--service hiveserver %d &')

    # Seconds to wait before probing a server whose port is listening.
    # Lesson learned the hard way: do not connect too often -- every
    # connection makes Hive create temporary files under /tmp.
    PROBE_DELAY = 300
    # Seconds to pause at the end of every monitoring pass.
    LOOP_PAUSE = 30

    def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'):
        """Remember the pidfile path and the files used for the std streams.

        pidfile -- path of the file holding the daemon's PID.
        stdin, stdout, stderr -- paths the standard streams are redirected
        to once the process has been daemonized.
        """
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.pidfile = pidfile

    # ------------------------------------------------------------------
    # Hive health checks and process control
    # ------------------------------------------------------------------
    def hive_alive(self):
        """Return 1 when the Hive server answers the probe query, else 0.

        Connects over Thrift, runs ``PROBE_SQL`` and checks that
        ``PROBE_DATABASE`` is among the returned rows.  Any failure
        (connection, Thrift or Hive error) is reported on stderr and
        counts as "not alive".  The transport is always closed.
        """
        transport = None
        try:
            sock = TSocket.TSocket(self.HIVE_HOST, self.HIVE_PORT)
            sock.setTimeout(self.PROBE_TIMEOUT_MS)
            transport = TTransport.TBufferedTransport(sock)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = ThriftHive.Client(protocol)
            transport.open()
            client.execute(self.PROBE_SQL)
            rows = client.fetchAll()
            if self.PROBE_DATABASE in rows:
                return 1
            return 0
        except Exception as exc:
            # Covers Thrift.TException and socket.error without swallowing
            # SystemExit / KeyboardInterrupt like a bare ``except`` would.
            sys.stderr.write('hive_alive check failed: %r\n' % (exc,))
            return 0
        finally:
            if transport is not None:
                try:
                    transport.close()
                except Exception:
                    # Best effort: the verdict has already been decided.
                    pass

    def kill_hive(self):
        """Force-kill the Hive server processes.

        First kills every java process whose command line mentions hive,
        then kills whatever java process netstat still reports on the
        Hive port.
        """
        os.system(self.KILL_HIVE_CMD)
        pipe = os.popen(
            "netstat -lnp | grep %d | grep -o .......java" % self.HIVE_PORT, "r")
        try:
            output = pipe.read()
        finally:
            pipe.close()
        if "/java" in output:
            # netstat prints "PID/java"; keep only the PID token and make
            # sure it really is a number before handing it to the shell.
            fields = output.split("/java")[0].split()
            pid = fields[-1] if fields else ''
            if pid.isdigit():
                cmd = "kill -9 " + pid
                print("kill cmd %s" % cmd)
                os.system(cmd)

    def run_hive(self):
        """Launch the Hive server in the background and wait two seconds."""
        os.system(self.START_HIVE_CMD % self.HIVE_PORT)
        time.sleep(2)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _listen_check_cmd(self):
        """Return the shell pipeline counting listeners on the Hive port."""
        return 'netstat -anp | grep %d | grep LISTEN | wc -l' % self.HIVE_PORT

    @staticmethod
    def _parse_count(raw):
        """Convert ``wc -l`` output to an int; None when it is not a number."""
        try:
            return int(raw.strip())
        except ValueError:
            return None

    def _count_listeners(self):
        """Run the listener check, log its raw output, return the count."""
        pipe = os.popen(self._listen_check_cmd())
        try:
            raw = pipe.read().strip()
        finally:
            pipe.close()
        sys.stderr.write('response %s\n\n' % raw)
        return self._parse_count(raw)

    def _read_pid(self):
        """Return the PID stored in the pidfile.

        Returns None when the file is missing, unreadable, or does not
        contain a positive integer (a non-positive value must never reach
        os.kill, where it would address a whole process group).
        """
        try:
            with open(self.pidfile, 'r') as handle:
                text = handle.read().strip()
        except IOError:
            return None
        try:
            pid = int(text)
        except ValueError:
            return None
        if pid <= 0:
            return None
        return pid

    def _daemonize(self):
        """Detach from the terminal using the classic double fork.

        After the second fork the standard streams are redirected to the
        configured files, the pidfile is written and its removal is
        registered for normal interpreter exit.
        """
        try:
            pid = os.fork()
            if pid > 0:
                # First parent exits.
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' % (e.errno, e.strerror))
            sys.exit(1)

        # Decouple from the parent environment.
        os.chdir("/")
        os.setsid()
        os.umask(0)

        try:
            pid = os.fork()
            if pid > 0:
                # Second parent exits.
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' % (e.errno, e.strerror))
            sys.exit(1)

        # Redirect the standard file descriptors.
        sys.stdout.flush()
        sys.stderr.flush()
        append_flags = os.O_RDWR | os.O_APPEND | os.O_CREAT
        redirects = (
            (os.open(self.stdin, os.O_RDONLY), sys.stdin.fileno()),
            (os.open(self.stdout, append_flags, 0o666), sys.stdout.fileno()),
            (os.open(self.stderr, append_flags, 0o666), sys.stderr.fileno()),
        )
        for source_fd, target_fd in redirects:
            os.dup2(source_fd, target_fd)
            if source_fd > 2:
                # The duplicate is what matters; drop the temporary one.
                os.close(source_fd)

        atexit.register(self.delpid)
        with open(self.pidfile, 'w+') as handle:
            handle.write('%s\n' % os.getpid())

    def delpid(self):
        """Remove the pidfile (registered as an atexit handler)."""
        os.remove(self.pidfile)

    # ------------------------------------------------------------------
    # Commands
    # ------------------------------------------------------------------
    def start(self):
        """Daemonize and enter the monitoring loop.

        Refuses to start (exit status 1) when a pidfile exists and at
        least one listener is found on the Hive port.
        """
        # NOTE(review): a pidfile without a listener is treated as stale
        # and start proceeds -- confirm this is the intended policy.
        alive = False
        if os.path.exists(self.pidfile):
            sys.stderr.write('cmd %s\n' % self._listen_check_cmd())
            # Any positive count means alive; IPv4 + IPv6 listeners give 2.
            if self._count_listeners():
                alive = True
                sys.stderr.write('Hive process is alive !!!\n')
        if alive:
            message = 'pidfile (%s) already exist. Daemon already running?\n\n'
            sys.stderr.write(message % self.pidfile)
            sys.exit(1)
        sys.stderr.write('starting hive server')
        self._daemonize()
        self._run()

    def stop(self):
        """Terminate the daemon recorded in the pidfile and the Hive server.

        Returns quietly when there is no usable pidfile.  Exits with
        status 1 when signalling the daemon fails for any reason other
        than the process no longer existing.
        """
        import errno

        pid = self._read_pid()
        if not pid:
            message = 'pidfile %s does not exist. Daemon not running?\n'
            sys.stderr.write(message % self.pidfile)
            return
        try:
            # Keep signalling until the daemon is gone; os.kill then raises
            # OSError(ESRCH), which ends the loop.
            while 1:
                os.kill(pid, SIGTERM)
                time.sleep(0.1)
                os.system(self.KILL_HIVE_CMD)
        except OSError as err:
            # Compare errno rather than the locale-dependent message text.
            if err.errno == errno.ESRCH:
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
            else:
                print(str(err))
                sys.exit(1)

    def restart(self):
        """Stop the daemon, then start it again."""
        self.stop()
        self.start()

    def monitor(self):
        """Run the monitoring loop in the foreground (no daemonizing)."""
        self._run()

    def _run(self):
        """Monitoring loop: restart Hive whenever it is down or unresponsive."""
        while True:
            # The PID is only logged, so a missing pidfile is not fatal.
            sys.stderr.write('pid %s\n\n' % self._read_pid())
            if self._count_listeners() == 0:
                sys.stderr.write('Hive process is dead!!!\n')
                self.run_hive()
            else:
                # Throttle the probe -- see the PROBE_DELAY comment.
                time.sleep(self.PROBE_DELAY)
                if self.hive_alive() == 0:
                    self.kill_hive()
                    sys.stderr.write('Hive process killed,restart!!!\n')
                    self.run_hive()
            # Pause after every pass so a freshly launched server has time
            # to bind its port before the next check.
            time.sleep(self.LOOP_PAUSE)
if __name__ == '__main__':
    # Command-line entry point: exactly one sub-command is expected.
    # Exit status: 0 after a command returns, 2 on a usage error.
    daemon = Daemon('/xxxx/hive.pid')
    if len(sys.argv) == 2:
        # Map each sub-command to the bound method implementing it.
        commands = {
            'start': daemon.start,
            'stop': daemon.stop,
            'restart': daemon.restart,
            'monitor': daemon.monitor,
        }
        handler = commands.get(sys.argv[1])
        if handler is None:
            print('Unknown command')
            sys.exit(2)
        handler()
        sys.exit(0)
    else:
        # The usage text lists every supported command, including monitor.
        print('usage: %s start|stop|restart|monitor' % sys.argv[0])
        sys.exit(2)
惨痛的教训是:
不要频繁连接 Hive Server。每次连接都会在 /tmp/ 下生成临时文件,连接过于频繁很容易导致 Hive 无法创建文件而执行失败,进而导致 Hive 监控失败,陷入恶性循环。