Monitoring logs in real time with a Python script
The log is cut over and recreated every day, so the script has to be restarted daily as well (via a shell script driven by crontab): after the file is recreated, tail -f no longer picks up new entries, because it follows the original file descriptor rather than the file name.
Let's start with the simplest possible version, built around subprocess.Popen.
#!/usr/bin/env python
# ./log_watcher.py -f log
import sys
import os
import getopt
import subprocess
import time

def worker(line):
    # Demo job: append each log line to a backup file.
    backup = open("backup.txt", "a")
    backup.write(line + "\n")
    backup.close()

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'f:')
    except getopt.GetoptError, err:
        print str(err)
        sys.exit(1)
    filename = ''
    for k, v in opts:
        if k == '-f':
            filename = v
    if not (filename and os.path.exists(filename)):
        print 'invalid filename : %s' % filename
        sys.exit(1)
    # Spawn tail -f and read its stdout line by line.
    cmd = ('tail', '-f', filename)
    print ' '.join(cmd)
    output = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    while True:
        try:
            line = output.stdout.readline()
        except KeyboardInterrupt:
            print "Quit"
            sys.exit(1)
        if not line:            # nothing new from tail yet
            time.sleep(0.01)
            continue
        line = line.strip().decode('utf8')
        # do your job here
        worker(line)
        time.sleep(0.01)
    return 0

if __name__ == '__main__':
    sys.exit(main())
Replace the worker function with whatever job you actually need (firing asynchronous requests, handing lines off to multiple threads, and so on); for the sake of the demo it just writes each line to another file.
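For example, here is a minimal sketch of a worker that forwards each line to an HTTP collector instead of writing a file; the endpoint URL and the fire-and-forget background thread are illustrative assumptions, not part of the original script:

# Hypothetical variant of worker(): forward each line to an HTTP
# collector in a background thread so the tail loop never blocks.
import threading
import urllib2

COLLECTOR_URL = 'http://127.0.0.1:8080/collect'  # assumed endpoint for illustration

def post_line(line):
    try:
        # POST the raw line; a 5-second timeout keeps stuck requests bounded.
        urllib2.urlopen(COLLECTOR_URL, data=line.encode('utf8'), timeout=5).read()
    except Exception, e:
        print 'post failed: %s' % e

def worker(line):
    t = threading.Thread(target=post_line, args=(line,))
    t.setDaemon(True)   # don't keep the process alive for stragglers
    t.start()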
To keep the program running in the background, use nohup ./log_watcher.py -f log &
If the log grows quickly, spawning a new thread per line gets out of hand; instead, push lines onto a queue consumed by a fixed thread pool. Cap the queue size so it cannot grow without bound, and keep an eye on CPU usage, adding sleep calls in the appropriate places.
Since the watcher itself may die unexpectedly, it needs to record how far it has parsed (so a restart can resume from that position); otherwise tail -f will repeat or drop messages. I first tried recomputing the position with wc -l and tail -n, but in my tests wc -l lags noticeably when the log file is large, making the resume point inaccurate, and the extra tail invocations drive system CPU usage too high. Plain file operations (seek/tell) handle this better, as the version below shows.
#!/usr/bin/env python
# ./log_watcher.py -f log -c pos -h local_ip
import Queue
import threading
import sys
import os
import getopt
import time

thread_count = 5
current_pos = 0                  # byte offset of the last processed line
mutex = threading.Lock()         # guards current_pos and the output files
class MyThread(threading.Thread):
    def __init__(self, workQueue, resultQueue, timeout=0, **kwargs):
        threading.Thread.__init__(self, kwargs=kwargs)
        self.timeout = timeout
        self.setDaemon(True)
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.start()

    def run(self):
        while True:
            try:
                func, args, kwargs = self.workQueue.get(timeout=self.timeout)
                # Retry the job until it reports success (returns 1).
                while True:
                    res = func(args, self.getName())
                    if res == 1:
                        break
                    time.sleep(1)
            except Queue.Empty:
                time.sleep(1)
                continue
            except:
                print sys.exc_info()
                #raise
class ThreadPool:
    def __init__(self, num_of_threads=10):
        self.workQueue = Queue.Queue()
        self.resultQueue = Queue.Queue()
        self.threads = []
        self.__createThreadPool(num_of_threads)

    def __createThreadPool(self, num_of_threads):
        for i in range(num_of_threads):
            thread = MyThread(self.workQueue, self.resultQueue)
            self.threads.append(thread)

    def wait_for_complete(self):
        while len(self.threads):
            thread = self.threads.pop()
            if thread.isAlive():
                thread.join()

    def add_job(self, func, args, **kwargs):
        # Cap the queue so a fast-growing log cannot exhaust memory;
        # block the reader until the workers catch up.
        while True:
            if self.workQueue.qsize() < 10000:
                self.workQueue.put((func, args, kwargs))
                break
            time.sleep(0.1)
def worker(pline, threadid):
    # Each job is packed as "<offset>###<log line>"; split it back apart.
    splitPos = pline.index("###")
    pos = pline[0:splitPos]
    line = pline[splitPos+3:]
    # deal with each log line here
    time.sleep(0.001)
    back = open("/usr/local/nginx/logs/access.log.bak", "a")
    if mutex.acquire(1):
        # Persist the offset twice, so a crash mid-write still leaves
        # one intact copy for the restart script to read.
        global current_pos
        current_pos = pos
        backup = open("log_watcher.pos", "w")
        backup.write(current_pos)
        backup.close()
        backup = open("log_watcher.pos.bak", "w")
        backup.write(current_pos)
        backup.close()
        back.write(line + "\n")
        mutex.release()
    back.close()
    return 1
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'f:c:h:')
    except getopt.GetoptError, err:
        print str(err)
        sys.exit(1)
    filename = ''
    local_ip = ''
    global current_pos
    current_pos = 0
    for k, v in opts:
        if k == '-f':
            filename = v
        elif k == '-c':
            current_pos = int(v)   # byte offset to resume from
        elif k == '-h':
            local_ip = v
    if not (filename and os.path.exists(filename)):
        print 'invalid filename : %s' % filename
        sys.exit(1)
    logfile = open(filename)
    logfile.seek(current_pos)      # resume where the last run left off
    threadPool = ThreadPool(thread_count)
    while True:
        pos = logfile.tell()       # offset of the line we are about to read
        line = logfile.readline()
        line = line.rstrip('\n')   # trim the trailing newline only
        if not line:
            time.sleep(0.3)
            continue
        threadPool.add_job(worker, "%d###%s" % (pos, line))
    return 0

if __name__ == '__main__':
    sys.exit(main())
Run it from a shell script (crontab provides the periodic restart): the script first kills the stale watcher, then starts a new one from the recorded position.
#!/bin/bash
LOGNAME="/usr/local/nginx/logs/access.log"
# kill the previous watcher (note: this kills every python process)
killall python
sleep 1
cd /usr/local/nginx/sbin/
# read the recorded offset, falling back to the backup copy
if [ -e "log_watcher.pos" ]
then
    currentPos=`cat log_watcher.pos`
    if [ -z "$currentPos" ]
    then
        currentPos=`cat log_watcher.pos.bak`
        if [ -z "$currentPos" ]
        then
            exit 1
        fi
    fi
else
    currentPos=0
fi
# reset the offset if the log has been rotated (file now smaller than offset)
fileSize=`ls -l $LOGNAME | awk '{print $5}'`
if [ -z "$currentPos" ]
then
    currentPos=0
fi
if [ "$fileSize" -lt "$currentPos" ]
then
    rm /usr/local/nginx/logs/access.log.bak
    currentPos=0
fi
localip=`/sbin/ifconfig -a|grep "inet addr:10."|awk '{print $2}'|awk -F: '{print $2}'|sed q`
CMD="./log_watcher.py -f $LOGNAME -c $currentPos -h $localip"
$CMD
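A crontab entry along these lines would handle the daily restart; the script path and the 00:05 schedule are assumptions for illustration (pick a time just after your log rotation):

# m h dom mon dow  command
5 0 * * * /usr/local/nginx/sbin/restart_log_watcher.sh >/dev/null 2>&1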
Even a simple feature like this ends up needing a fair amount of code, mostly to handle the various failure cases, and the multithreaded processing brings its own synchronization concerns.
tail -f does give real-time log monitoring, but on restart it depends on external commands such as wc -l to find the resume point, which is slow and therefore inaccurate on large files.
Plain file operations (seek/tell) handle this cleanly instead.
Note that everything above still polls the file for updates; pairing it with the inotify library turns this into genuinely event-driven, real-time monitoring.
See this blog post: http://blog.youkuaiyun.com/fancyerii/article/details/6738564
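Here is a minimal sketch of the event-driven version, assuming the third-party pyinotify package is installed (pip install pyinotify); it is illustrative only and omits the offset bookkeeping from the script above:

#!/usr/bin/env python
# Sketch: event-driven tailing with pyinotify instead of a polling loop.
import pyinotify

LOGNAME = '/usr/local/nginx/logs/access.log'

class TailHandler(pyinotify.ProcessEvent):
    def my_init(self, filename=None):
        # my_init is pyinotify's documented hook for subclass setup.
        self.logfile = open(filename)
        self.logfile.seek(0, 2)          # start at EOF, like tail -f

    def process_IN_MODIFY(self, event):
        # Called only when the kernel reports a write to the file.
        for line in self.logfile.readlines():
            print line.rstrip('\n')      # call worker(line) here instead

wm = pyinotify.WatchManager()
notifier = pyinotify.Notifier(wm, TailHandler(filename=LOGNAME))
wm.add_watch(LOGNAME, pyinotify.IN_MODIFY)
notifier.loop()

The watch is attached to a single file, so the daily rotation still invalidates it; the crontab restart mechanism above covers that case as well.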