# Source code:
"""
说明:其实本脚本是用来给flume使用的,所以看起来比较奇怪
flume配置:
pagent.sources.xxsource.command = python -u /xxx/monitor_file.py --path /path/xx
实现功能如下:
1. 重写tail -f
2. 多线程启用tail
3. 杀死线程
4. 遍历多级目录,获取新的文件或被删除的文件
"""
#!/usr/bin/python
#coding=utf-8
import os
import sys
import time
import optparse
import threading
import inspect
import ctypes
def _async_raise(tid, exctype):
tid = ctypes.c_long(tid)
if not inspect.isclass(exctype):
exctype = type(exctype)
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
if res == 0:
raise ValueError("invalid thread id")
elif res != 1:
ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
raise SystemError("PyThreadState_SetAsyncExc failed")
def stop_thread(thread):
    """Request termination of ``thread`` by raising ``SystemExit`` inside it.

    :param thread: thread object whose ``ident`` identifies the target.
    """
    target_id = thread.ident
    _async_raise(target_id, SystemExit)
"""
python版 tail -f,直接继承threading.Thread
"""
class Tail(threading.Thread):
    """Thread that follows one file in the manner of ``tail -f``.

    Lines appended to the file after the thread has started are handed to a
    callback, which defaults to ``sys.stdout.write``.
    """

    def __init__(self, tailed_file):
        """Prepare a tail thread for ``tailed_file`` (not started yet).

        :param tailed_file: path of the file to follow.
        """
        threading.Thread.__init__(self)
        # run() never returns on its own; as a daemon the thread cannot keep
        # the process alive once the main thread exits (e.g. after Ctrl-C).
        self.daemon = True
        self.tailed_file = tailed_file
        self.callback = sys.stdout.write

    def run(self):
        """Poll the file forever and pass every newly read line to the callback."""
        poll_interval = 0.1
        with open(self.tailed_file) as file_:
            # Start at the end of the file: only content appended from now
            # on is reported.
            file_.seek(0, 2)
            while True:
                curr_position = file_.tell()
                line = file_.readline()
                if line:
                    self.callback(line)
                    continue
                # Nothing new yet: go back to the remembered offset and wait
                # before trying again.
                file_.seek(curr_position)
                time.sleep(poll_interval)

    def register_callback(self, func):
        """Replace the callable that receives each new line.

        :param func: callable invoked with one line of text per call.
        """
        self.callback = func
"""
遍历目录,并启动线程
"""
class IteratorFile(object):
    """Walk a directory tree and run one ``Tail`` thread per file found.

    The public methods exchange file collections as comma-separated strings.
    Results are returned with a comma after every path (``"a,b,"``); that
    format is kept for compatibility, so paths containing a comma are not
    supported. Comparisons are always made on whole paths.
    """

    def __init__(self, filepath):
        """Remember the root directory and start tailing its current files.

        :param filepath: root directory, walked recursively.
        """
        self.filepath = filepath
        # Result of the most recent scan, in the "a,b," string form.
        self.init_files = self.init_file()

    @staticmethod
    def _split(files):
        """Return the non-empty paths of a comma-separated string as a list."""
        return [path for path in files.split(",") if path]

    @staticmethod
    def _join(paths):
        """Serialise paths to the ``"a,b,"`` form returned by the public API."""
        return "".join(path + "," for path in paths)

    def init_file(self):
        """Scan once, start a tail thread for every file, return the scan.

        :returns: the scan result in ``"a,b,"`` form.
        """
        now_files = self.iterator_file()
        self.start_tail_threads(now_files[:-1])
        return now_files

    def monitor_file(self):
        """Rescan once per second and adapt the tail threads; never returns.

        If any previously known file has disappeared, all tail threads are
        restarted for the current file set. Otherwise a thread is started for
        each file that was not known before. Both decisions use the same
        scan, and whole paths are compared rather than file counts, so a
        simultaneous add and delete is still detected.
        """
        interval = 1
        while True:
            now_files = self.iterator_file()
            current = self._split(now_files)
            known = set(self._split(self.init_files))
            if known - set(current):
                self.restart_tail_threads(now_files[:-1])
            else:
                added = [path for path in current if path not in known]
                if added:
                    self.start_tail_threads(",".join(added))
            self.init_files = now_files
            time.sleep(interval)

    def iterator_file(self):
        """Walk ``self.filepath`` and return every file path found.

        :returns: paths in ``"a,b,"`` form, or ``""`` when there are none.
        """
        found = []
        for dirpath, _dirnames, filenames in os.walk(self.filepath):
            for filename in filenames:
                found.append(os.path.join(dirpath, filename))
        return self._join(found)

    def get_new_files(self):
        """Return files present in a fresh scan but absent from ``init_files``.

        :returns: paths in ``"a,b,"`` form, or ``""`` when there are none.
        """
        current = self._split(self.iterator_file())
        known = set(self._split(self.init_files))
        # Exact membership: "app.log" must not count as known merely because
        # "app.log.1" is.
        return self._join(path for path in current if path not in known)

    def get_old_files(self):
        """Return files listed in ``init_files`` but absent from a fresh scan.

        :returns: paths in ``"a,b,"`` form, or ``""`` when there are none.
        """
        current = set(self._split(self.iterator_file()))
        known = self._split(self.init_files)
        return self._join(path for path in known if path not in current)

    def start_tail_threads(self, files):
        """Create and start one ``Tail`` thread per path in ``files``.

        :param files: comma-separated paths; empty entries are ignored, so an
            empty string starts nothing.
        """
        threads = [Tail(path) for path in self._split(files)]
        for thread in threads:
            thread.start()

    def restart_tail_threads(self, files):
        """Stop the running ``Tail`` threads, then start threads for ``files``.

        :param files: comma-separated paths to tail after the restart.
        """
        for tt in threading.enumerate():
            # Only tail threads are stopped; the calling thread and any
            # unrelated threads are left alone.
            if not isinstance(tt, Tail):
                continue
            try:
                stop_thread(tt)
            except ValueError:
                # The thread ended between enumerate() and the stop request;
                # there is nothing left to stop.
                pass
        self.start_tail_threads(files)
if __name__ == '__main__':
    # Command line entry point: --path names the directory tree to monitor.
    parser = optparse.OptionParser()
    parser.add_option(
        '--path',
        metavar='PATH',
        dest='path',
        help='directories that need to be monitored',
    )
    options, args = parser.parse_args()
    if not options.path:
        # Without a path there is nothing to watch; show the usage text.
        parser.print_help()
    else:
        try:
            # Constructing IteratorFile starts the tail threads;
            # monitor_file() then polls for changes and does not return.
            iterator_ = IteratorFile(options.path)
            iterator_.monitor_file()
        except KeyboardInterrupt:
            # Ctrl-C requests exit with status 0.
            sys.exit(0)