递归增量监控目录/文件,逐行读取内容并输出

本文介绍了一个使用Python实现的 tail -f 工具,能够监控指定路径下的文件更新情况,支持多线程,并能应对文件的增删变动。该脚本会遍历多级目录,并为每个文件启动一个线程来跟踪其变化。

源码:


"""
说明:其实本脚本是用来给flume使用的,所以看起来比较奇怪
flume配置:
pagent.sources.xxsource.command = python -u /xxx/monitor_file.py --path /path/xx

实现功能如下:
1. 重写tail -f
2. 多线程启用tail
3. 杀死线程
4. 遍历多级目录,获取新的文件或被删除的文件
"""

#!/usr/bin/python
#coding=utf-8

import os
import sys
import time
import optparse
import threading
import inspect
import ctypes

def _async_raise(tid, exctype):

    tid = ctypes.c_long(tid)
    if not inspect.isclass(exctype):
        exctype = type(exctype)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")

def stop_thread(thread):
    """Stop ``thread`` by asynchronously raising ``SystemExit`` inside it.

    Args:
        thread: Thread object whose ``ident`` identifies the target thread.
    """
    target_ident = thread.ident
    _async_raise(target_ident, SystemExit)

class Tail(threading.Thread):
    """Python version of ``tail -f``, implemented as a ``threading.Thread``.

    The thread opens ``tailed_file``, jumps to its end and then passes every
    line appended afterwards to ``callback`` (``sys.stdout.write`` unless
    replaced through ``register_callback``).

    Instances are daemon threads so that they never keep the interpreter
    alive once the main thread exits (for example after Ctrl-C).
    """

    def __init__(self, tailed_file):
        """Create a follower for ``tailed_file`` (path of the file to watch)."""
        threading.Thread.__init__(self)
        # The polling loop never ends on its own; without the daemon flag the
        # process could not terminate while any tail is still running.
        self.daemon = True
        self.tailed_file = tailed_file
        self.callback = sys.stdout.write
        # Set by stop() to ask run() to leave its loop.
        self._stop_event = threading.Event()

    def run(self):
        """Follow the file until ``stop()`` is called.

        Reading starts at the current end of the file; when no new line is
        available the thread waits 0.1 seconds before trying again.
        """
        poll_interval = 0.1
        with open(self.tailed_file) as file_:
            file_.seek(0, 2)  # whence=2: start from the end of the file
            while not self._stop_event.is_set():
                curr_position = file_.tell()
                line = file_.readline()
                if line:
                    self.callback(line)
                    continue
                # Nothing new: go back to where this read started and wait.
                # Waiting on the event (instead of sleeping) lets stop()
                # wake the thread immediately.
                file_.seek(curr_position)
                self._stop_event.wait(poll_interval)

    def stop(self):
        """Ask the thread to finish; ``run`` returns at its next loop check."""
        self._stop_event.set()

    def register_callback(self, func):
        """Use ``func`` instead of ``sys.stdout.write`` to consume new lines.

        Args:
            func: Callable invoked with each new line as its only argument.
        """
        self.callback = func

class IteratorFile(object):
    """Walk a directory tree and keep ``Tail`` threads running for its files.

    File sets are exchanged as strings in which every path is followed by a
    ``","`` (the format returned by ``iterator_file``). Because of that
    format, paths that themselves contain a comma are not supported.
    """

    def __init__(self, filepath):
        """Scan ``filepath`` once and start tailing every file found."""
        self.filepath = filepath
        # Result of the most recent scan, in the comma-joined format.
        self.init_files = self.init_file()

    @staticmethod
    def _split_files(files):
        """Return the non-empty paths contained in a comma-joined string."""
        return [path for path in files.split(",") if path]

    @staticmethod
    def _join_files(paths):
        """Return ``paths`` in the comma-joined format (``"a,b,"``)."""
        return "".join(path + "," for path in paths)

    def init_file(self):
        """Run the initial scan, start a tail per file and return the scan.

        Returns:
            The comma-joined string of all files that currently exist.
        """
        now_files = self.iterator_file()
        self.start_tail_threads(now_files)
        return now_files

    def monitor_file(self):
        """Rescan the directory once per second, forever, and react to changes."""
        s = 1
        while True:
            self._poll_once()
            time.sleep(s)

    def _poll_once(self):
        """Do one scan, compare it with the previous one and update the tails.

        Paths are compared exactly (not by count or substring), and the diff
        is computed from the same scan that is then stored in ``init_files``,
        so a file is never reported as new twice.
        """
        now_files = self.iterator_file()
        current = self._split_files(now_files)
        known = set(self._split_files(self.init_files))

        if known.difference(current):
            # At least one file disappeared: restart every tail on the
            # current file set (this also covers files added meanwhile).
            self.restart_tail_threads(now_files)
        else:
            added = [path for path in current if path not in known]
            if added:
                self.start_tail_threads(self._join_files(added))

        self.init_files = now_files

    def iterator_file(self):
        """Walk ``self.filepath`` and return every file found.

        Returns:
            A string in which each file path is followed by ``","``; the
            empty string when no file exists.
        """
        found = []
        for dirpath, _dirnames, filenames in os.walk(self.filepath):
            found.extend(os.path.join(dirpath, name) for name in filenames)
        return self._join_files(found)

    def get_new_files(self):
        """Return the files that exist now but are not in ``self.init_files``.

        The directory is scanned again; the result uses the comma-joined
        format and keeps the scan order.
        """
        known = set(self._split_files(self.init_files))
        current = self._split_files(self.iterator_file())
        return self._join_files(path for path in current if path not in known)

    def get_old_files(self):
        """Return the files in ``self.init_files`` that no longer exist.

        The directory is scanned again; the result uses the comma-joined
        format and keeps the order of ``self.init_files``.
        """
        current = set(self._split_files(self.iterator_file()))
        known = self._split_files(self.init_files)
        return self._join_files(path for path in known if path not in current)

    def start_tail_threads(self, files):
        """Start one ``Tail`` thread per path in ``files``.

        Args:
            files: Comma-separated paths. Empty entries (an empty string or
                a trailing comma) are ignored, so nothing is started for an
                empty file set.
        """
        threads = [Tail(path) for path in self._split_files(files)]
        for thread in threads:
            thread.start()

    def restart_tail_threads(self, files):
        """Stop every running ``Tail`` thread, then start tails for ``files``.

        Only ``Tail`` instances are stopped; the calling thread and unrelated
        threads are left alone.

        Args:
            files: Comma-separated paths to tail after the restart.
        """
        caller = threading.current_thread()
        for thread in threading.enumerate():
            if thread is caller or not isinstance(thread, Tail):
                continue
            try:
                stop_thread(thread)
            except ValueError:
                # "invalid thread id": the thread ended on its own between
                # enumerate() and the stop request, so there is nothing to do.
                pass

        self.start_tail_threads(files)

if __name__ == '__main__':

    # Command line: a single --path option naming what should be monitored.
    parser = optparse.OptionParser()
    parser.add_option(
        '--path',
        metavar='PATH',
        dest='path',
        help='directories that need to be monitored',
    )
    options, _positional = parser.parse_args()

    if not options.path:
        # Without a path there is nothing to monitor: show the usage text.
        parser.print_help()
    else:
        try:
            watcher = IteratorFile(options.path)
            watcher.monitor_file()
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop; exit with status 0.
            sys.exit(0)


转载于:https://my.oschina.net/remainsu/blog/1593536

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值