根据ETL报错的信息,用Python从源文件中提取出错误的内容

这段Python代码用于从源文件中提取出错误行的内容。它首先读取包含错误行号的文件,然后将这些行号转换为整数,最后在源文件中找到对应的行并将其写入新的文件。
# -*- coding:utf-8 -*-
# Reference script: extract the lines flagged by an ETL error report from a source file.
import linecache
# Module-level scratch lists for the extraction; NOTE(review): they are never
# reset, so repeated calls to truncate_file accumulate entries.
lines=[]        # error lines, as the raw strings read from the error-sequence file
error_lines=[]  # error lines with the leading header rows removed (the code skips two)
int_lines=[]    # error line numbers converted to int, header rows excluded
error_lines_file_path=r'C:\Users\yawei.chen\Desktop\Puck\Python\Job\test\error_seq.txt' # file listing the error line numbers to extract
source_file_path=r'C:\Users\yawei.chen\Desktop\Puck\Python\Job\test\iclog100.y2018.d0513.s00001.20180513002906.txt' # source data file
extract_file_path=r"C:\Users\yawei.chen\Desktop\IM22407681.txt"   # destination path/filename for the extracted error lines

# Main extraction routine.
def truncate_file(error_lines_file_path, source_file_path, extract_file_path):
    """Copy the source-file lines named in the error-sequence file to a new file.

    Parameters:
        error_lines_file_path: text file whose lines each hold one 1-based
            line number of the source file.  # assumes the first two rows are
            headers, matching the original slice lines[2:] — TODO confirm format
        source_file_path: file the numbered lines are copied from.
        extract_file_path: destination file the extracted lines are written to
            (overwritten on each call).

    Fixes over the original version:
    - uses local lists instead of module-level ones, so a second call no
      longer accumulates stale line numbers and duplicates output;
    - computes the header-skipping slice once instead of on every iteration;
    - reads the source file it already opens instead of going through
      linecache, which can serve stale cached content;
    - skips blank entries in the error file instead of crashing on int('').
    """
    # Read the error line numbers, skipping the two header rows.
    with open(error_lines_file_path, "r", encoding='utf-8') as y:
        int_lines = [int(s) for s in y.readlines()[2:] if s.strip()]

    # Copy each requested line (1-based) from the source into the output.
    with open(source_file_path, 'r', encoding='utf-8') as f1, \
            open(extract_file_path, 'w', encoding='utf-8') as f2:
        source_lines = f1.readlines()
        for line_no in int_lines:
            # Out-of-range numbers produce no output, mirroring linecache's ''.
            if 1 <= line_no <= len(source_lines):
                f2.write(source_lines[line_no - 1])

truncate_file(error_lines_file_path,source_file_path,extract_file_path)
#!/usr/local/bin/python3.9 from datetime import datetime import os,sys # fpath='/root/log/' # fname='2021-05-20.log' # rname='etl_2021-05-20.txt' # etl_file=f'{fpath}{fname}' # result_file=f'{fpath}{rname}' def etl_data(etl_file,result_file): with open(etl_file,'r',encoding='utf-8') as f: with open(result_file,'a',encoding='utf-8') as f1: for r in f.readlines(): l=r.split(' ') ip=l[0] date=l[3] date=datetime.strptime(date,'[%d/%b/%Y:%H:%M:%S]') date=datetime.strftime(date,'%Y-%m-%d %H:%M:%S') url=l[6] header=r.split('"')[-2] if header.find('Windows')>=0: systemtype='Windows' elif header.find('Mac OS')>=0: systemtype='Mac_OS' elif header.find('Linux')>=0: systemtype='Linux' else: systemtype='unknown' browser_list=['Chrome','Safari','Firefox','Presto'] for bw in browser_list: if header.find(bw)>=0: browser=bw break else: browser='unknown' result=f'{ip}\t{date}\t{url}\t{systemtype}\t{browser}\n' f1.write(result) def load_data(result_file,load_date): sql_command=''' create table log( ip string, date_ string, url string, systype string, browser string ) partition by (load_date string) row format DELIMITED fields TERMINATED by '\\t' lines TERMINATED by '\\n' ''' sql_command=f''' load data local inpath '{result_file}' overwrite into table log partition(load_date='{load_date}') ''' beeline_command=f'beeline -u jdbc:hive2://hadoop100:10000/db_hive -n root -p 123456 -e "{sql_command}" ' print(beeline_command) os.system(beeline_command) def test_value(load_date): if len(load_date)!=8: print('请输入正确的参数:yyyymmdd') exit() elif load_date.isdigit() is False: print('请输入数字参数:yyyymmdd') exit() else: return load_date def test_file(filepath): if os.path.isfile(filepath): return filepath else: print(f'{filepath}文件不存在') return False def etl_logfile(etl_file,result_file,load_date): if test_file(etl_file) and test_file(result_file) is False: print(f'文件存在,开始清洗{etl_file}...') etl_data(etl_file,result_file) print(f'文件清洗完毕,开始上传{result_file}...') load_data(result_file,load_date) 
print(f'上传完毕{result_file}...') elif test_file(etl_file) and test_file(result_file): print('结果文件存在,删除结果文件,重新清洗...') os.remove(result_file) print(f'文件存在 开始清洗{etl_file}...') etl_data(etl_file,result_file) print(f'文件清洗完毕 开始上传{result_file}...') load_data(result_file,load_date) print(f'上传完毕{result_file}...') if __name__=='__main__': fpath='/root/log/' if len(sys.argv)<=1: print(f'未输入参数遍历{fpath},查找文件') for fname in os.listdir(fpath): etl_file=f'{fpath}{fname}' _, ext=os.path.splitext(etl_file) if ext=='.log': rname=f'etl_{fname.replace(".log",".txt")}' result_file=f'{fpath}{rname}' load_date=etl_file.replace('-','').strip('.log') etl_logfile(etl_file,result_file,load_date) else: load_date=test_value(sys.argv[1]) fname=f'{load_date[0:4]}-{load_date[4:6]}-{load_date[6:8]}.log' rname=f'etl_{fname.replace(".log",".txt")}' etl_file=f'{fpath}{fname}' result_file=f'{fpath}{rname}' etl_logfile(etl_file,result_file,load_date)
09-21
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值