数据落地压缩转码打包下发python脚本
版权声明:本文为博主原创文章,转载请注明出处:https://blog.youkuaiyun.com/sgqhappy/article/details/83986623
最近一直在研究大量数据通过Hive清洗、提取后的落地下发问题。为了简化业务并将其自动化,我用当前比较流行的Python语言,把数据落地、压缩、转码、打包、下发等功能写成了一个脚本来执行。如有疑问,欢迎感兴趣的朋友留言交流。
1、导包
#!/usr/bin/python
#coding:utf-8
#Made by sgqhappy
#Date: 20181112
#function: data landing
from subprocess import Popen,PIPE
import os
import sys
import re
import commands
import logging
2、定义一个类,用来打印脚本运行的log日志
日志既可以打印在控制台上,也可以输出到log文件。
class Logger(object):
    """Named logger that writes every record to a log file and to the console.

    Args:
        log_file_name: Path of the file the log records are written to.
        log_level: Threshold set on the logger (for example ``logging.DEBUG``).
        logger_name: Name passed to ``logging.getLogger``.
    """

    def __init__(self, log_file_name, log_level, logger_name):
        self.__logger = logging.getLogger(logger_name)
        self.__logger.setLevel(log_level)

        #set log format and show log at console and log_file.
        LOG_FORMAT = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s : %(message)s"
        formatter = logging.Formatter(LOG_FORMAT)

        #logging.getLogger returns the same object for the same name, so a
        #second Logger(...) with that name would attach the handlers again
        #and every record would be emitted twice. Only add what is missing.
        log_file_path = os.path.abspath(log_file_name)
        has_file_handler = False
        has_console_handler = False
        for handler in self.__logger.handlers:
            if isinstance(handler, logging.FileHandler):
                if handler.baseFilename == log_file_path:
                    has_file_handler = True
            elif type(handler) is logging.StreamHandler:
                #exact type check: only a plain console handler like the one
                #created below counts, not other StreamHandler subclasses.
                has_console_handler = True

        if not has_file_handler:
            file_handler = logging.FileHandler(log_file_name)
            file_handler.setFormatter(formatter)
            self.__logger.addHandler(file_handler)
        if not has_console_handler:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(formatter)
            self.__logger.addHandler(console_handler)

    def get_log(self):
        """Return the configured ``logging.Logger`` instance."""
        return self.__logger
3、定义main
if __name__ == '__main__':
4、定义文件名及文件路径
#expected command line: python landing.py <code> <down_date> <table_name> <data_end_date>
#fail early with a usage message instead of an IndexError when arguments are missing.
if len(sys.argv) < 5:
    sys.stderr.write("usage: %s code down_date table_name data_end_date\n" % (sys.argv[0]))
    sys.exit(2)
#base name of the landed file and of the log file: <table>_<code>_<data_end_date>, lower-cased.
file_name_input = "%s_%s_%s" % (sys.argv[3], sys.argv[1], sys.argv[4])
file_name = file_name_input.lower()
#this is record name and path.
record_name = "data_down_record.txt"
record_path = "/data_down/"
info_log_path = '/data_down/%s.down.log' % (file_name)
logger = Logger(log_file_name=info_log_path, log_level=logging.DEBUG, logger_name="myLogger").get_log()
#this is down path.
down_path = "/data/p%s/%s/%s" % (sys.argv[1], sys.argv[1], sys.argv[2])
logger.info("\n")
logger.info("down path: %s" % (down_path))
5、在服务器上创建下发的文件夹,并输出日志,如果报错,则显示错误信息并退出程序。
#function: create folder.
create_folder = "mkdir -p %s" % (down_path);
logger.info(create_folder);
status,output = commands.getstatusoutput(create_folder);
logger.info(output);
#logger.info success or failed information.
if status == 0:
logger.info("mkdir %s successful!" % (down_path));
else:
#set color: '\033[;31;40m'+...+'\033[0m'
logger.error('\033[;31;40m'+"mkdir %s failed!" % (down_path)+'\033[0m');
#exit program.
exit();
6、数据落地,并输出日志,如果报错,则显示错误信息并退出程序。
#function: getmerge.
getmerge = "hdfs dfs -getmerge hdfs://ip/hive/warehouse/database.db/%s %s/%s.utf8" % (file_name,down_path,file_name);
logger.info(getmerge);
status,output = commands.getstatusoutput(getmerge);
logger.info(output);
#logger.info success or failed information.
if status == 0:
logger.info("getmerge successful!" % (down_path));
else:
#set color: '\033[;31;40m'+...+'\033[0m'
logger.error('\033[;31;40m'+"getmerge failed!" % (down_path)+'\033[0m');
#exit program.
exit();
7、数据压缩,并输出日志,如果报错,则显示错误信息并退出程序。
#function: gzip.
gzip = "gzip %s/%s.utf8" % (down_path,file_name);
logger.info(gzip);
status,output = commands.getstatusoutput(gzip);
logger.info(output);
#logger.info success or failed information.
if status == 0:
logger.info("%s successful!" % (gzip));
else:
#set color: '\033[;31;40m'+...+'\033[0m'
logger.error('\033[;31;40m'+"%s failed!" % (gzip)+'\033[0m');
#exit program.
exit();
8、显示数据前十行,抽样查看数据是否完整,并输出日志,如果报错,则显示错误信息并退出程序。
#function: zcat.
zcat = "zcat %s/%s.utf8.gz | head -n 10" % (down_path,file_name);
logger.info(zcat);
status,output = commands.getstatusoutput(zcat);
logger.info(output);
#logger.info success or failed information.
if status == 0:
logger.info("%s successful!" % (zcat));
else:
#set color: '\033[;31;40m'+...+'\033[0m'
logger.error('\033[;31;40m'+"%s failed!" % (zcat)+'\033[0m');
#exit program.
exit();
9、显示数据大小,并输出日志,如果报错,则显示错误信息并退出程序。
#function: show the size of down_file.
size = "du -sh %s/%s.utf8.gz" % (down_path,file_name);
logger.info(size);
status,output = commands.getstatusoutput(size);
#the size of record_file
record_size = output;
logger.info('\033[;33;41m'+"The size is : %s" % (output)+'\033[0m');
#logger.info success or failed information.
if status == 0:
logger.info("%s successful!" % (size));
else:
#set color: '\033[;31;40m'+...+'\033[0m'
logger.error('\033[;31;40m'+"%s failed!" % (size)+'\033[0m');
#exit program.
exit();
logger.info("\n");
10、保存相关信息到文件,并输出日志,如果报错,则显示错误信息并退出程序。
#append to record.txt
output = open("%s%s" % (record_path,record_name),'a');
output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ('table_name','code','down_date','data_end_date','size','down_file_name'));
output.write("%s\t%s\t%s\t%s\t%s\n" % (sys.argv[3],sys.argv[1],sys.argv[2],sys.argv[4],record_size));
output.close();
#logger.info the data extraction success information.
logger.info('\033[1;35;40m'+"********Data down success!********"+'\033[0m');
logger.info('\033[1;35;40m'+"********Made by sgqhappy in %s!********" % (sys.argv[2])+'\033[0m');
logger.info("\n");
11、在Linux中运行命令(四个参数依次为:code、下发日期down_date、表名table_name、数据截止日期data_end_date)
python landing.py code 20181112 table 20180801