After collecting data with Flume I originally rolled one file every five minutes, but tighter real-time requirements pushed that down to one file per minute. With three Flume instances running, this left HDFS littered with small files, which
1. burdens the NameNode (every file's metadata is held in NameNode memory), and
2. inflates the task count (each small file ends up feeding its own map task).
So I decided to write a merge script for the Flume output; as I wrote it, the features kept piling up and the script kept getting more complete.
I have written two merge approaches:
1. Read and merge with Spark. The upside is that it is Scala code, a high-level language that is easy to understand and maintain; the downside is the workflow, since every change means repackaging the job.
2. Use a shell script to getmerge the files, then split the result into 128MB chunks.
The first approach is simple, so I won't list its implementation here.
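Still, for a flavor of what approach 1 boils down to, here is a minimal sketch run from spark-shell (so nothing has to be packaged). The paths and the partition count of 8 are made-up placeholders, and it assumes Spark 2.x, where the spark session is predefined:

spark-shell <<'EOF'
// read the day's many small files as one Dataset[String]
val day = spark.read.textFile("/data/table_name/20170216/")
// squeeze into 8 partitions (~128MB each for a ~1GB day) and write back out
day.coalesce(8).write.text("/data/table_name/20170216_compacted/")
EOF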
The shell merge script follows this outline:
1. getmerge into a local tmp file
2. split into 128MB chunks
3. put the chunks back to HDFS
4. check the resulting file sizes
5. rm the small files from HDFS
6. rm the local tmp file
The script passes values around almost entirely through variables, so it takes some shell programming experience to read, but it is quite reusable. The full script follows; test it carefully before putting it to use.
#!/bin/bash
# vim /data/executeTask/file_compact/compact_flumedata.sh
# ref: sendmessage.sh, which takes four arguments:
# 1. PROGRAMID : id registered, with a description, in programelist.log
# 2. errortimes : number of times the error has occurred
# 3. errorcode : error code
# 4. DesNo1 : destination phone number
#################### check result function ####################
# checksuccess <project> <errorCode> <processname> <date> <logfile> <table_name>
# logs the step outcome; on failure it sends an SMS via sendmessage.sh and aborts
checksuccess(){
project=$1
projectId=10
errorCode=$2
processname=$3
yesterday=$4
logfile=$5
table_name=$6
phoneNumber=15202125865
errorTime=1
echo " project=$project errorCode=$errorCode processname=$processname yesterday=$yesterday logfile=$logfile table_name=$table_name"
if [ "$errorCode" -ne 0 ]; then
echo "`date +%F\ %T` ${project} errorCode=${errorCode} error process $table_name ${processname} of ${yesterday} data " >> ${logfile}
/data/executeTask/file_compact/sendmessage.sh $projectId $yesterday $errorCode $phoneNumber
exit ${errorCode}
else
echo "`date +%F\ %T` ${project} errorCode=${errorCode} success process $table_name ${processname} of ${yesterday} data " >> ${logfile}
fi
}
#################### check result function end ####################
#################### processCompact function start ####################
# processCompact <table_name> <date> <compact_file> <logfile> <project>
processCompact(){
table_name=$1
date=$2
compact_file=$3
logfile=$4
project=$5
# 0. remove temp copies left over from interrupted uploads,
#    e.g. compact.1510051113._COPYING_ created while a copy job failed
# TODO rm all compact file
hdfs dfs -rm /data/$table_name/$date/compact*_COPYING_
# record the starting size; -h prints a human-readable number, so the unit
# may be G or M (read separately below)
beginsize=`hdfs dfs -du -s -h /data/$table_name/$date/ | awk '{ print $1}' `
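# sample output of `hdfs dfs -du -s -h <dir>` (shape assumed; newer Hadoop
# versions add a second, replication-weighted size column before the path):
#   1.5  G  /data/table_name/20170216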
# error check left disabled here, since the rm legitimately fails whenever
# there are no leftover _COPYING_ files to remove
#errorCode=$?
#processname=rmCopytmp
#checksuccess $project $errorCode $processname $date $logfile $table_name
# 1. merge all of the day's FlumeData files into a single local file
hdfs dfs -getmerge /data/$table_name/$date/*FlumeData* /data/executeTask/file_compact/$compact_file
errorCode=$?
processname=getmerge
checksuccess $project $errorCode $processname $date $logfile $table_name
# 2. split into 128MB chunks: first read the unit (G or M) matching beginsize
sizeunit=`hdfs dfs -du -s -h /data/$table_name/$date/ | awk '{ print $2}' `
# chunk count: 1G = 8 x 128MB, so G -> size*8; otherwise assume M -> size/128
# (note printf "%.f" rounds to the nearest integer, it does not take the
# ceiling; cf. http://blog.youkuaiyun.com/naiveloafer/article/details/8783518)
if [ $sizeunit = "G" ];then
res=$(printf "%.f" `echo "scale=5;$beginsize*8 "|bc`)
else
res=$(printf "%.f" `echo "scale=5;$beginsize/128 "|bc`)
fi
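# e.g. a 1.5G day: 1.5*8 = 12.0 -> 12 chunks; a 300M day: 300/128 = 2.34 -> 2
# chunks. Beware: a day much below 64M rounds to 0 and the split would fail.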
cd /data/executeTask/file_compact/
# split by line into $res chunks with 3-digit numeric suffixes; -n l/N never
# cuts mid-line, so no record is broken across files
# (ref http://blog.youkuaiyun.com/microzone/article/details/52839598)
compact_file_name=$compact_file"_"
echo "compact_file_name :"$compact_file_name
split -n l/$res -d -a 3 /data/executeTask/file_compact/$compact_file /data/executeTask/file_compact/${compact_file_name}
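# yields compact.<timestamp>_000, compact.<timestamp>_001, ... next to the merged file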
# 3. upload the chunks back into the day's HDFS directory
hdfs dfs -copyFromLocal /data/executeTask/file_compact/$compact_file_name* /data/$table_name/$date/
errorCode=$?
processname=copyFromLocal
checksuccess $project $errorCode $processname $date $logfile $table_name
# 3b. chown the chunks back to the flume user (optional, can be skipped)
hdfs dfs -chown flume /data/$table_name/$date/$compact_file_name*
errorCode=$?
processname=chown
checksuccess $project $errorCode $processname $date $logfile $table_name
# 4. sanity check: compare total size before and after compaction
# deprecated: endsize=`hdfs dfs -du -s -h /data/$table_name/$date/ | awk '{ print $1}' `
# sum the chunk sizes, assuming -h lists each chunk in M; /1024 converts to G
endsize=`hdfs dfs -du -s -h /data/$table_name/$date/compact* | awk '{sum+=$1/1024};END{print sum} ' `
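# e.g. twelve chunks of ~128.0M sum to ~1.5, matching a beginsize of 1.5 (G);
# if beginsize was reported in M the units no longer match and the ratio
# check below will raise a false warning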
echo $beginsize >> $logfile
echo $endsize >> $logfile
# before/after size ratio; anything outside 0.8..1.2 is flagged below
percent=`echo "scale=2;$beginsize/$endsize" | bc`
upper=1.2
lower=0.8
c1=$(echo "$percent < $upper" | bc)
c2=$(echo "$percent > $lower" | bc)
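# bc prints 1 when a comparison holds and 0 otherwise,
# e.g. echo "0.95 < 1.2" | bc prints 1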
if [ $c1 -eq 1 ] && [ $c2 -eq 1 ];then
echo " accessable rate"
else
echo " warn rate"
processname=sizerate
checksuccess $project 1 $processname $date $logfile $table_name
fi
# 5. remove the original small files; this point is only reached after the checks above passed
hdfs dfs -rm /data/$table_name/$date/*FlumeData*
errorCode=$?
processname=rmFlumeData
checksuccess $project $errorCode $processname $date $logfile $table_name
# 6. remove the local merged file and its chunks so the next run's getmerge starts clean
rm -rf /data/executeTask/file_compact/$compact_file*
errorCode=$?
processname=rmLocalFile
checksuccess $project $errorCode $processname $date $logfile $table_name
}
#################### processCompact function END ####################
#################### main start ####################
#################### config begin ####################
project=obdAndipda
# the first day to process (inclusive)
date=20170216
logfile=/data/job/log/compact_flumedata.log
# the last day to process; defaults to yesterday but is overridden here for a single-day run
end=`date -d' -1 day' +"%Y%m%d"`
end=20170216
echo "start: ${date} end: $end"
#################### config end ####################
while(( $date <= $end ))
do
#################### prepare ####################
# name this run's merge file by unix timestamp, e.g. compact.1510051113
dealtime=`date +%s`
compact_file="compact."${dealtime}
echo "deal date:"${date}
#################### prepare END ####################
# placeholder: set this to the real table directory name (loop over several tables if needed)
table_name="table_name"
echo "/data/$table_name/$date/"
processCompact $table_name $date $compact_file $logfile $project
# step one day forward: GNU date parses "-1 day ago" as plus one day
date=`date -d"-1 day ago ${date}" +%Y%m%d`
done
echo "complete";
exit 0
#################### main end ####################
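Once the start date is set and the end=20170216 override is removed (so end defaults to yesterday), the script can run unattended. A hypothetical crontab entry, with the schedule as a placeholder:

30 1 * * * /bin/bash /data/executeTask/file_compact/compact_flumedata.sh >> /data/job/log/compact_flumedata.log 2>&1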