通过ImportTsv 批量将数据从hive导出到hbase
实测 2E 行数据 耗时5分钟左右
#!/bin/bash
report_date=`date --date='1 day ago' +%Y-%m-%d`
echo $report_date
function print_duration(){
#传入一个参数,开始时间 YYYY-mm-DD HH:MM:SS 格式
start_time=`echo ${@:1}` #因为参数中带有空格,所以需要这样处理
end_time=`date +'%Y-%m-%d %H:%M:%S'`
echo '脚本执行开始时间:'${start_time}',脚本执行结束时间:'${end_time}
start_seconds=$(date --date="$start_time" +%s);
end_seconds=$(date --date="$end_time" +%s);
duration_s=$((end_seconds-start_seconds)); ##总耗时秒
duration_m=$((duration_s/60)); ##总耗时分
remainder_s=$((duration_s%60)); #余数秒
duration=${duration_m}"分"${remainder_s}"秒"
echo "########################本次运行耗时:"${duration}"#######################"
}
starttime=`date +'%Y-%m-%d %H:%M:%S'`
echo "#脚本执行开始时间:"$starttime
#1、抽取hive数据,导出到hdfs
#create 'rtc:hz_test', {NAME => 'cf', VERSIONS => 3} ,SPLITS => ['10000000000','20000000000','30000000000','40000000000','50000000000','60000000000','70000000000','80000000000','90000000000']
hive -e \
"set mapred.max.split.size=134217728; \
set hive.merge.mapfiles=false; \
set hive.merge.mapredfiles= false; \
set hive.merge.smallfiles.avgsize=1048576; \
SET hive.exec.compress.output=false; \
INSERT OVERWRITE DIRECTORY '/tmp/output/ads_act_vvm_biz' ROW FORMAT DELIMITED FIELDS TERMINATED by '\t' select reverse(user_num),null as cf from dw.ads_act_a_daily t where dt='${report_date}' and act_type=1;"
#2、删除表。不能使用清空表,清空表的话会将表的分区也清除掉
echo "disable 'rtc:ads_act_vvm_biz'" | hbase shell
echo "drop 'rtc:ads_act_vvm_biz'" | hbase shell
echo "create 'rtc:ads_act_vvm_biz', {NAME => 'cf', VERSIONS => 3},SPLITS=>['00','05','10','15','20','25','30','35','40','45','50','55','60','65','70','75','80','85','90','95']" | hbase shell
#3、将hdfs数据导入到 转换成hfile
hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.bulk.output=hdfs:///tmp/output/ads_act_vvm_biz_hfile/$report_date/ -Dimporttsv.columns=HBASE_ROW_KEY,cf:f1 "rtc:ads_act_vvm_biz" hdfs:///tmp/output/ads_act_vvm_biz
#4、导入hbase
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles hdfs:///tmp/output/ads_act_vvm_biz_hfile/$report_date/ rtc:ads_act_vvm_biz
print_duration ${starttime}