1. 测试环境中建表
由于 show create table 输出的中文注释会出现乱码,因此改用 desc table 的输出拼接建表语句。
#!/usr/bin/env bash
# Build CREATE TABLE statements for every table that exists in $database on
# this cluster but not yet in the new environment (10.1.8.210).
# `show create table` garbles Chinese comments, so the DDL is rebuilt from
# `desc <table>` output instead.
#
# Outputs: table_all.txt, table_exist.txt, <database>.all.txt
# Temp:    <database>.<table>.txt (removed per table)
# Run the statements collected in <database>.all.txt in the new hive to
# create the tables. New environment ip: 10.1.8.210 — search to replace.

database=dw
remote_host=10.1.8.210

#######################################
# Build one CREATE TABLE statement from a pre-processed `desc` dump.
# Arguments:
#   $1 - database name
#   $2 - table name
#   $3 - file with one "`col` type" pair per line (produced in main)
# Outputs: the complete CREATE TABLE ... ; statement on stdout
#######################################
build_create_table_sql() {
  local db=$1 table=$2 desc_file=$3
  local total part_hdr cols
  total=$(wc -l < "$desc_file")
  # Line number of the "# Partition Information" marker (rendered by the awk
  # preprocessing as "`#` Partition"); empty when the table is unpartitioned.
  part_hdr=$(grep -n '#` Partition' "$desc_file" | awk -F: '{print $1}')
  if [ -z "$part_hdr" ]; then
    cols=$total
    part_hdr=$total
  else
    # desc repeats the partition columns inside the main column list:
    #   total    = cols + partitions + 2 header lines
    #   part_hdr = cols + partitions + 1
    # so 2*part_hdr - total = cols (non-partition columns only).
    cols=$((part_hdr + part_hdr - total))
  fi

  local i=1 content sql="create table $db.$table ( "
  while read -r content; do
    if [ "$i" -le "$cols" ]; then
      # regular column list
      sql="${sql}${content} "
      if [ "$i" -lt "$cols" ]; then sql="${sql}, "; else sql="${sql})"; fi
    elif [ "$i" -ge $((part_hdr + 2)) ]; then
      # partition columns follow the two "# ..." header lines
      if [ "$i" -eq $((part_hdr + 2)) ]; then
        sql="${sql} PARTITIONED BY ( "
      fi
      sql="${sql}${content} "
      if [ "$i" -lt "$total" ]; then sql="${sql}, "; else sql="${sql}) "; fi
    fi
    i=$((i + 1))
  done < "$desc_file"

  sql="${sql}ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.SequenceFileInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat';"
  printf '%s\n' "$sql"
}

main() {
  local table desc_file table_list

  hive -e "use $database;show tables;" > table_all.txt
  ssh "$remote_host" "source ~/.bash_profile;hive -e \" use $database; show tables;\"" > table_exist.txt

  # Tables present locally but missing remotely: merging the remote list
  # twice makes `uniq -u` drop every table that exists on both sides.
  table_list=$(sort -m <(sort table_all.txt | uniq) <(sort table_exist.txt | uniq) <(sort table_exist.txt | uniq) | uniq -u)

  for table in $table_list; do
    echo "*********************************${table}********************************"
    desc_file="${database}.${table}.txt"
    # One "`col` type" pair per line: trim leading whitespace, drop blank
    # lines, back-quote the column name.
    hive -e "use $database;desc $table;" \
      | sed 's/^[ \t]*//g' \
      | sed -e '/^$/d' \
      | awk '{ print "`"$1"`" " " $2 }' > "$desc_file"
    build_create_table_sql "$database" "$table" "$desc_file" >> "${database}.all.txt"
    rm "$desc_file"
    # To create the table on the new hive directly, uncomment:
    # ssh "$remote_host" "source ~/.bash_profile;hive -e \"<createsql>\""
  done
}

# hive is required; skip quietly when absent so build_create_table_sql can
# still be sourced and tested on machines without a hive client.
if command -v hive >/dev/null 2>&1; then
  main
else
  echo "hive not found on PATH; nothing to do" >&2
fi
2. 从正式环境导入数据到测试环境
#!/usr/bin/env bash
# Copy daily partitions of a dw table from the production cluster into the
# test environment, one day at a time.
# Arguments: $1 start date, $2 end date (YYYY-MM-DD, inclusive), $3 table
#
# Hard-coded:
#   hdfs staging path: /tiantian/
#   source cluster:    hftp://10.1.9.99:50070
#   target database:   dw

start_date=${1:-}
end_date=${2:-}
table=${3:-}

#######################################
# Print every date from $1 through $2 (inclusive), one YYYY-MM-DD per line.
#######################################
date_range() {
  local ts end_ts
  ts=$(date -d "$1" +%s) || return 1
  end_ts=$(date -d "$2" +%s) || return 1
  while [ "$ts" -le "$end_ts" ]; do
    date -d "@$ts" +%Y-%m-%d
    ts=$((ts + 86400))
  done
}

#######################################
# True when an hdfs path exists. Replaces the original GNU-only
# `echo $file | wc -L` length check with the command's exit status.
#######################################
hdfs_path_exists() {
  hadoop fs -ls "$1" >/dev/null 2>&1
}

main() {
  local ntime warehouse_dir

  # create the staging path once, if missing
  hdfs_path_exists "/tiantian/${table}" || hadoop fs -mkdir "/tiantian/${table}"

  for ntime in $(date_range "$start_date" "$end_date"); do
    warehouse_dir="/user/hive/warehouse/dw.db/${table}/part_date=${ntime}"
    # Retry until the partition shows up in the warehouse; days already
    # loaded are skipped immediately.
    while ! hdfs_path_exists "$warehouse_dir"; do
      echo "data date ${ntime}"
      # When copying from a lower to a higher hadoop version and checksum.type
      # errors occur, add -update -skipcrccheck or -Ddfs.checksum.type=CRC32.
      hadoop distcp "hftp://10.1.9.99:50070${warehouse_dir}" "/tiantian/${table}"
      hive -e "load data inpath '/tiantian/${table}/part_date=${ntime}' OVERWRITE into table dw.${table} partition (part_date='${ntime}');"
      if ! hdfs_path_exists "$warehouse_dir"; then
        echo "copy failed" >&2
        sleep 1h  # wait for the source data to become ready (was commented out: busy-looped)
      fi
    done
  done
}

# hadoop is required; skip quietly so date_range stays testable without it.
if command -v hadoop >/dev/null 2>&1; then
  main
else
  echo "hadoop not found on PATH; nothing to do" >&2
fi
##测试用数据需单独拷贝的 ##TODO:dictionary data. hadoop fs -mkdir hdfs://ns1/user/hadoop/splitword_dict hadoop fs -mkdir hdfs://ns1/user/hadoop/stopword_dict hadoop fs -mkdir hdfs://ns1/user/hadoop/targetword_dict hadoop distcp hftp://10.1.9.99:50070/user/hadoop/splitword_dict/ hdfs://ns1/user/hadoop/splitword_dict hadoop distcp hftp://10.1.9.99:50070/user/hadoop/stopword_dict hdfs://ns1/user/hadoop/stopword_dict hadoop distcp hftp://10.1.9.99:50070/user/hadoop/targetword_dict hdfs://ns1/user/hadoop/targetword_dict hadoop fs -mkdir hdfs://ns1/tiantian hadoop distcp hftp://10.1.9.99:50070/user/hive/warehouse/temp.db/hejy_sy_id_area/ hdfs://ns1/tiantian/hejy_sy_id_area ##TODO:load data : load data inpath '/tiantian' OVERWRITE into table temp.hejy_sy_id_area ;
3. 测试环境业务代码测试与性能评估
1 #!/usr/bin/env bash 2 # 输入参数: 3 # $1 功能选项(1:运行hql,2:检查结果ERROR_LIST.txt,平均用时统计)、 4 # $2 包含语句列表及hql的文件名、 5 # $3 运行模式(1:mr,2:spark)、 6 # $4 一条语句运行次数 7 # nohup bash start_hql.sh 3 all_hql.sh 1 1 > nohup 2>&1 & 8 9 stage=$1 10 hqlfile=$2 11 mode=$3 12 times=$4 13 14 source $hqlfile 15 case $mode in 16 1) 17 #echo 'You select 1, mr' 18 prefix="set hive.execution.engine=mr; " 19 mode_name="mr"; 20 ;; 21 2) 22 #echo 'You select 2, spark' 23 prefix="set hive.execution.engine=spark; " 24 mode_name="spark"; 25 ;; 26 *) echo 'You do not select a mode number between 1 to 2' 27 ;; 28 esac 29 30 case $stage in 31 1) 32 cd running 33 for i in `seq $times` 34 do 35 for hql_name in ${hql_list[@]} 36 do 37 hql=`eval echo '$'"$hql_name"` 38 hql="use dw;"$prefix$hql 39 #echo $hql 40 # beeline -u jdbc:hive2://10.1.8.206:10000 -n hadoop -e "select count(*) from daoyu_user_info;" 41 # beeline -u jdbc:hive2://10.1.8.206:10000 -n hadoop -hiveconf hive.execution.engine=mr -e "select count(*) from daoyu_user_info;" 42 hive -e "$hql" &> ../result/$mode_name"_"$hql_name"."$i".txt" 43 done 44 done 45 cd .. 46 ;; 47 2) 48 #checking result 49 cd result 50 for hql_name in ${hql_list[@]} 51 do 52 #echo $hql_name >>../EVALUATION.txt 53 var_snd=0.0 54 counter=0 55 for i in `seq $times` 56 do 57 var_name=$mode_name"_"$hql_name"."$i".txt" 58 sed -i '/WARN: /d' $var_name 59 check=`tail -n 1 $var_name | grep 'Time taken: '` 60 if [ -z "$check" ]; then 61 echo 'error: '$var_name >>../ERROR_LIST.txt 62 else 63 let counter++ 64 time_list=`cat $var_name | grep 'Time taken: '|awk '{print $3}'` 65 for tmp_b in ${time_list[@]} 66 do 67 var_snd=`echo "$var_snd + $tmp_b" | bc` 68 done 69 fi 70 done 71 if [ $counter -gt 0 ]; then 72 avg_time=`echo "scale=3; $var_snd / $counter" | bc` 73 echo $hql_name$'\t'$avg_time>>../EVALUATION.txt 74 else 75 echo $hql_name$'\t''ALL_FAIL' >>../EVALUATION.txt 76 fi 77 done 78 cd .. 79 ;; 80 *) echo 'You do not select a stage number between 1 to 2' 81 ;; 82 esac