Hive Upgrade: Version Comparison Testing Process

This article describes how to build Hive table structures in a test environment and import data from production to validate business workloads. It covers three steps: batch-creating Hive tables with a script, copying data into the test environment over HDFS, and running HQL queries to evaluate their performance.

1. Creating Tables in the Test Environment

Because show create table garbles Chinese characters, we instead reconstruct each CREATE TABLE statement by stitching together the output of desc table.
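For illustration, here is what that stitching produces. Given a hypothetical partitioned table t_user (the table and column names below are made up for this sketch, not from the original post), the cleaned-up desc output and the statement the script assembles from it would look roughly like this:

hive> desc t_user;
uid         bigint
name        string
part_date   string
# Partition Information
# col_name  data_type  comment
part_date   string

-- assembled by the script below:
create table dw.t_user ( `uid` bigint, `name` string )  PARTITIONED BY ( `part_date` string ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.SequenceFileInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat';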

#!/usr/bin/env bash
# Generate CREATE TABLE statements for every table in the dw (or temp) database,
# selected via database=dw below. Produces table_all.txt, table_exist.txt, and
# <database>.all.txt; the temporary <database>.<table>.txt files are deleted as we go.
# Run the contents of <database>.all.txt in the new environment's Hive to create the tables.
# New environment IP: 10.1.8.210; search and replace as needed.

database=dw
#tmp_path=/home/hadoop/tiantian/transformer/reverse/${database}

# Tables in the old environment, and tables already present in the new one.
hive -e "use $database;show tables;" > table_all.txt
ssh 10.1.8.210 "source ~/.bash_profile;hive -e \"use $database; show tables;\"" > table_exist.txt
# Set difference: merging table_exist.txt twice and keeping only unique lines (-u)
# leaves the tables that exist in the old environment but not yet in the new one.
table_list=`sort -m <(sort table_all.txt | uniq) <(sort table_exist.txt | uniq) <(sort table_exist.txt | uniq) | uniq -u`

for table in $table_list
do
    echo "*********************************"$table"********************************"

    # Build the column list from desc output: trim leading whitespace, drop blank
    # lines, and keep only `column_name` data_type.
    hive -e "use $database;desc $table;" | sed 's/^[ \t]*//g' | sed -e '/^$/d' | awk '{ print "`"$1"`" " " $2 }' > ${database}.${table}.txt
    lines=`cat ${database}.${table}.txt | wc -l`
    # Line number of the "# Partition Information" marker, if the table is partitioned.
    linei=$(grep -n "#\` Partition" ${database}.${table}.txt | awk -F":" '{print $1}')
    if [ -z "$linei" ]; then
        # Not partitioned: every line is a regular column.
        var=$lines
        linei=$lines
    else
        # Partitioned: desc lists partition columns twice (once in the column list,
        # once in the partition section), so the number of regular columns works
        # out to linei + linei - lines.
        var=`expr $linei + $linei - $lines`
    fi

    i=1
    createsql="create table "$database"."${table}" ( "
    while read content
    do
        if [ $i -le $((var)) ]; then
            # Regular columns.
            createsql=${createsql}${content}" "
            if [ $i -lt $((var)) ];then
                createsql=${createsql}", "
            else
                createsql=${createsql}")"
            fi
        else
            # Partition columns start two lines after the "# Partition Information" marker.
            if [ $i -ge $((linei+2)) ]; then
                if [ $i -eq $((linei+2)) ]; then
                    createsql=${createsql}"  PARTITIONED BY ( "
                fi
                createsql=${createsql}${content}" "
                if [ $i -lt $lines ];then
                    createsql=${createsql}", "
                else
                    createsql=${createsql}") "
                fi
            fi
        fi
        i=$((i + 1))
    done < ${database}.${table}.txt

    createsql=${createsql}"ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.SequenceFileInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat';"
    echo $createsql >> ${database}.all.txt

    rm ${database}.${table}.txt
    # Create the table in the new Hive directly (hive -f file.sql suits multi-statement files):
    #ssh 10.1.8.210 "source ~/.bash_profile;hive -e \"${createsql}\""
done
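Once dw.all.txt has been generated, a natural way to apply it is hive -f on the new cluster, then compare table counts on both sides. A minimal sketch, assuming the file is copied to /tmp on the new machine:

scp dw.all.txt 10.1.8.210:/tmp/dw.all.txt
ssh 10.1.8.210 "source ~/.bash_profile; hive -f /tmp/dw.all.txt"
# Sanity check: both counts should match afterwards.
hive -e "use dw; show tables;" | wc -l
ssh 10.1.8.210 "source ~/.bash_profile; hive -e \"use dw; show tables;\" | wc -l"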


2. Importing Data from Production into the Test Environment

#!/usr/bin/env bash
# Input parameters: $1 start date, $2 end date, $3 table name
#
# Hard-coded values:
#   HDFS staging path: /tiantian/
#   target database: dw

start_date=$1
end_date=$2
table=$3

# Create the staging path for the copy if it does not exist yet.
file=`hadoop fs -ls /tiantian/${table}`
nums=`echo $file | wc -L`
if [ "$nums" = "0" ]; then
    hadoop fs -mkdir /tiantian/$table
fi

begin=`date -d "$start_date" +%s`
end=`date -d "$end_date" +%s`
while [ "$begin" -le "$end" ]
do
    ntime=$(date -d @$begin +"%Y-%m-%d")

    # Skip dates whose partition has already been loaded.
    file=`hadoop fs -ls /user/hive/warehouse/dw.db/${table}/part_date=${ntime}`
    nums=`echo $file | wc -L`

    while [ "$nums" = "0" ]
    do
        echo "data date"${ntime}
        # When copying from a lower HDFS version to a higher one, if checksum.type
        # errors occur, add the -update -skipcrccheck options or -Ddfs.checksum.type=CRC32.
        hadoop distcp hftp://10.1.9.99:50070/user/hive/warehouse/dw.db/${table}/part_date=${ntime} /tiantian/${table}
        hive -e "load data inpath '/tiantian/"${table}"/part_date="${ntime}"' OVERWRITE into table dw."${table}" partition (part_date='"${ntime}"');"
        # Re-check and retry until the partition shows up in the warehouse.
        file=`hadoop fs -ls /user/hive/warehouse/dw.db/${table}/part_date=${ntime}`
        nums=`echo $file | wc -L`
        if [ "$nums" = "0" ]; then
            echo "copy failed"
            #sleep 1h # wait for the source data to be ready before retrying
        fi
    done

    begin=$((begin+86400))
done
## Test data that has to be copied separately
## TODO: dictionary data.
hadoop fs -mkdir hdfs://ns1/user/hadoop/splitword_dict
hadoop fs -mkdir hdfs://ns1/user/hadoop/stopword_dict
hadoop fs -mkdir hdfs://ns1/user/hadoop/targetword_dict
hadoop distcp hftp://10.1.9.99:50070/user/hadoop/splitword_dict/ hdfs://ns1/user/hadoop/splitword_dict
hadoop distcp hftp://10.1.9.99:50070/user/hadoop/stopword_dict hdfs://ns1/user/hadoop/stopword_dict
hadoop distcp hftp://10.1.9.99:50070/user/hadoop/targetword_dict hdfs://ns1/user/hadoop/targetword_dict
hadoop fs -mkdir hdfs://ns1/tiantian
hadoop distcp hftp://10.1.9.99:50070/user/hive/warehouse/temp.db/hejy_sy_id_area/ hdfs://ns1/tiantian/hejy_sy_id_area
## TODO: load data: load data inpath '/tiantian' OVERWRITE into table temp.hejy_sy_id_area;
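To backfill several tables over the same date range, the import script can be driven by a small wrapper. A hedged sketch: the import_table.sh file name is an assumption (save the script above under that name), and the date range is illustrative; the two table names appear elsewhere in this post.

#!/usr/bin/env bash
# Hypothetical wrapper: import a list of tables for one date range.
start_date=2017-06-01
end_date=2017-06-07
for table in daoyu_user_info hejy_sy_id_area
do
    bash import_table.sh "$start_date" "$end_date" "$table"
done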


3. Business Code Testing and Performance Evaluation in the Test Environment

#!/usr/bin/env bash
# Input parameters:
#   $1 stage (1: run the HQL; 2: check results, writing ERROR_LIST.txt and average run times)
#   $2 file containing the statement list and the HQL definitions
#   $3 execution mode (1: mr, 2: spark)
#   $4 number of runs per statement
# Example: nohup bash start_hql.sh 1 all_hql.sh 1 1 > nohup 2>&1 &

stage=$1
hqlfile=$2
mode=$3
times=$4

# The sourced file defines the hql_list array plus one variable per query.
source $hqlfile
case $mode in
    1)
    prefix="set hive.execution.engine=mr; "
    mode_name="mr";
    ;;
    2)
    prefix="set hive.execution.engine=spark; "
    mode_name="spark";
    ;;
    *)  echo 'Invalid mode: choose 1 (mr) or 2 (spark)'
    ;;
esac

case $stage in
    1)
    cd running
    for i in `seq $times`
    do
        for hql_name in ${hql_list[@]}
        do
            # Indirect expansion: resolve the query text stored in the variable
            # whose name is held in hql_name.
            hql=${!hql_name}
            hql="use dw;"$prefix$hql
            # beeline alternatives:
            #   beeline -u jdbc:hive2://10.1.8.206:10000 -n hadoop -e "select count(*) from daoyu_user_info;"
            #   beeline -u jdbc:hive2://10.1.8.206:10000 -n hadoop -hiveconf hive.execution.engine=mr -e "select count(*) from daoyu_user_info;"
            hive -e "$hql" &> ../result/$mode_name"_"$hql_name"."$i".txt"
        done
    done
    cd ..
    ;;
    2)
    # Check results and compute the average run time per query.
    cd result
    for hql_name in ${hql_list[@]}
    do
        var_snd=0.0
        counter=0
        for i in `seq $times`
        do
            var_name=$mode_name"_"$hql_name"."$i".txt"
            sed -i '/WARN: /d' $var_name
            # A successful run ends with a "Time taken: ..." line.
            check=`tail -n 1 $var_name | grep 'Time taken: '`
            if [ -z "$check" ]; then
                echo 'error: '$var_name >> ../ERROR_LIST.txt
            else
                let counter++
                # Sum every "Time taken" value in the log (multi-statement runs
                # report one per statement).
                time_list=`cat $var_name | grep 'Time taken: ' | awk '{print $3}'`
                for tmp_b in ${time_list[@]}
                do
                    var_snd=`echo "$var_snd + $tmp_b" | bc`
                done
            fi
        done
        if [ $counter -gt 0 ]; then
            avg_time=`echo "scale=3; $var_snd / $counter" | bc`
            echo $hql_name$'\t'$avg_time >> ../EVALUATION.txt
        else
            echo $hql_name$'\t''ALL_FAIL' >> ../EVALUATION.txt
        fi
    done
    cd ..
    ;;
    *)  echo 'Invalid stage: choose 1 (run) or 2 (check)'
    ;;
esac
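The original post does not show the sourced statement file. A minimal sketch of what the runner expects from all_hql.sh: an hql_list array of names, each naming a shell variable that holds one query. The names and queries below are illustrative assumptions.

#!/usr/bin/env bash
# Hypothetical all_hql.sh: sourced by the runner, which iterates hql_list
# and resolves each name to its query text via indirect expansion.
hql_list=(q_user_count q_area_count)

q_user_count="select count(*) from daoyu_user_info;"
q_area_count="select count(*) from temp.hejy_sy_id_area;"

After stage 2 completes, EVALUATION.txt holds one tab-separated line per query with its average "Time taken" across runs, and ERROR_LIST.txt names any run whose log did not end in a Time taken line.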


Reposted from: https://www.cnblogs.com/tiantianblog/p/7069903.html
