创建数据表
hive> create database cus;
hive> use cus;
hive> create table telno_md5(
> phone string,
> md5 string )
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY ','
> STORED AS TEXTFILE;
导入数据
hive> load data local inpath '/home/etluser/data/' into table telno_md5;
创建重新分区表
hive> create table telno_md5_prt(
> phone string,
> md5 string )
> partitioned by (prefix string);
使用动态分区,插入数据
hive> set hive.exec.dynamic.partition=true;
hive> set hive.exec.dynamic.partition.mode=nonstrict;
hive> set hive.exec.max.dynamic.partitions.pernode=100000;
hive> set hive.exec.max.dynamic.partitions=100000;
hive> set hive.exec.max.created.files=1000000000;
hive> insert into table telno_md5_prt
> partition (prefix)
> select phone,md5,substr(md5,1,2) as prefix
> from telno_md5;
与桶表的比较
create table telno_md5_bucketed(
phone string,
md5 string )
clustered by(md5) into 1024 buckets;
insert overwrite table telno_md5_bucketed
select phone,md5 from telno_md5;
执行结果比较
数据分割方式 | 实际分割文件数 | 执行时间 | 关联查询时间 |
---|---|---|---|
dynamic partitions | 998 | 27m36s | 16m23s |
bucketed table | 668 | 16m2s | 6m5s |