1.创建hive表,插入数据
创建test_a
-- Source table A: one row per id, tagged with its accounting day (p_day_id).
create table test_a (
    id       int,
    a        string,
    p_day_id string
);
-- Seed accounting day 20230201 with two rows.
insert into table test_a select 1 as id, 'a1' as a, '20230201' as p_day_id;
insert into table test_a select 2 as id, 'a2' as a, '20230201' as p_day_id;
创建test_b
-- Source table B: carries month_number, which later becomes the partition value of test_c.
create table test_b (
    id           int,
    b            string,
    p_day_id     string,
    month_number int
);
-- Seed accounting day 20230201; id 2 deliberately has a NULL month_number.
insert into table test_b select 1 as id, 'b1' as b, '20230201' as p_day_id, 1 as month_number;
insert into table test_b select 2 as id, 'b2' as b, '20230201' as p_day_id, null as month_number;
创建test_c合并两天数据,分区为month_number
-- Target table for the joined rows; month_number is the partition column, not a regular column.
hive> create table test_c (
    >     id       int,
    >     a        string,
    >     b        string,
    >     p_day_id string
    > )
    > partitioned by (month_number int);
编写插入数据的sql
-- Preview the rows that will be loaded into test_c for accounting day 20230201.
-- NOTE(review): the where-clause predicate on b.p_day_id discards unmatched test_a rows,
-- so this left join behaves like an inner join.
hive> select
    >     a.id,
    >     a.a,
    >     b.b,
    >     a.p_day_id,
    >     b.month_number
    > from test_a a
    > left join test_b b
    >     on a.id = b.id
    > where a.p_day_id = b.p_day_id
    >   and a.p_day_id = '20230201';
1 a1 b1 20230201 1
2 a2 b2 20230201 NULL
将当前结果插入test_c,用动态分区的方式插入,其中第2行数据的分区字段month_number为null
-- Allow all partition columns of the insert to be resolved dynamically from the query result.
hive> set hive.exec.dynamic.partition.mode=nonstrict;
-- Load accounting day 20230201 into test_c.
-- The select list must supply the four regular columns (id, a, b, p_day_id) in table order,
-- followed by the dynamic partition column month_number as the last expression.
-- Rows whose month_number is NULL end up in __HIVE_DEFAULT_PARTITION__.
hive> insert overwrite table test_c partition (month_number)
    > select
    >     a.id,
    >     a.a,
    >     b.b,
    >     a.p_day_id,
    >     b.month_number
    > from test_a a
    > left join test_b b
    >     on a.id = b.id
    > where a.p_day_id = b.p_day_id
    >   and a.p_day_id = '20230201';
查看表分区,产生默认分区__HIVE_DEFAULT_PARTITION__
hive> show partitions test_c ;
OK
month_number=1
month_number=__HIVE_DEFAULT_PARTITION__
查看表数据,统计结果,注意: p_day_id ='20230201'
hive> select * from test_c where p_day_id ='20230201';
1 a1 b1 20230201 1
2 a2 b2 20230201 NULL
-- Count the rows stored in test_c for accounting day 20230201.
hive> select count(1) from test_c where p_day_id = '20230201';
2
添加新账期的数据(其中同样包含month_number为null的记录),p_day_id='20230202',以区别于上面的账期
-- Seed accounting day 20230202 in both source tables; id 4 again has a NULL month_number.
insert into table test_a select 3 as id, 'a3' as a, '20230202' as p_day_id;
insert into table test_a select 4 as id, 'a4' as a, '20230202' as p_day_id;
insert into table test_b select 3 as id, 'b3' as b, '20230202' as p_day_id, 1 as month_number;
insert into table test_b select 4 as id, 'b4' as b, '20230202' as p_day_id, null as month_number;
查看新账期p_day_id='20230202'结果
-- Preview the rows that will be loaded into test_c for accounting day 20230202.
hive> select
    >     a.id,
    >     a.a,
    >     b.b,
    >     a.p_day_id,
    >     b.month_number
    > from test_a a
    > left join test_b b
    >     on a.id = b.id
    > where a.p_day_id = b.p_day_id
    >   and a.p_day_id = '20230202';
3 a3 b3 20230202 1
4 a4 b4 20230202 NULL
插入p_day_id='20230202'的数据
-- Load accounting day 20230202 into test_c using dynamic partitioning on month_number.
-- NOTE: insert overwrite replaces the full content of every partition this query writes to,
-- so rows already stored in those partitions for other p_day_id values are removed.
hive> insert overwrite table test_c partition (month_number)
    > select
    >     a.id,
    >     a.a,
    >     b.b,
    >     a.p_day_id,
    >     b.month_number
    > from test_a a
    > left join test_b b
    >     on a.id = b.id
    > where a.p_day_id = b.p_day_id
    >   and a.p_day_id = '20230202';
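再次查看表,发现本次写入涉及的分区(month_number=1和__HIVE_DEFAULT_PARTITION__)被整体覆盖:insert overwrite是按分区覆盖的,而p_day_id只是普通列而非分区字段,因此20230201的数据完全没了,其中分区为null(默认分区)的数据也被全部覆盖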
hive> select * from test_c;
OK
3 a3 b3 20230202 1
4 a4 b4 20230202 NULL