1. 新建拉链表
drop table if exists dwd_dim_user_info_his;
create external table dwd_dim_user_info_his
(
id string COMMENT '用户 id',
name string COMMENT '姓名',
birthday string COMMENT '生日',
gender string COMMENT '性别',
email string COMMENT '邮箱',
user_level string COMMENT '用户等级',
create_time string COMMENT '创建时间',
operate_time string COMMENT '操作时间',
start_date string COMMENT '有效开始日期',
end_date string COMMENT '有效结束日期'
)
COMMENT '用户拉链表'
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/dwd/dwd_dim_user_info_his/';
2. 导入数据到拉链表;初始化拉链表
insert overwrite table dwd_dim_user_info_his
select id, name, birthday, gender, email, user_level, create_time, operate_time, '2025-08-11',
'9999-99-99'
from ods_user_info oi
where oi.dt='2025-08-11';

手动:新增加2条2025-08-12的数据,同时修改1条2025-08-11的数据;
执行脚本bash mysql_to_hdfs.sh user_info 2025-08-12

后再导入数据;
hive -e "load data inpath '/origin_data/duoduo_db/db/user_info/2025-08-12' OVERWRITE into table duoduo_db.ods_user_info partition(dt='2025-08-12');"
此时ods_user_info表里应该有43条数据;

4.新建拉链表的临时表
drop table if exists dwd_dim_user_info_his_tmp;
create external table dwd_dim_user_info_his_tmp
(
id string COMMENT '用户 id',
name string COMMENT '姓名',
birthday string COMMENT '生日',
gender string COMMENT '性别',
email string COMMENT '邮箱',
user_level string COMMENT '用户等级',
create_time string COMMENT '创建时间',
operate_time string COMMENT '操作时间',
start_date string COMMENT '有效开始日期',
end_date string COMMENT '有效结束日期'
)
COMMENT '用户拉链临时表'
row format delimited fields terminated by '\t'
location '/origin_data/duoduo_db/dwd/dwd_dim_user_info_his_tmp/';
查询如下:
select id, name, birthday, gender, email, user_level, create_time,operate_time,
'2025-08-12' start_date,'9999-99-99' end_date
from ods_user_info where dt= '2025-08-12';

select d.id, d.name, d.birthday, d.gender, d.email,
d.user_level, d.create_time,d.operate_time, d.start_date,
if(o.id is not null,'2025-08-12',d.end_date) end_date
from dwd_dim_user_info_his d
left join ods_user_info o on d.id=o.id and o.dt='2025-08-12'
and d.end_date='9999-99-99';
显示如下:

5.插入到拉链表的临时表
insert overwrite table dwd_dim_user_info_his_tmp
select d.id, d.name, d.birthday, d.gender, d.email,
d.user_level, d.create_time,d.operate_time, d.start_date,
if(o.id is not null,'2025-08-12',d.end_date) end_date
from dwd_dim_user_info_his d
left join ods_user_info o on d.id=o.id and o.dt='2025-08-12'
and d.end_date='9999-99-99'
union
select id, name, birthday, gender, email, user_level, create_time,operate_time,
'2025-08-12' start_date,'9999-99-99' end_date
from ods_user_info where dt= '2025-08-12';
(共计:40+3=43行数据,保留了原始数据)

这个只是为了修改修改过的那条数据的end_date,就这一个作用(用到left join的原因)

这个是为了下次的修改使用的固定格式
6.把拉链表的临时表中的所有数据覆盖到拉链表中
insert overwrite table dwd_dim_user_info_his
select * from dwd_dim_user_info_his_tmp;
7. 验证拉链表中的数据
select * from dwd_dim_user_info_his;


705

被折叠的 条评论
为什么被折叠?



