DWD层构建思路
ODS层数据是近源层,基本上不在这层做过大的格式或数据转换(一般仅对数据进行格式上的转换,比如说对json、URL等格式的日志文件,提取出列数据),因此在DWD层,我们会把数据做一定的整理和优化,并加入一些常用维度,比如说日期、地域等,下面案例简单介绍下DWD层
方向一:
创建分区表
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_myshops.dwd_logs partition(actdate)
select userid,action,acttime,version,system,goodid,title,
price,shopid,mark,soft,soft_version,
from_unixtime(cast(acttime/1000 as bigint),'yyyyMMdd') as actdate
from ods_myshops.ods_newlog;
方向二:
使用压缩格式,本例使用orc格式
-- 导入ods_orders dwd_myshops.dwd_orders
create table dwd_myshops.dwd_orders(
orderid int,
orderdate string,
userid int,
orderstatus int,
orderno string,
goodid int,
buynum int
)
row format delimited fields terminated by ','
stored as orc;
insert overwrite table dwd_myshops.dwd_orders select * from ods_myshops.ods_orders;
方向三:
增加常用维度,比如时间、地域、职业、学历等维度
-- 创建日期表并导入一年日期
create table dwd_myshops.dwd_dates(
id int,
fdate string,
years int,
months int,
dates int,
week int,
quarter int
)
row format delimited
fields terminated by ','
stored as orc;
本例完整代码如下:
-- 创建dwd层数据库
drop database if exists dwd_myshops cascade;
create database dwd_myshops;
-- 创建dwd用户行为表,动态分区
create table dwd_myshops.dwd_logs(
userid int,
action string,
acttime string,
version string,
system string,
goodid string,
title string,
price string,
shopid string,
mark string,
soft string,
soft_version string
)
partitioned by (actdate string)
row format delimited fields terminated by ','
stored as orc;
-- 开启动态分区
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_myshops.dwd_logs partition(actdate)
select userid,action,acttime,version,system,goodid,title,
price,shopid,mark,soft,soft_version,
from_unixtime(cast(acttime/1000 as bigint),'yyyyMM') as actdate
from ods_myshops.ods_newlog;
-- 导入ods_orders dwd_myshops.dwd_orders
create table dwd_myshops.dwd_orders(
orderid int,
orderdate string,
userid int,
orderstatus int,
orderno string,
goodid int,
buynum int
)
row format delimited fields terminated by ','
stored as orc;
insert overwrite table dwd_myshops.dwd_orders select * from ods_myshops.ods_orders;
--导入商品表
create table dwd_myshops.dwd_goods(
goodid int,
title string,
price double,
typeid int,
issale int,
score int,
shopid int,
paytype int,
delailtabname string
)
row format delimited fields terminated by ','
stored as orc;
insert overwrite table dwd_myshops.dwd_goods select * from ods_myshops.ods_goods;
-- 导入用户表
create table dwd_myshops.dwd_userinfos(
userid int,
username string,
name string,
email string,
password string,
cardno string,
gender int,
localid int,
mobile string,
qq string,
wechat string
)
row format delimited fields terminated by ','
stored as orc;
insert overwrite table dwd_myshops.dwd_userinfos select * from ods_myshops.ods_userinfos;
-- 创建日期表并导入一年日期
create table dwd_myshops.dwd_dates(
id int,
fdate string,
years int,
months int,
dates int,
week int,
quarter int
)
row format delimited
fields terminated by ','
stored as orc;
shell脚本生成日期维度
#!/bin/bash
begin_time=$1
over_time=$2
# 计算两个日期间的天数
#将用户输入的时间转为时间戳
btimestamp=`date -d ${begin_time} +%s`
otimestamp=`date -d ${over_time} +%s`
let days=($otimestamp-$btimestamp)/86400
#循环向hive中填充对应数据
int=0
str="insert overwrite table dwd_myshops.dwd_dates values "
while(( $int<=$days ))
do
# 数字‘08’会产生二进制异常,在计算时会被系统判为8进制,需要转为10进制
fdy=`date -d "${begin_time}+${int} days" "+%Y-%m-%d"`
# 在date基础上提取
# years=${fdy:0:4}
# months=${fdy:5:2}
# dy=${fdy:8:2}
# week=`date -d ${fdy} +%w`
years=`date -d "${begin_time}+${int} days" "+%Y"`
months=`date -d "${begin_time}+${int} days" "+%m"`
dy=`date -d "${begin_time}+${int} days" "+%d"`
week=`date -d "${begin_time}+${int} days" "+%w"`
quarter=1
if [ $months -lt 4 ];then quarter=1
elif [ $months -lt 7 ];then quarter=2
elif [ $months -lt 10 ];then quarter=3
else
quarter=4;
fi
# echo "${months}"
let "int++"
# 执行hive命令完成数据填充
str=$str"(${int},'${fdy}',${years},${months},${dy},${week},${quarter}),"
done
hive -e "${str%?}"