1 业务数据
业务数据一般存入mysql中,过程较为单一简单,不再赘述
2 业务数据导入数仓
2.1 表的同步策略
序号 | 表名 | 解释 | 同步策略 |
1 | order_info | 订单表 | 新增及变化 |
2 | order_detail | 订单详情 | 增量 |
3 | sku_info | 商品表 | 全量 |
4 | user_info | 用户表 | 全量 |
5 | base_category1 | 商品一级分类表 | 全量 |
6 | base_category2 | 商品二级分类表 | 全量 |
7 | base_category3 | 商品三级分类表 | 全量 |
8 | payment_info | 支付流水表 | 增量 |
2.2 Sqoop导入脚本
#!/bin/bash
db_date=$2
echo ${db_date}
db_name=gmall
import_data(){
/opt/module/sqoop/bin/sqoop import \
--connect jdbc:mysql://hadoop:3306/${db_name} \
--username root \
--password 000000 \
--target-dir /origin_data/${db_name}/db/$1/${db_date} \
--delete-target-dir \
--num-mappers 1 \
--fields-terminated-by "\t" \
--query "$2"' and ${conditions};'
}
import_sku_info(){
import_data "sku_info" "
SELECT
id,
spu_id,
price,
sku_name,
sku_desc,
weight,
tm_id,
category3_id,
create_time
FROM sku_info
WHERE 1=1 #全量导入,恒等,防止sql注入
"
}
import_user_info(){
import_data "user_info" "
SELECT
id,
name,
birthday,
gender,
email,
user_level,
create_time
FROM user_info
WHERE 1=1
"
}
import_base_category1(){
import_data "base_category1" "
SELECT
id,
name
FROM base_category1
WHERE 1=1
"
}
import_base_category2(){
import_data "base_category2" "
SELECT
id,
name
FROM base_category2
WHERE 1=1
"
}
import_base_category3(){
import_data "base_category3" "
SELECT
id,
name
FROM base_category3
WHERE 1=1
"
}
import_order_detail(){
import_data "order_detail" "
SELECT
od.id,
order_id,
user_id,
sku_id,
sku_name,
order_price,
sku_num,
o.create_time
FROM order_detail
WHERE
o.id=od.order_id
AND DATE_FORMAT(create_time,'%Y-%m-%d')='${db_date}'
"
}
import_payment_info(){
import_data "payment_info" "
SELECT
id,
out_trade_no,
order_id,
user_id,
alipay_trade_no,
total_amount,
subject,
payment_type,
payment_time
FROM payment_info
WHERE DATE_FORMAT(create_time,'%Y-%m-%d')='${db_date}'
"
}
import_order_info(){
import_data "order_info" "
SELECT
id,
total_amount,
order_status,
user_id,
payment_way,
out_trade_no,
create_time,
operate_tome
FROM order_info
WHERE
DATE_FORMAT(create_time,'%Y-%m-%d')='${db_date}'
OR DATE_FORMAT(operate_tome,'%Y-%m-%d')='${db_date}'
"
}
case $l in
"base_category1")
import_base_category1
;;
"base_category2")
import_base_category2
;;
"base_category3")
import_base_category3
;;
"order_info")
import_order_info
;;
"order_detail")
import_order_detail
;;
"sku_info")
import_sku_info
;;
"user_info")
import_user_info
;;
"paryment_info")
import_paryment_info
;;
"order_info")
import_order_info
;;
"all")
import_base_category1
import_base_category2
import_base_category3
import_order_info
import_order_detail
import_sku_info
import_user_info
import_paryment_info
import_order_info
;;
esac
#执行语句:sqoop imoprt.sh all 2020-07-05
3 在仓库中构建对应表
3.1 创建ODS表
DROP TABLE IF EXISTS ods_order_info;
CREATE TABLE ods_order_info(
`id` STRING COMMENT '订单编号',
`total_amount` STRING COMMENT '订单金额',
`order_status` STRING COMMENT '订单状态',
`user_id` STRING COMMENT '用户id',
`payment_way` STRING COMMENT '支付方式',
`out_trade_no` STRING COMMENT '支付流水号',
`create_time` STRING COMMENT '创建时间',
`operate_time` STRING COMMENT '操作时间'
)COMMENT '订单表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_order_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS ods_order_detail;
CREATE TABLE ods_order_info(
`id` STRING COMMENT '订单编号',
`order_id` STRING COMMENT '订单号',
`user_id` STRING COMMENT '用户id',
`sku_id` STRING COMMENT '商品id',
`sku_name` STRING COMMENT '商品名称',
`order_price` STRING COMMENT '下单价格',
`sku_num` STRING COMMENT '商品数量',
`create_time` STRING COMMENT '创建时间'
)COMMENT '订单明细表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_order_detail'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS ods_sku_info;
CREATE TABLE ods_order_info(
`id` STRING COMMENT 'skuid',
`spu_id` STRING COMMENT 'spuId',
`price` STRING COMMENT '价格',
`sku_name` STRING COMMENT '商品名称',
`sku_desc` STRING COMMENT '商品描述',
`weight` STRING COMMENT '重量',
`tm_id` STRING COMMENT '品牌id',
`category3_id` STRING COMMENT '品类id',
`create_time` STRING COMMENT '创建时间'
)COMMENT '商品表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_sku_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS ods_user_info;
CREATE TABLE ods_user_info(
`id` STRING COMMENT '用户id',
`name` STRING COMMENT '姓名',
`birthday` STRING COMMENT '生日',
`gender` STRING COMMENT '性别',
`email` STRING COMMENT '邮箱',
`user_level` STRING COMMENT '用户等级',
`create_time` STRING COMMENT '创建时间'
)COMMENT '用户表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_user_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS ods_base_category1;
CREATE TABLE ods_base_category1(
`id` STRING COMMENT 'id',
`total_amount` STRING COMMENT '名称'
)COMMENT '商品一级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category1'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS ods_base_category2;
CREATE TABLE ods_base_category2(
`id` STRING COMMENT 'id',
`total_amount` STRING COMMENT '名称',
category1_id STRING COMMENT '一级品类id'
)COMMENT '商品二级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category2'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS ods_base_category3;
CREATE TABLE ods_base_category3(
`id` STRING COMMENT 'id',
`total_amount` STRING COMMENT '名称',
category1_id STRING COMMENT '二级品类id'
)COMMENT '商品三级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category3'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS ods_payment_info;
CREATE TABLE ods_payment_info(
`id` STRING COMMENT '编号',
`out_trade_no` STRING COMMENT '对外业务编号',
`order_id` STRING COMMENT '订单编号',
`user_id` STRING COMMENT '用户编号',
`alipay_trade_no` STRING COMMENT '支付宝交易流水编号',
`total_amount` STRING COMMENT '支付金额',
`subject` STRING COMMENT '交易内容',
`payment_type` STRING COMMENT '支付类型',
`payment_time` STRING COMMENT '支付时间'
)COMMENT '订单表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_payment_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;
3.2 将数据导入表中
#!/bin/bash
do_date=$1
hql="
LOAD DATA INPATH '/origin_data/gmall/order_info/${do_date} OVERWRITE INTO TABLE gmall.ods_order_info PARTITION(dt='${do_date}');
LOAD DATA INPATH '/origin_data/gmall/order_detail/${do_date} OVERWRITE INTO TABLE gmall.ods_order_detail PARTITION(dt='${do_date}');
LOAD DATA INPATH '/origin_data/gmall/sku_info/${do_date} OVERWRITE INTO TABLE gmall.ods_sku_info PARTITION(dt='${do_date}');
LOAD DATA INPATH '/origin_data/gmall/user_info/${do_date} OVERWRITE INTO TABLE gmall.ods_user_info PARTITION(dt='${do_date}');
LOAD DATA INPATH '/origin_data/gmall/payment_info/${do_date} OVERWRITE INTO TABLE gmall.ods_payment_info PARTITION(dt='${do_date}');
LOAD DATA INPATH '/origin_data/gmall/base_category1/${do_date} OVERWRITE INTO TABLE gmall.ods_base_category1 PARTITION(dt='${do_date}');
LOAD DATA INPATH '/origin_data/gmall/base_category2/${do_date} OVERWRITE INTO TABLE gmall.ods_base_category2 PARTITION(dt='${do_date}');
LOAD DATA INPATH '/origin_data/gmall/base_category3/${do_date} OVERWRITE INTO TABLE gmall.ods_base_category3 PARTITION(dt='${do_date}');
"
hive -e "${hql}"
3.3 DWD层
基于ODS层对数据进行判空过滤,对商品分类表进行维度退化(降维)
PS:如果被退化的维度,还有其他业务表使用,退化后处理起来,会相对麻烦些,再实际业务中,还有时间、商品分类、地域等可以进行维度退化
3.4 创建DWD表
DROP TABLE IF EXISTS dwd_order_info;
CREATE TABLE dwd_order_info(
`id` STRING COMMENT '',
`total_amount` DECIMAL(10,2) COMMENT '',
`order_status` STRING COMMENT '1 2 3 4 5',
`user_id` STRING COMMENT 'id',
`payment_way` STRING COMMENT '',
`out_trade_no` STRING COMMENT '',
`create_time` STRING COMMENT '',
`operate_time` STRING COMMENT ''
)COMMENT '订单表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_order_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS dwd_order_detail;
CREATE TABLE dwd_order_detail(
`id` STRING COMMENT '',
`order_id` STRING COMMENT '',
`user_id` STRING COMMENT 'id',
`sku_id` STRING COMMENT 'id',
`sku_name` STRING COMMENT '',
`order_price` STRING COMMENT '',
`sku_num` STRING COMMENT '',
`create_time` STRING COMMENT ''
)COMMENT '订单明细表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_order_detail'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS dwd_user_info;
CREATE TABLE dwd_user_info(
`id` STRING COMMENT 'id',
`name` STRING COMMENT '',
`birthday` STRING COMMENT '',
`gender` STRING COMMENT '',
`email` STRING COMMENT '',
`user_level` STRING COMMENT '',
`create_time` STRING COMMENT ''
)COMMENT '用户表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_user_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS dwd_payment_info;
CREATE TABLE dwd_payment_info(
`id` STRING COMMENT '',
`out_trade_no` STRING COMMENT '',
`order_id` STRING COMMENT '',
`user_id` STRING COMMENT '',
`alipay_trade_no` STRING COMMENT '',
`total_amount` STRING COMMENT '',
`subject` STRING COMMENT '',
`payment_type` STRING COMMENT '',
`payment_time` STRING COMMENT ''
)COMMENT '支付流水表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_payment_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS dwd_sku_info;
CREATE TABLE ods_order_info(
`id` STRING COMMENT 'skuid',
`spu_id` STRING COMMENT 'spuId',
`price` STRING COMMENT '价格',
`sku_name` STRING COMMENT '商品名称',
`sku_desc` STRING COMMENT '商品描述',
`weight` STRING COMMENT '重量',
`tm_id` STRING COMMENT '品牌id',
`category3_id` STRING COMMENT '3id',
`category2_id` STRING COMMENT '2id',
`category1_id` STRING COMMENT '1id',
`category3_name` STRING COMMENT '3',
`category2_name` STRING COMMENT '2',
`category1_name` STRING COMMENT '1',
`create_time` STRING COMMENT '创建时间'
)COMMENT '商品表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/ods_sku_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS ods_base_category1;
CREATE TABLE ods_base_category1(
`id` STRING COMMENT 'id',
`total_amount` STRING COMMENT '名称'
)COMMENT '商品一级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category1'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS ods_base_category2;
CREATE TABLE ods_base_category2(
`id` STRING COMMENT 'id',
`total_amount` STRING COMMENT '名称',
category1_id STRING COMMENT '一级品类id'
)COMMENT '商品二级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category2'
TBLPROPERTIES ("parquet.compression"="snappy")
;
DROP TABLE IF EXISTS ods_base_category3;
CREATE TABLE ods_base_category3(
`id` STRING COMMENT 'id',
`total_amount` STRING COMMENT '名称',
category1_id STRING COMMENT '二级品类id'
)COMMENT '商品三级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category3'
TBLPROPERTIES ("parquet.compression"="snappy")
;
3.5 DWD层导入数据
#!/bin/bash
if [ -n $1 ] ;then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
hql="
SET hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE gmall.dwd_order_info PARTITION(dt)
SELECT
*
FROM gmall.ods_order_info
WHERE
dt='${log_date}'
AND id IS NOT NULL;
INSERT OVERWRITE TABLE gmall.dwd_order_detail PARTITION(dt)
SELECT
*
FROM gmall.ods_order_detail
WHERE
dt='${log_date}'
AND id IS NOT NULL;
INSERT OVERWRITE TABLE gmall.dwd_user_info PARTITION(dt)
SELECT
*
FROM gmall.ods_user_info
WHERE
dt='${log_date}'
AND id IS NOT NULL;
INSERT OVERWRITE TABLE gmall.dwd_payment_info PARTITION(dt)
SELECT
*
FROM gmall.ods_payment_info
WHERE
dt='${log_date}'
AND id IS NOT NULL;
INSERT OVERWRITE TABLE gmall.dwd_sku_info PARTITION(dt)
SELECT
sku.id,
sku.spu_id,
sku.price,
sku.sku_name,
sku.sku_desc,
sku.weight,
sku.tm_id,
sku.category3_id,
c2.id category2_id,
c1.id category1_id,
c3.id category3_name,
c2.id category2_name,
c1.id category1_name,
sku.create_time,
sku.dt
FROM
gmall.ods_sku_info sku
JOIN gmall.ods_base_category3 c3 ON sku.category3_id=c3.id
JOIN gmall.ods_base_category2 c2 ON sku.category2_id=c3.id
JOIN gmall.ods_base_category1 c1 ON sku.category1_id=c3.id
WHERE
sku.dt='${log_date}'
AND c2.dt='${log_date}'
AND c3.dt='${log_date}'
AND c1.dt='${log_date}'
AND sku.id IS NOT NULL;
"
hive -e ${hql}
3.6 DWS
3.6.1 创建用户行为宽表
把每个用户单日的行为聚合起来组成一张多列宽表,以便以后关联用户维度信息后进行不同角度的统计分析
DROP TABLE IF EXISTS dws_user_action;
CREATE EXTERNAL TABLE dws_user_action
(
user_id STRING COMMENT'用户id',
order_count BIGINT COMMENT'下单次数',
order_amount DECIMAL(16,2) COMMENT'下单金额',
payment_count BIGINT COMMENT'支付次数',
payment_amount DECIMAL(16,2) COMMENT'支付金额',
comment_count BIGINT COMMENT'评论次数'
)COMMENT '每日用户行为宽表'
PARTITIONED BY (`dt` STRING)
LOCATION '/warehouse/gmall/dws/dws_user_action'
TBLPROPERTIES ("parquet.compression"="snappy");
WITH
tmp_order AS(
SELECT
user_id,
SUM(oc.total_amount) order_amount,
COUNT(*) order_count
FROM dwd_order_info oc
WHERE date_format(oc.create_time,'yyyy-MM-dd')='2019-02-10'
GROUP BY user_id
),
tmp_payment AS(
SELECT
user_id,
SUM(pi.total_amount) order_amount,
COUNT(*) order_count
FROM dwd_payment_info pi
WHERE date_format(pi.create_time,'yyyy-MM-dd')='2019-02-10'
GROUP BY user_id
),
tmp_comment AS(
SELECT
user_id,
COUNT(*) comment_count_count
FROM dwd_comment_log c
WHERE date_format(c.dt,'yyyy-MM-dd')='2019-02-10'
GROUP BY user_id
)
INSERT OVERWRITE TABLE dws_user_action PARTITION(dt='2019-02-10')
SELECT
user_actions.user_id,
SUM(user_action.order_count),
SUM(user_action.order_amount),
SUM(user_action.payment_count),
SUM(user_action.payment_amount),
SUM(user_action.coment_count)
FROM
(
SELECT
user_id,
order_count,
order_amount,
0 AS payment_count,
0 AS payment_amount,
0 AS comment_count
FROM tmp_order
UNION ALL
SELECT
user_id,
0 AS order_count,
0 AS order_amount,
payment_count,
payment_amount,
0 AS coment_count
FROM tmp_payment
UNION ALL
SELECT
user_id,
0 AS order_count,
0 AS order_amount,
0 AS payment_count,
0 AS payment_amount,
coment_count
FROM tmp_payment
) user_action
GROUP BY user_id;