(从电商项目认识数仓体系九)业务数仓搭建

本文介绍了从电商项目中获取业务数据并导入数仓的过程,包括表的同步策略、使用Sqoop进行数据导入,以及在数仓中构建ODS、DWD和DWS层的详细步骤。特别地,重点讲述了DWD层的维度退化和DWS层中用户行为宽表的创建,为后续的统计分析做准备。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

1 业务数据

业务数据一般存入mysql中,过程较为单一简单,不再赘述

2 业务数据导入数仓

2.1 表的同步策略

序号表名解释同步策略
1order_info订单表新增及变化
2order_detail订单详情增量
3sku_info商品表全量
4user_info用户表全量
5base_category1商品一级分类表全量
6base_category2商品二级分类表全量
7base_category3商品三级分类表全量
8payment_info支付流水表增量

 

 

 

 

 

 

 

 

 

2.2 Sqoop导入脚本

#!/bin/bash

db_date=$2
echo ${db_date}
db_name=gmall

import_data(){
/opt/module/sqoop/bin/sqoop import \
--connect jdbc:mysql://hadoop:3306/${db_name} \
--username root \
--password 000000 \
--target-dir /origin_data/${db_name}/db/$1/${db_date} \
--delete-target-dir \
--num-mappers 1 \
--fields-terminated-by "\t" \
--query "$2"' and ${conditions};'
}

import_sku_info(){
    import_data "sku_info" "
    SELECT
        id,
        spu_id,
        price,
        sku_name,
        sku_desc,
        weight,
        tm_id,
        category3_id,
        create_time
    FROM sku_info
    WHERE 1=1 #全量导入,恒等,防止sql注入
    "
}

import_user_info(){
    import_data "user_info" "
    SELECT
        id,
        name,
        birthday,
        gender,
        email,
        user_level,
        create_time
    FROM user_info
    WHERE 1=1
    "
}

import_base_category1(){
    import_data "base_category1" "
    SELECT
        id,
        name
    FROM base_category1
    WHERE 1=1
    "
}

import_base_category2(){
    import_data "base_category2" "
    SELECT
        id,
        name
    FROM base_category2
    WHERE 1=1
    "
}

import_base_category3(){
    import_data "base_category3" "
    SELECT
        id,
        name
    FROM base_category3
    WHERE 1=1
    "
}

import_order_detail(){
    import_data "order_detail" "
    SELECT
        od.id,
        order_id,
        user_id,
        sku_id,
        sku_name,
        order_price,
        sku_num,
        o.create_time
    FROM order_detail
    WHERE 
    o.id=od.order_id
    AND DATE_FORMAT(create_time,'%Y-%m-%d')='${db_date}'
    "
}

import_payment_info(){
    import_data "payment_info" "
    SELECT
        id,
        out_trade_no,
        order_id,
        user_id,
        alipay_trade_no,
        total_amount,
        subject,
        payment_type,
        payment_time
    FROM payment_info
    WHERE DATE_FORMAT(create_time,'%Y-%m-%d')='${db_date}'
    "
}

import_order_info(){
    import_data "order_info" "
    SELECT
        id,
        total_amount,
        order_status,
        user_id,
        payment_way,
        out_trade_no,
        create_time,
        operate_tome
    FROM order_info
    WHERE 
    DATE_FORMAT(create_time,'%Y-%m-%d')='${db_date}'
    OR DATE_FORMAT(operate_tome,'%Y-%m-%d')='${db_date}'
    "
}

case $l in
    "base_category1")
        import_base_category1
;;
    "base_category2")
        import_base_category2
;;
    "base_category3")
        import_base_category3
;;
    "order_info")
        import_order_info
;;
    "order_detail")
        import_order_detail
;;
    "sku_info")
        import_sku_info
;;
    "user_info")
        import_user_info
;;
    "paryment_info")
        import_paryment_info
;;
    "order_info")
        import_order_info
;;
    "all")
        import_base_category1
        import_base_category2
        import_base_category3
        import_order_info
        import_order_detail
        import_sku_info
        import_user_info
        import_paryment_info
        import_order_info
;;
esac

#执行语句:sqoop imoprt.sh all 2020-07-05

3 在仓库中构建对应表

3.1 创建ODS表

DROP TABLE IF EXISTS ods_order_info;
CREATE TABLE ods_order_info(
    `id` STRING COMMENT '订单编号',
    `total_amount` STRING COMMENT '订单金额',
    `order_status` STRING COMMENT '订单状态',
    `user_id` STRING COMMENT '用户id',
    `payment_way` STRING COMMENT '支付方式',
    `out_trade_no` STRING COMMENT '支付流水号',
    `create_time` STRING COMMENT '创建时间',
    `operate_time` STRING COMMENT '操作时间'
)COMMENT '订单表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_order_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS ods_order_detail;
CREATE TABLE ods_order_info(
    `id` STRING COMMENT '订单编号',
    `order_id` STRING COMMENT '订单号',
    `user_id` STRING COMMENT '用户id',
    `sku_id` STRING COMMENT '商品id',
    `sku_name` STRING COMMENT '商品名称',
    `order_price` STRING COMMENT '下单价格',
    `sku_num` STRING COMMENT '商品数量',
    `create_time` STRING COMMENT '创建时间'
)COMMENT '订单明细表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_order_detail'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS ods_sku_info;
CREATE TABLE ods_order_info(
    `id` STRING COMMENT 'skuid',
    `spu_id` STRING COMMENT 'spuId',
    `price` STRING COMMENT '价格',
    `sku_name` STRING COMMENT '商品名称',
    `sku_desc` STRING COMMENT '商品描述',
    `weight` STRING COMMENT '重量',
    `tm_id` STRING COMMENT '品牌id',
    `category3_id` STRING COMMENT '品类id',
    `create_time` STRING COMMENT '创建时间'
)COMMENT '商品表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_sku_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS ods_user_info;
CREATE TABLE ods_user_info(
    `id` STRING COMMENT '用户id',
    `name` STRING COMMENT '姓名',
    `birthday` STRING COMMENT '生日',
    `gender` STRING COMMENT '性别',
    `email` STRING COMMENT '邮箱',
    `user_level` STRING COMMENT '用户等级',
    `create_time` STRING COMMENT '创建时间'
)COMMENT '用户表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_user_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS ods_base_category1;
CREATE TABLE ods_base_category1(
    `id` STRING COMMENT 'id',
    `total_amount` STRING COMMENT '名称'
)COMMENT '商品一级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category1'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS ods_base_category2;
CREATE TABLE ods_base_category2(
    `id` STRING COMMENT 'id',
    `total_amount` STRING COMMENT '名称',
    category1_id STRING COMMENT '一级品类id'
)COMMENT '商品二级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category2'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS ods_base_category3;
CREATE TABLE ods_base_category3(
    `id` STRING COMMENT 'id',
    `total_amount` STRING COMMENT '名称',
    category1_id STRING COMMENT '二级品类id'
)COMMENT '商品三级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category3'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS ods_payment_info;
CREATE TABLE ods_payment_info(
    `id` STRING COMMENT '编号',
    `out_trade_no` STRING COMMENT '对外业务编号',
    `order_id` STRING COMMENT '订单编号',
    `user_id` STRING COMMENT '用户编号',
    `alipay_trade_no` STRING COMMENT '支付宝交易流水编号',
    `total_amount` STRING COMMENT '支付金额',
    `subject` STRING COMMENT '交易内容',
    `payment_type` STRING COMMENT '支付类型',
    `payment_time` STRING COMMENT '支付时间'
)COMMENT '订单表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_payment_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;

3.2 将数据导入表中

#!/bin/bash

do_date=$1

hql="
LOAD DATA INPATH '/origin_data/gmall/order_info/${do_date} OVERWRITE INTO TABLE gmall.ods_order_info PARTITION(dt='${do_date}');

LOAD DATA INPATH '/origin_data/gmall/order_detail/${do_date} OVERWRITE INTO TABLE gmall.ods_order_detail PARTITION(dt='${do_date}');

LOAD DATA INPATH '/origin_data/gmall/sku_info/${do_date} OVERWRITE INTO TABLE gmall.ods_sku_info PARTITION(dt='${do_date}');

LOAD DATA INPATH '/origin_data/gmall/user_info/${do_date} OVERWRITE INTO TABLE gmall.ods_user_info PARTITION(dt='${do_date}');

LOAD DATA INPATH '/origin_data/gmall/payment_info/${do_date} OVERWRITE INTO TABLE gmall.ods_payment_info PARTITION(dt='${do_date}');

LOAD DATA INPATH '/origin_data/gmall/base_category1/${do_date} OVERWRITE INTO TABLE gmall.ods_base_category1 PARTITION(dt='${do_date}');

LOAD DATA INPATH '/origin_data/gmall/base_category2/${do_date} OVERWRITE INTO TABLE gmall.ods_base_category2 PARTITION(dt='${do_date}');

LOAD DATA INPATH '/origin_data/gmall/base_category3/${do_date} OVERWRITE INTO TABLE gmall.ods_base_category3 PARTITION(dt='${do_date}');
"

hive -e "${hql}"

3.3 DWD层

基于ODS层对数据进行判空过滤,对商品分类表进行维度退化(降维)

PS:如果被退化的维度,还有其他业务表使用,退化后处理起来,会相对麻烦些,再实际业务中,还有时间、商品分类、地域等可以进行维度退化

3.4 创建DWD表

DROP TABLE IF EXISTS dwd_order_info;
CREATE TABLE dwd_order_info(
    `id` STRING COMMENT '',
    `total_amount` DECIMAL(10,2) COMMENT '',
    `order_status` STRING COMMENT '1 2 3 4 5',
    `user_id` STRING COMMENT 'id',
    `payment_way` STRING COMMENT '',
    `out_trade_no` STRING COMMENT '',
    `create_time` STRING COMMENT '',
    `operate_time` STRING COMMENT ''
)COMMENT '订单表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_order_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS dwd_order_detail;
CREATE TABLE dwd_order_detail(
    `id` STRING COMMENT '',
    `order_id` STRING COMMENT '',
    `user_id` STRING COMMENT 'id',
    `sku_id` STRING COMMENT 'id',
    `sku_name` STRING COMMENT '',
    `order_price` STRING COMMENT '',
    `sku_num` STRING COMMENT '',
    `create_time` STRING COMMENT ''
)COMMENT '订单明细表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_order_detail'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS dwd_user_info;
CREATE TABLE dwd_user_info(
    `id` STRING COMMENT 'id',
    `name` STRING COMMENT '',
    `birthday` STRING COMMENT '',
    `gender` STRING COMMENT '',
    `email` STRING COMMENT '',
    `user_level` STRING COMMENT '',
    `create_time` STRING COMMENT ''
)COMMENT '用户表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_user_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS dwd_payment_info;
CREATE TABLE dwd_payment_info(
    `id` STRING COMMENT '',
    `out_trade_no` STRING COMMENT '',
    `order_id` STRING COMMENT '',
    `user_id` STRING COMMENT '',
    `alipay_trade_no` STRING COMMENT '',
    `total_amount` STRING COMMENT '',
    `subject` STRING COMMENT '',
    `payment_type` STRING COMMENT '',
    `payment_time` STRING COMMENT ''
)COMMENT '支付流水表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_payment_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS dwd_sku_info;
CREATE TABLE ods_order_info(
    `id` STRING COMMENT 'skuid',
    `spu_id` STRING COMMENT 'spuId',
    `price` STRING COMMENT '价格',
    `sku_name` STRING COMMENT '商品名称',
    `sku_desc` STRING COMMENT '商品描述',
    `weight` STRING COMMENT '重量',
    `tm_id` STRING COMMENT '品牌id',
    `category3_id` STRING COMMENT '3id',
    `category2_id` STRING COMMENT '2id',
    `category1_id` STRING COMMENT '1id',
    `category3_name` STRING COMMENT '3',
    `category2_name` STRING COMMENT '2',
    `category1_name` STRING COMMENT '1',
    `create_time` STRING COMMENT '创建时间'
)COMMENT '商品表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/ods_sku_info'
TBLPROPERTIES ("parquet.compression"="snappy")
;


DROP TABLE IF EXISTS ods_base_category1;
CREATE TABLE ods_base_category1(
    `id` STRING COMMENT 'id',
    `total_amount` STRING COMMENT '名称'
)COMMENT '商品一级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category1'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS ods_base_category2;
CREATE TABLE ods_base_category2(
    `id` STRING COMMENT 'id',
    `total_amount` STRING COMMENT '名称',
    category1_id STRING COMMENT '一级品类id'
)COMMENT '商品二级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category2'
TBLPROPERTIES ("parquet.compression"="snappy")
;

DROP TABLE IF EXISTS ods_base_category3;
CREATE TABLE ods_base_category3(
    `id` STRING COMMENT 'id',
    `total_amount` STRING COMMENT '名称',
    category1_id STRING COMMENT '二级品类id'
)COMMENT '商品三级分类'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ods/ods_base_category3'
TBLPROPERTIES ("parquet.compression"="snappy")
;

3.5 DWD层导入数据

#!/bin/bash

if [ -n $1 ] ;then
    log_date=$1
else
    log_date=`date -d "-1 day" +%F`
fi

hql="
SET hive.exec.dynamic.partition.mode=nonstrict;

INSERT OVERWRITE TABLE gmall.dwd_order_info PARTITION(dt)
SELECT 
    * 
FROM gmall.ods_order_info
WHERE 
    dt='${log_date}'
    AND id IS NOT NULL;

INSERT OVERWRITE TABLE gmall.dwd_order_detail PARTITION(dt)
SELECT 
    * 
FROM gmall.ods_order_detail
WHERE 
    dt='${log_date}'
    AND id IS NOT NULL;

INSERT OVERWRITE TABLE gmall.dwd_user_info PARTITION(dt)
SELECT 
    * 
FROM gmall.ods_user_info
WHERE 
    dt='${log_date}'
    AND id IS NOT NULL;

INSERT OVERWRITE TABLE gmall.dwd_payment_info PARTITION(dt)
SELECT 
    * 
FROM gmall.ods_payment_info
WHERE 
    dt='${log_date}'
    AND id IS NOT NULL;

INSERT OVERWRITE TABLE gmall.dwd_sku_info PARTITION(dt)
SELECT 
    sku.id,
    sku.spu_id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.tm_id,
    sku.category3_id,
    c2.id category2_id,
    c1.id category1_id,
    c3.id category3_name,
    c2.id category2_name,
    c1.id category1_name,
    sku.create_time,
    sku.dt
FROM 
    gmall.ods_sku_info sku
JOIN gmall.ods_base_category3 c3 ON sku.category3_id=c3.id
JOIN gmall.ods_base_category2 c2 ON sku.category2_id=c3.id
JOIN gmall.ods_base_category1 c1 ON sku.category1_id=c3.id
WHERE 
    sku.dt='${log_date}'
    AND c2.dt='${log_date}'
    AND c3.dt='${log_date}'
    AND c1.dt='${log_date}'
    AND sku.id IS NOT NULL;
"

hive -e ${hql}

3.6 DWS

3.6.1 创建用户行为宽表

把每个用户单日的行为聚合起来组成一张多列宽表,以便以后关联用户维度信息后进行不同角度的统计分析

DROP TABLE IF EXISTS dws_user_action;
CREATE EXTERNAL TABLE dws_user_action
(
    user_id STRING COMMENT'用户id',
    order_count BIGINT COMMENT'下单次数',
    order_amount DECIMAL(16,2) COMMENT'下单金额',
    payment_count BIGINT COMMENT'支付次数',
    payment_amount DECIMAL(16,2) COMMENT'支付金额',
    comment_count BIGINT COMMENT'评论次数'
)COMMENT '每日用户行为宽表'
PARTITIONED BY (`dt` STRING)
LOCATION '/warehouse/gmall/dws/dws_user_action'
TBLPROPERTIES ("parquet.compression"="snappy");

WITH 
tmp_order AS(
    SELECT
        user_id,
        SUM(oc.total_amount) order_amount,
        COUNT(*) order_count
    FROM dwd_order_info oc
    WHERE date_format(oc.create_time,'yyyy-MM-dd')='2019-02-10'
    GROUP BY user_id
),
tmp_payment AS(
    SELECT
        user_id,
        SUM(pi.total_amount) order_amount,
        COUNT(*) order_count
    FROM dwd_payment_info pi
    WHERE date_format(pi.create_time,'yyyy-MM-dd')='2019-02-10'
    GROUP BY user_id
),
tmp_comment AS(
    SELECT
        user_id,
        COUNT(*) comment_count_count
    FROM dwd_comment_log c
    WHERE date_format(c.dt,'yyyy-MM-dd')='2019-02-10'
    GROUP BY user_id
)
INSERT OVERWRITE TABLE dws_user_action PARTITION(dt='2019-02-10')
SELECT
    user_actions.user_id,
    SUM(user_action.order_count),
    SUM(user_action.order_amount),
    SUM(user_action.payment_count),
    SUM(user_action.payment_amount),
    SUM(user_action.coment_count)
FROM
(
    SELECT
        user_id,
        order_count,
        order_amount,
        0 AS payment_count,
        0 AS payment_amount,
        0 AS comment_count
    FROM tmp_order
    UNION ALL
    SELECT
        user_id,
        0 AS order_count,
        0 AS order_amount,
        payment_count,
        payment_amount,
        0 AS coment_count
    FROM tmp_payment
    UNION ALL
    SELECT
        user_id,
        0 AS order_count,
        0 AS order_amount,
        0 AS payment_count,
        0 AS payment_amount,
        coment_count
    FROM tmp_payment
) user_action
GROUP BY user_id;

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值