离线数仓
1、数仓搭建之ODS层
1.1、创建数据库
1)创建gmall数据库
hive (default)> create database gmall;
2)使用gmall数据库
hive (default)> use gmall;
1.2、创建启动日志表ods_start_log
1)创建输入数据是lzo、输出是text、支持json解析的分区表
hive (gmall)>
drop table if exists ods_start_log;
CREATE EXTERNAL TABLE ods_start_log (`line` string)
PARTITIONED BY (dt string)
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_start_log';
2)加载数据
hive (gmall)>
load data inpath '/origin_data/gmall/log/topic_start/2021-08-01' into table gmall.ods_start_log partition(dt='2021-08-01');
3)为lzo压缩文件创建索引
hadoop jar /opt/module/hadoop-2.7.2/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /warehouse/gmall/ods/ods_start_log/dt=2021-08-01
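4)查询验证(示例)
数据加载后,可以用下面的查询简单确认分区和数据是否存在(日期以实际导入为准):
hive (gmall)>
show partitions ods_start_log;
select * from ods_start_log where dt='2021-08-01' limit 2;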
1.3、创建事件日志表ods_event_log
1)创建输入数据是lzo、输出是text、支持json解析的分区表
hive (gmall)>
drop table if exists ods_event_log;
CREATE EXTERNAL TABLE ods_event_log(`line` string)
PARTITIONED BY (dt string)
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_event_log';
2)加载数据
hive (gmall)>
load data inpath '/origin_data/gmall/log/topic_event/2021-08-01' into table gmall.ods_event_log partition(dt='2021-08-01');
3)为lzo压缩文件创建索引
hadoop jar /opt/module/hadoop-2.7.2/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /warehouse/gmall/ods/ods_event_log/dt=2021-08-01
1.4、ODS层加载数据脚本
1)在hadoop103的/opt/module/hive/myscripts目录下创建脚本
[atguigu@hadoop103 myscripts]$ vim ods_log.sh
在脚本中编写如下内容:
#!/bin/bash
#向ods的两个表中导入每天的数据,为数据创建LZO索引
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
APP=gmall
sql="
load data inpath '/origin_data/gmall/log/topic_start/$do_date' into table $APP.ods_start_log partition(dt='$do_date');
load data inpath '/origin_data/gmall/log/topic_event/$do_date' into table $APP.ods_event_log partition(dt='$do_date');"
hive -e "$sql"
hadoop jar /opt/module/hadoop-2.7.2/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /warehouse/gmall/ods/ods_start_log/dt=$do_date
hadoop jar /opt/module/hadoop-2.7.2/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /warehouse/gmall/ods/ods_event_log/dt=$do_date
说明1:
[ -n 变量值 ] 判断变量的值是否为空
- 变量的值非空,返回true
- 变量的值为空,返回false
- 一定要为判断的变量名加上双引号
说明2:
查看date命令的使用,[atguigu@hadoop103 ~]$ date --help
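例如,脚本中取昨天日期用的就是这种写法(输出取决于执行当天,以下仅为示例):
[atguigu@hadoop103 ~]$ date -d yesterday +%F
2021-08-02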
2)增加脚本执行权限
[atguigu@hadoop103 myscripts]$ chmod u+x ods_log.sh
3)脚本使用
[atguigu@hadoop103 myscripts]$ bash ods_log.sh 2021-08-03
2、数仓搭建之DWD层
对ODS层数据进行清洗(去除空值、脏数据、超过极限范围的数据),并将行式存储改为列式存储、更换压缩格式。
2.1、DWD层启动表数据解析
get_json_object 函数
———————————————————————————
get_json_object(json_txt, path) - Extract a json object from path
Extract json object from a json string based on json path specified, and return json string of the extracted json object. It will return null if the input json string is invalid.
A limited version of JSONPath supported:
$ : Root object 代表整个JSON对象
. : Child operator 代表获取JSON对象中子元素(属性)的操作符
[] : Subscript operator for array 获取JSONArray中的某个元素
获取18:
select get_json_object('{"name":"jack","age":18,"parents":[{"name":"oldjack","age":48},{"name":"jackmom","age":49}]}','$.age');
获取49:
select get_json_object('{"name":"jack","age":18,"parents":[{"name":"oldjack","age":48},{"name":"jackmom","age":49}]}','$.parents[1].age');
1)建表语句
hive (gmall)>
drop table if exists dwd_start_log;
CREATE EXTERNAL TABLE dwd_start_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`entry` string,
`open_ad_type` string,
`action` string,
`loading_time` string,
`detail` string,
`extend1` string
)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_start_log/'
TBLPROPERTIES('parquet.compression'='lzo');
2)向启动表导入数据
hive (gmall)>
insert overwrite table dwd_start_log
PARTITION (dt='2021-08-03')
select
get_json_object(line,'$.mid') mid_id,
get_json_object(line,'$.uid') user_id,
get_json_object(line,'$.vc') version_code,
get_json_object(line,'$.vn') version_name,
get_json_object(line,'$.l') lang,
get_json_object(line,'$.sr') source,
get_json_object(line,'$.os') os,
get_json_object(line,'$.ar') area,
get_json_object(line,'$.md') model,
get_json_object(line,'$.ba') brand,
get_json_object(line,'$.sv') sdk_version,
get_json_object(line,'$.g') gmail,
get_json_object(line,'$.hw') height_width,
get_json_object(line,'$.t') app_time,
get_json_object(line,'$.nw') network,
get_json_object(line,'$.ln') lng,
get_json_object(line,'$.la') lat,
get_json_object(line,'$.entry') entry,
get_json_object(line,'$.open_ad_type') open_ad_type,
get_json_object(line,'$.action') action,
get_json_object(line,'$.loading_time') loading_time,
get_json_object(line,'$.detail') detail,
get_json_object(line,'$.extend1') extend1
from ods_start_log
where dt='2021-08-03';
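3)简单校验(示例)
导入完成后,可以统计当天分区的数据量确认导入是否成功:
hive (gmall)>
select count(*) from dwd_start_log where dt='2021-08-03';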
2.2、DWD层启动表加载数据脚本
1)在hadoop103的/opt/module/myscripts目录下创建脚本
[atguigu@hadoop103 myscripts]$ vim dwd_start_log.sh
在脚本中编写如下内容
#!/bin/bash
#解析启动日志,将当天数据导入dwd_start_log表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
insert overwrite table gmall.dwd_start_log
PARTITION (dt='$do_date')
select
get_json_object(line,'$.mid') mid_id,
get_json_object(line,'$.uid') user_id,
get_json_object(line,'$.vc') version_code,
get_json_object(line,'$.vn') version_name,
get_json_object(line,'$.l') lang,
get_json_object(line,'$.sr') source,
get_json_object(line,'$.os') os,
get_json_object(line,'$.ar') area,
get_json_object(line,'$.md') model,
get_json_object(line,'$.ba') brand,
get_json_object(line,'$.sv') sdk_version,
get_json_object(line,'$.g') gmail,
get_json_object(line,'$.hw') height_width,
get_json_object(line,'$.t') app_time,
get_json_object(line,'$.nw') network,
get_json_object(line,'$.ln') lng,
get_json_object(line,'$.la') lat,
get_json_object(line,'$.entry') entry,
get_json_object(line,'$.open_ad_type') open_ad_type,
get_json_object(line,'$.action') action,
get_json_object(line,'$.loading_time') loading_time,
get_json_object(line,'$.detail') detail,
get_json_object(line,'$.extend1') extend1
from gmall.ods_start_log
where dt='$do_date';
"
hive -e "$sql"
2)增加脚本执行权限
[atguigu@hadoop103 myscripts]$ chmod u+x dwd_start_log.sh
3)脚本使用
[atguigu@hadoop103 myscripts]$ dwd_start_log.sh 2021-08-03
2.3、DWD层事件表数据解析
1)创建事件日志基础明细表
hive (gmall)>
drop table if exists dwd_base_event_log;
CREATE EXTERNAL TABLE dwd_base_event_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`event_name` string,
`event_json` string,
`server_time` string)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_base_event_log/'
TBLPROPERTIES('parquet.compression'='lzo');
2)编写UDF和UDTF函数(省略)
生成jar包hive-functions-0.0.1-SNAPSHOT.jar
在hive目录下
mkdir auxlib(新建目录)
将hive-functions-0.0.1-SNAPSHOT.jar放到该目录下
创建永久函数与开发好的java class关联:
hive (gmall)>
create function base_analizer as 'com.atguigu.udf.MyUDF';
hive (gmall)>
create function flat_analizer as 'com.atguigu.udtf.MyUDTF';
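3)测试函数(示例)
函数注册后,可以先用一条查询验证两个函数能否正常调用(字段取值取决于UDF/UDTF的具体实现,日期以实际分区为准):
hive (gmall)>
select
base_analizer(line,'mid') mid_id,
en event_name,
ej event_json
from ods_event_log
lateral view flat_analizer(base_analizer(line,'et')) tmp as en,ej
where dt='2021-08-03'
limit 2;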
2.4、DWD层事件表加载数据脚本
1)在hadoop103的/opt/module/myscripts目录下创建脚本
[atguigu@hadoop103 myscripts]$ vim dwd_base_log.sh
在脚本中编写如下内容
#!/bin/bash
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert overwrite TABLE gmall.dwd_base_event_log PARTITION(dt='$do_date')
SELECT
base_analizer(line,'mid') mid_id,
base_analizer(line,'uid') user_id,
base_analizer(line,'vc') version_code,
base_analizer(line,'vn') version_name,
base_analizer(line,'l') lang,
base_analizer(line,'sr') source,
base_analizer(line,'os') os,
base_analizer(line,'ar') area,
base_analizer(line,'md') model,
base_analizer(line,'ba') brand,
base_analizer(line,'sv') sdk_version,
base_analizer(line,'g') gmail,
base_analizer(line,'hw') height_width,
base_analizer(line,'t') app_time,
base_analizer(line,'nw') network,
base_analizer(line,'ln') lng,
base_analizer(line,'la') lat,
en event_name,
ej event_json,
base_analizer(line,'ts') server_time
FROM gmall.ods_event_log
LATERAL VIEW flat_analizer(base_analizer(line,'et')) tmp as en,ej
WHERE dt='$do_date';
"
hive -e "$sql"
2)增加脚本执行权限
[atguigu@hadoop103 myscripts]$ chmod u+x dwd_base_log.sh
3)脚本使用
[atguigu@hadoop103 myscripts]$ dwd_base_log.sh 2021-08-03
2.5、DWD层事件表获取
~1.商品点击表
1)建表语句
hive (gmall)>
drop table if exists dwd_display_log;
CREATE EXTERNAL TABLE dwd_display_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`action` string,
`goodsid` string,
`place` string,
`extend1` string,
`category` string,
`server_time` string
)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_display_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~2.商品详情页表
1)建表语句
hive (gmall)>
drop table if exists dwd_newsdetail_log;
CREATE EXTERNAL TABLE dwd_newsdetail_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`entry` string,
`action` string,
`goodsid` string,
`showtype` string,
`news_staytime` string,
`loading_time` string,
`type1` string,
`category` string,
`server_time` string)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_newsdetail_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~3.商品列表页表
1)建表语句
hive (gmall)>
drop table if exists dwd_loading_log;
CREATE EXTERNAL TABLE dwd_loading_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`action` string,
`loading_time` string,
`loading_way` string,
`extend1` string,
`extend2` string,
`type` string,
`type1` string,
`server_time` string)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_loading_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~4.广告表
1)建表语句
hive (gmall)>
drop table if exists dwd_ad_log;
CREATE EXTERNAL TABLE dwd_ad_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`entry` string,
`action` string,
`content` string,
`detail` string,
`ad_source` string,
`behavior` string,
`newstype` string,
`show_style` string,
`server_time` string)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_ad_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~5.消息通知表
1)建表语句
hive (gmall)>
drop table if exists dwd_notification_log;
CREATE EXTERNAL TABLE dwd_notification_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`action` string,
`noti_type` string,
`ap_time` string,
`content` string,
`server_time` string
)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_notification_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~6.用户前台活跃表
1)建表语句
hive (gmall)>
drop table if exists dwd_active_foreground_log;
CREATE EXTERNAL TABLE dwd_active_foreground_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`push_id` string,
`access` string,
`server_time` string)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_foreground_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~7.用户后台活跃表
1)建表语句
hive (gmall)>
drop table if exists dwd_active_background_log;
CREATE EXTERNAL TABLE dwd_active_background_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`active_source` string,
`server_time` string
)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_background_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~8.评论表
1)建表语句
hive (gmall)>
drop table if exists dwd_comment_log;
CREATE EXTERNAL TABLE dwd_comment_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`comment_id` int,
`userid` int,
`p_comment_id` int,
`content` string,
`addtime` string,
`other_id` int,
`praise_count` int,
`reply_count` int,
`server_time` string
)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_comment_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~9.收藏表
1)建表语句
hive (gmall)>
drop table if exists dwd_favorites_log;
CREATE EXTERNAL TABLE dwd_favorites_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`id` int,
`course_id` int,
`userid` int,
`add_time` string,
`server_time` string
)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_favorites_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~10.点赞表
1)建表语句
hive (gmall)>
drop table if exists dwd_praise_log;
CREATE EXTERNAL TABLE dwd_praise_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`id` string,
`userid` string,
`target_id` string,
`type` string,
`add_time` string,
`server_time` string
)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_praise_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~11.错误日志表
1)建表语句
hive (gmall)>
drop table if exists dwd_error_log;
CREATE EXTERNAL TABLE dwd_error_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`errorBrief` string,
`errorDetail` string,
`server_time` string)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_error_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~12.DWD层事件表加载数据脚本
1)在hadoop103的/opt/module/myscripts目录下创建脚本
[atguigu@hadoop103 myscripts]$ vim dwd_event_log.sh
在脚本中编写如下内容
#!/bin/bash
#从dwd_base_event_log表解析各类事件,将当天数据导入dwd层各事件明细表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert overwrite table dwd_display_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.goodsid') goodsid,
get_json_object(event_json,'$.kv.place') place,
get_json_object(event_json,'$.kv.extend1') extend1,
get_json_object(event_json,'$.kv.category') category,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='display';
insert overwrite table dwd_newsdetail_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.entry') entry,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.goodsid') goodsid,
get_json_object(event_json,'$.kv.showtype') showtype,
get_json_object(event_json,'$.kv.news_staytime') news_staytime,
get_json_object(event_json,'$.kv.loading_time') loading_time,
get_json_object(event_json,'$.kv.type1') type1,
get_json_object(event_json,'$.kv.category') category,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='newsdetail';
insert overwrite table dwd_loading_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.loading_time') loading_time,
get_json_object(event_json,'$.kv.loading_way') loading_way,
get_json_object(event_json,'$.kv.extend1') extend1,
get_json_object(event_json,'$.kv.extend2') extend2,
get_json_object(event_json,'$.kv.type') type,
get_json_object(event_json,'$.kv.type1') type1,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='loading';
insert overwrite table dwd_ad_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.entry') entry,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.content') content,
get_json_object(event_json,'$.kv.detail') detail,
get_json_object(event_json,'$.kv.source') ad_source,
get_json_object(event_json,'$.kv.behavior') behavior,
get_json_object(event_json,'$.kv.newstype') newstype,
get_json_object(event_json,'$.kv.show_style') show_style,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='ad';
insert overwrite table dwd_notification_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.noti_type') noti_type,
get_json_object(event_json,'$.kv.ap_time') ap_time,
get_json_object(event_json,'$.kv.content') content,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='notification';
insert overwrite table dwd_active_foreground_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.push_id') push_id,
get_json_object(event_json,'$.kv.access') access,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='active_foreground';
insert overwrite table dwd_active_background_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.active_source') active_source,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='active_background';
insert overwrite table dwd_comment_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.comment_id') comment_id,
get_json_object(event_json,'$.kv.userid') userid,
get_json_object(event_json,'$.kv.p_comment_id') p_comment_id,
get_json_object(event_json,'$.kv.content') content,
get_json_object(event_json,'$.kv.addtime') addtime,
get_json_object(event_json,'$.kv.other_id') other_id,
get_json_object(event_json,'$.kv.praise_count') praise_count,
get_json_object(event_json,'$.kv.reply_count') reply_count,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='comment';
insert overwrite table dwd_favorites_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.id') id,
get_json_object(event_json,'$.kv.course_id') course_id,
get_json_object(event_json,'$.kv.userid') userid,
get_json_object(event_json,'$.kv.add_time') add_time,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='favorites';
insert overwrite table dwd_praise_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.id') id,
get_json_object(event_json,'$.kv.userid') userid,
get_json_object(event_json,'$.kv.target_id') target_id,
get_json_object(event_json,'$.kv.type') type,
get_json_object(event_json,'$.kv.add_time') add_time,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='praise';
insert overwrite table dwd_error_log
PARTITION (dt='$do_date')
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.errorBrief') errorBrief,
get_json_object(event_json,'$.kv.errorDetail') errorDetail,
server_time
from dwd_base_event_log
where dt='$do_date' and event_name='error';
"
hive -e "$sql"
2)增加脚本执行权限
[atguigu@hadoop103 myscripts]$ chmod u+x dwd_event_log.sh
3)脚本使用
[atguigu@hadoop103 myscripts]$ dwd_event_log.sh 2021-08-03
3、业务知识准备
3.1、业务术语
- 用户
用户以设备为判断标准,在移动统计中,每个独立设备认为是一个独立用户。Android系统根据IMEI号,IOS系统根据OpenUDID来标识一个独立用户,每部手机一个用户。
- 新增用户
首次联网使用应用的用户。如果一个用户首次打开某APP,那这个用户定义为新增用户;卸载再安装的设备,不会被算作一次新增。新增用户包括日新增用户、周新增用户、月新增用户。
- 活跃用户
打开应用的用户即为活跃用户,不考虑用户的使用情况。每天一台设备打开多次会被计为一个活跃用户。
- 周(月)活跃用户
某个自然周(月)内启动过应用的用户,该周(月)内的多次启动只记一个活跃用户。
- 月活跃率
月活跃用户与截止到该月累计的用户总和之间的比例。
- 沉默用户
用户仅在安装当天(次日)启动一次,后续时间无再启动行为。该指标可以反映新增用户质量和用户与APP的匹配程度。
- 版本分布
不同版本的周内各天新增用户数、活跃用户数和启动次数。利于判断APP各个版本之间的优劣和用户行为习惯。
- 本周回流用户
上周未启动过应用,本周启动了应用的用户。
- 连续n周活跃用户
连续n周,每周至少启动一次。
- 忠诚用户
连续活跃5周以上的用户。
- 连续活跃用户
连续2周及以上活跃的用户。
- 近期流失用户
连续n(2<=n<=4)周没有启动应用的用户(第n+1周没有启动过)。
- 留存用户
某段时间内的新增用户,经过一段时间后,仍然使用应用的被认作是留存用户;这部分用户占当时新增用户的比例即是留存率。
例如,5月份新增用户200,这200人在6月份启动过应用的有100人,7月份启动过应用的有80人,8月份启动过应用的有50人;则5月份新增用户一个月后的留存率是50%,二个月后的留存率是40%,三个月后的留存率是25%。
- 用户新鲜度
每天启动应用的新老用户比例,即新增用户数占活跃用户数的比例。
- 单次使用时长
每次启动使用的时间长度。
- 日使用时长
累计一天内的使用时间长度。
- 启动次数计算标准
IOS平台应用退到后台就算一次独立的启动;Android平台我们规定,两次启动之间的间隔小于30秒,被计算一次启动。用户在使用过程中,若因收发短信或接电话等退出应用30秒又再次返回应用中,那这两次行为应该是延续而非独立的,所以可以被算作一次使用行为,即一次启动。业内大多使用30秒这个标准,但用户还是可以自定义此时间间隔。
3.2、系统函数
~1.collect_set函数
把一列多行转为一列一行
1)创建原数据表
hive (gmall)>
drop table if exists stud;
create table stud (name string, area string, course string, score int);
2)向原数据表中插入数据
hive (gmall)>
insert into table stud values('zhang3','bj','math',88);
insert into table stud values('li4','bj','math',99);
insert into table stud values('wang5','sh','chinese',92);
insert into table stud values('zhao6','sh','chinese',54);
insert into table stud values('tian7','bj','chinese',91);
3)查询表中数据
hive (gmall)> select * from stud;
stud.name  stud.area  stud.course  stud.score
zhang3     bj         math         88
li4        bj         math         99
wang5      sh         chinese      92
zhao6      sh         chinese      54
tian7      bj         chinese      91
4)把同一分组的不同行的数据聚合成一个集合
hive (gmall)> select course, collect_set(area), avg(score) from stud group by course;
chinese  ["sh","bj"]  79.0
math     ["bj"]       93.5
5)用下标可以取某一个
hive (gmall)> select course, collect_set(area)[0], avg(score) from stud group by course;
chinese  sh  79.0
math     bj  93.5
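6)配合concat_ws把集合拼成字符串
后面DWS层脚本中大量使用concat_ws('|',collect_set(...))的写法,把聚合出的集合拼成一个字符串,效果如下(示例):
hive (gmall)> select course, concat_ws('|', collect_set(area)) from stud group by course;
chinese  sh|bj
math     bj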
~2.日期处理函数
1)date_format函数(根据格式整理日期)
hive (gmall)> select date_format('2019-02-12','yyyy-MM');
2019-02
2)date_add函数(加减日期)
hive (gmall)> select date_add('2019-02-12',-1);
2019-02-11
hive (gmall)> select date_add('2019-02-12',1);
2019-02-13
3)next_day函数
(1)取当前天的下一个周一
hive (gmall)> select next_day('2019-02-12','MO');
2019-02-18
说明:星期一到星期日的英文(Monday、Tuesday、Wednesday、Thursday、Friday、Saturday、Sunday)
(2)取当前周的周一
hive (gmall)> select date_add(next_day('2019-02-12','MO'),-7);
2019-02-11
4)last_day函数(求当月最后一天日期)
hive (gmall)> select last_day('2019-02-12');
2019-02-28
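5)组合使用:求某日期所在周的周一和周日
后面周活明细表的wk_dt分区值就是用next_day和date_sub拼出来的(示例):
hive (gmall)> select concat(date_sub(next_day('2019-02-12','MO'),7),'-',date_sub(next_day('2019-02-12','MO'),1));
2019-02-11-2019-02-17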
4、需求一:用户活跃主题
4.1、DWS层
~1.每日活跃设备明细
1)建表语句
hive (gmall)>
drop table if exists dws_uv_detail_day;
create external table dws_uv_detail_day
(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度'
)
partitioned by(dt string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_day';
~2.每周活跃设备明细
1)建表语句
hive (gmall)>
drop table if exists dws_uv_detail_wk;
create external table dws_uv_detail_wk(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
`monday_date` string COMMENT '周一日期',
`sunday_date` string COMMENT '周日日期'
) COMMENT '活跃用户按周明细'
PARTITIONED BY (`wk_dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_wk/';
~3.每月活跃设备明细
1)建表语句
hive (gmall)>
drop table if exists dws_uv_detail_mn;
create external table dws_uv_detail_mn(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度'
) COMMENT '活跃用户按月明细'
PARTITIONED BY (`mn` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_mn/';
~4.DWS层加载数据脚本
1)在hadoop103的/opt/module/myscripts目录下创建脚本
[atguigu@hadoop103 myscripts]$ vim dws_uv_log.sh
在脚本中编写如下内容
#!/bin/bash
#向dws层日活、周活、月活明细表中导入数据
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
use gmall;
insert overwrite table gmall.dws_uv_detail_day PARTITION(dt='$do_date')
SELECT
mid_id,
concat_ws('|',collect_set(user_id)) user_id,
concat_ws('|',collect_set(version_code)) version_code,
concat_ws('|',collect_set(version_name)) version_name,
concat_ws('|',collect_set(lang)) lang,
concat_ws('|',collect_set(source)) source,
concat_ws('|',collect_set(os)) os,
concat_ws('|',collect_set(area)) area,
concat_ws('|',collect_set(model)) model,
concat_ws('|',collect_set(brand)) brand,
concat_ws('|',collect_set(sdk_version)) sdk_version,
concat_ws('|',collect_set(gmail)) gmail,
concat_ws('|',collect_set(height_width)) height_width,
concat_ws('|',collect_set(app_time)) app_time ,
concat_ws('|',collect_set(network)) network,
concat_ws('|',collect_set(lng)) lng,
concat_ws('|',collect_set(lat)) lat
FROM gmall.dwd_start_log
WHERE dt='$do_date'
group by mid_id;
insert overwrite table gmall.dws_uv_detail_wk PARTITION(wk_dt)
SELECT
mid_id,
concat_ws('|',collect_set(user_id)) user_id,
concat_ws('|',collect_set(version_code)) version_code,
concat_ws('|',collect_set(version_name)) version_name,
concat_ws('|',collect_set(lang)) lang,
concat_ws('|',collect_set(source)) source,
concat_ws('|',collect_set(os)) os,
concat_ws('|',collect_set(area)) area,
concat_ws('|',collect_set(model)) model,
concat_ws('|',collect_set(brand)) brand,
concat_ws('|',collect_set(sdk_version)) sdk_version,
concat_ws('|',collect_set(gmail)) gmail,
concat_ws('|',collect_set(height_width)) height_width,
concat_ws('|',collect_set(app_time)) app_time ,
concat_ws('|',collect_set(network)) network,
concat_ws('|',collect_set(lng)) lng,
concat_ws('|',collect_set(lat)) lat,
date_sub(next_day('$do_date','mo'),7) monday_date ,
date_sub(next_day('$do_date','mo'),1) sunday_date,
concat(date_sub(next_day('$do_date','mo'),7),'-',date_sub(next_day('$do_date','mo'),1))
FROM gmall.dws_uv_detail_day
WHERE dt BETWEEN date_sub(next_day('$do_date','mo'),7)
and date_sub(next_day('$do_date','mo'),1)
group by mid_id;
insert overwrite table gmall.dws_uv_detail_mn PARTITION(mn)
SELECT
mid_id,
concat_ws('|',collect_set(user_id)) user_id,
concat_ws('|',collect_set(version_code)) version_code,
concat_ws('|',collect_set(version_name)) version_name,
concat_ws('|',collect_set(lang)) lang,
concat_ws('|',collect_set(source)) source,
concat_ws('|',collect_set(os)) os,
concat_ws('|',collect_set(area)) area,
concat_ws('|',collect_set(model)) model,
concat_ws('|',collect_set(brand)) brand,
concat_ws('|',collect_set(sdk_version)) sdk_version,
concat_ws('|',collect_set(gmail)) gmail,
concat_ws('|',collect_set(height_width)) height_width,
concat_ws('|',collect_set(app_time)) app_time ,
concat_ws('|',collect_set(network)) network,
concat_ws('|',collect_set(lng)) lng,
concat_ws('|',collect_set(lat)) lat,
date_format('$do_date','yyyy-MM')
FROM gmall.dws_uv_detail_day
where date_format('$do_date','yyyy-MM')=date_format(dt,'yyyy-MM')
group by mid_id;
"
hive -e "$sql"
2)增加脚本执行权限
[atguigu@hadoop103 myscripts]$ chmod u+x dws_uv_log.sh
3)脚本使用
[atguigu@hadoop103 myscripts]$ dws_uv_log.sh 2021-08-03
4.2、ADS层
~1.活跃设备数
1)建表语句
hive (gmall)>
drop table if exists ads_uv_count;
create external table ads_uv_count(
`dt` string COMMENT '统计日期',
`day_count` bigint COMMENT '当日用户数量',
`wk_count` bigint COMMENT '当周用户数量',
`mn_count` bigint COMMENT '当月用户数量',
`is_weekend` string COMMENT 'Y,N是否是周末,用于得到本周最终结果',
`is_monthend` string COMMENT 'Y,N是否是月末,用于得到本月最终结果'
) COMMENT '活跃设备数'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_uv_count/';
~2.ADS层加载数据脚本
1)在hadoop103的/opt/module/myscripts目录下创建脚本
[atguigu@hadoop103 myscripts]$ vim ads_uv_log.sh
在脚本中编写如下内容
#!/bin/bash
#统计日活、周活、月活设备数,导入ads_uv_count表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
INSERT into table gmall.ads_uv_count
SELECT
'$do_date' dt,
day_count,
wk_count,
mn_count,
if(date_sub(next_day('$do_date','mo'),1)='$do_date','Y','N') is_weekend,
if(last_day('$do_date')='$do_date','Y','N') is_monthend
FROM
(select COUNT(mid_id) day_count from gmall.dws_uv_detail_day where dt='$do_date') t1
JOIN
(select COUNT(mid_id) wk_count from gmall.dws_uv_detail_wk where wk_dt=concat(date_sub(next_day('$do_date','mo'),7),'-',date_sub(next_day('$do_date','mo'),1))) t2
join
(select COUNT(mid_id) mn_count from gmall.dws_uv_detail_mn
where mn=date_format('$do_date','yyyy-MM')) t3
"
hive -e "$sql"
2)增加脚本执行权限
[atguigu@hadoop103 myscripts]$ chmod u+x ads_uv_log.sh
3)脚本使用
[atguigu@hadoop103 myscripts]$ ads_uv_log.sh 2021-08-03
5、需求二:用户新增主题
5.1、DWS层(每日新增设备明细表)
1)建表语句
hive (gmall)>
drop table if exists dws_new_mid_day;
create external table dws_new_mid_day
(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
`create_date` string comment '创建时间'
) COMMENT '每日新增设备信息'
stored as parquet
location '/warehouse/gmall/dws/dws_new_mid_day/';
2)导入数据脚本
[atguigu@hadoop103 myscripts]$ vim dws_new_log.sh
#!/bin/bash
#统计每日新增设备明细,导入dws_new_mid_day表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert into table gmall.dws_new_mid_day
SELECT
t1.*
FROM
(select * from dws_uv_detail_day where dt='$do_date') t1
LEFT JOIN gmall.dws_new_mid_day nm
on t1.mid_id=nm.mid_id
WHERE nm.mid_id is null;
"
hive -e "$sql"
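说明:脚本执行后,可以统计当日新增设备数做简单校验(示例,日期以实际为准):
hive (gmall)> select count(*) from dws_new_mid_day where create_date='2021-08-05';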
5.2、ADS层(每日新增设备表)
1)建表语句
hive (gmall)>
drop table if exists ads_new_mid_count;
create external table ads_new_mid_count
(
`create_date` string comment '创建时间' ,
`new_mid_count` BIGINT comment '新增设备数量'
) COMMENT '每日新增设备信息数量'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_new_mid_count/';
2)导入数据脚本
[atguigu@hadoop103 myscripts]$ vim ads_new_log.sh
#!/bin/bash
#统计每日新增设备数,导入ads_new_mid_count表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert into table ads_new_mid_count
SELECT
'$do_date',
count(*)
FROM
dws_new_mid_day
where create_date='$do_date'
"
hive -e "$sql"
3)使用脚本
[atguigu@hadoop103 myscripts]$ ads_new_log.sh 2021-08-05
6、需求三:用户留存主题
6.1、DWS层
~1.DWS层(每日留存用户明细表)
1)建表语句
hive (gmall)>
drop table if exists dws_user_retention_day;
create external table dws_user_retention_day
(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
`create_date` string comment '设备新增时间',
`retention_day` int comment '截止当前日期留存天数'
) COMMENT '每日用户留存情况'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_retention_day/';
2)导入数据(每天计算前1天的新用户访问留存明细)
hive (gmall)>
insert overwrite table dws_user_retention_day PARTITION(dt='2021-08-05')
SELECT
t1.mid_id,
t1.user_id,
t1.version_code,
t1.version_name,
t1.lang,
t1.source,
t1.os,
t1.area,
t1.model,
t1.brand,
t1.sdk_version,
t1.gmail,
t1.height_width,
t1.app_time,
t1.network,
t1.lng,
t1.lat,
t2.create_date,
1 retention_day
FROM
(SELECT * from gmall.dws_uv_detail_day where dt='2021-08-05') t1
JOIN
(select mid_id,create_date from gmall.dws_new_mid_day where create_date=date_sub('2021-08-05',1)) t2
on t1.mid_id=t2.mid_id
~2.DWS层(1,2,3,n天留存用户明细表)
1)导入数据(每天计算前1,2,3,n天的新用户访问留存明细)
hive (gmall)>
insert overwrite table dws_user_retention_day PARTITION(dt='2021-08-05')
SELECT
t1.mid_id,
t1.user_id,
t1.version_code,
t1.version_name,
t1.lang,
t1.source,
t1.os,
t1.area,
t1.model,
t1.brand,
t1.sdk_version,
t1.gmail,
t1.height_width,
t1.app_time,
t1.network,
t1.lng,
t1.lat,
t2.create_date,
1 retention_day
FROM
(SELECT * from gmall.dws_uv_detail_day where dt='2021-08-05') t1
JOIN
(select mid_id,create_date from gmall.dws_new_mid_day where create_date=date_sub('2021-08-05',1)) t2
on t1.mid_id=t2.mid_id
UNION ALL
SELECT
t1.mid_id,
t1.user_id,
t1.version_code,
t1.version_name,
t1.lang,
t1.source,
t1.os,
t1.area,
t1.model,
t1.brand,
t1.sdk_version,
t1.gmail,
t1.height_width,
t1.app_time,
t1.network,
t1.lng,
t1.lat,
t2.create_date,
2 retention_day
FROM
(SELECT * from gmall.dws_uv_detail_day where dt='2021-08-05') t1
JOIN
(select mid_id,create_date from gmall.dws_new_mid_day where create_date=date_sub('2021-08-05',2)) t2
on t1.mid_id=t2.mid_id
UNION ALL
SELECT
t1.mid_id,
t1.user_id,
t1.version_code,
t1.version_name,
t1.lang,
t1.source,
t1.os,
t1.area,
t1.model,
t1.brand,
t1.sdk_version,
t1.gmail,
t1.height_width,
t1.app_time,
t1.network,
t1.lng,
t1.lat,
t2.create_date,
3 retention_day
FROM
(SELECT * from gmall.dws_uv_detail_day where dt='2021-08-05') t1
JOIN
(select mid_id,create_date from gmall.dws_new_mid_day where create_date=date_sub('2021-08-05',3)) t2
on t1.mid_id=t2.mid_id
注意
(1)union会将联合的结果集去重,效率较union all差
(2)union all不会对结果集去重,所以效率高
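例如,用前面3.2节的stud表可以直观对比两者(结果行数为示例推算):
hive (gmall)> select area from stud union all select area from stud;
返回10行(不去重)
hive (gmall)> select area from stud union select area from stud;
只返回bj、sh两行(去重)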
2)导入数据脚本
[atguigu@hadoop103 myscripts]$ vim dws_retation_day.sh
#!/bin/bash
#统计每日留存用户明细,导入dws_user_retention_day表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
do_date=$1
else
#默认日期为昨天
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert overwrite table dws_user_retention_day PARTITION(dt='$do_date')
SELECT
t1.mid_id,
t1.user_id,
t1.version_code,
t1.version_name,
t1.lang,
t1.source,
t1.os,
t1.area,
t1.model,
t1.brand,
t1.sdk_version,
t1.gmail,
t1.height_width,
t1.app_time,
t1.network,
t1.lng,
t1.lat,
t2.create_date,
1 retention_day
FROM
(SELECT * from gmall.dws_uv_detail_day where dt='$do_date') t1
JOIN
(select mid_id,create_date from gmall.dws_new_mid_day where create_date=date_sub('$do_date',1)) t2
on t1.mid_id=t2.mid_id
UNION ALL
SELECT
t1.mid_id,
t1.user_id,
t1.version_code,
t1.version_name,
t1.lang,
t1.source,
t1.os,
t1.area,
t1.model,
t1.brand,
t1.sdk_version,
t1.gmail,
t1.height_width,
t1.app_time,
t1.network,
t1.lng,
t1.lat,
t2.create_date,
2 retention_day
FROM
(SELECT * from gmall.dws_uv_detail_day where dt='$do_date') t1
JOIN
(select mid_id,create_date from gmall.dws_new_mid_day where create_date=date_sub('$do_date',2)) t2
on t1.mid_id=t2.mid_id
UNION ALL
SELECT
t1.mid_id,
t1.user_id,
t1.version_code,
t1.version_name,
t1.lang,
t1.source,
t1.os,
t1.area,
t1.model,
t1.brand,
t1.sdk_version,
t1.gmail,
t1.height_width,
t1.app_time,
t1.network,
t1.lng,
t1.lat,
t2.create_date,
3 retention_day
FROM
(SELECT * from gmall.dws_uv_detail_day where dt='$do_date') t1
JOIN
(select mid_id,create_date from gmall.dws_new_mid_day where create_date=date_sub('$do_date',3)) t2
on t1.mid_id=t2.mid_id
"
hive -e "$sql"
3)使用脚本
[atguigu@hadoop103 myscripts]$ dws_retation_day.sh 2021-08-05
6.2、ADS层
~1.留存用户数
1)建表语句
hive (gmall)>
drop table if exists ads_user_retention_day_count;
create external table ads_user_retention_day_count
(
`create_date` string comment '设备新增日期',
`retention_day` int comment '截止当前日期留存天数',
`retention_count` bigint comment '留存数量'
) COMMENT '每日用户留存情况'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_count/';
2)导入数据
hive (gmall)>
insert into table gmall.ads_user_retention_day_count
SELECT
'2021-08-03',
retention_day,
count(*)
FROM gmall.dws_user_retention_day
where create_date='2021-08-03'
group by retention_day ;
~2.留存用户比率
1)建表语句
hive (gmall)>
drop table if exists ads_user_retention_day_rate;
create external table ads_user_retention_day_rate
(
`stat_date` string comment '统计日期',
`create_date` string comment '设备新增日期',
`retention_day` int comment '截止当前日期留存天数',
`retention_count` bigint comment '留存数量',
`new_mid_count` bigint comment '当日设备新增数量',
`retention_ratio` decimal(10,2) comment '留存率'
) COMMENT '每日用户留存情况'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
2)导入数据
hive (gmall)>
insert into table ads_user_retention_day_rate
SELECT
'2021-08-05',
ur.create_date,
ur.retention_day,
ur.retention_count,
nm.new_mid_count,
cast (ur.retention_count / nm.new_mid_count as decimal(10,2))
FROM
ads_user_retention_day_count ur
join
ads_new_mid_count nm
on ur.create_date =nm.create_date
where date_add(ur.create_date,ur.retention_day)='2021-08-05';
7、需求四:沉默用户数
只在安装当天启动过,且启动时间是在一周前
7.1、DWS层
使用日活明细表dws_uv_detail_day作为DWS层数据
7.2、ADS层
1)建表语句
hive (gmall)>
drop table if exists ads_silent_count;
create external table ads_silent_count(
`dt` string COMMENT '统计日期',
`silent_count` bigint COMMENT '沉默设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_silent_count';
2)导入数据
hive (gmall)>
-- having中只能写聚集函数和group by后的字段
insert into table gmall.ads_silent_count
select
'2021-08-09',
count(*)
from
(SELECT
mid_id
FROM gmall.dws_uv_detail_day
where dt<='2021-08-09'
group by mid_id
having count(mid_id)=1 and min(dt)<date_sub('2021-08-09',7)) tmp
7.3、编写脚本
[atguigu@hadoop103 myscripts]$ vim ads_silent_log.sh
#!/bin/bash
#统计沉默用户数,导入ads_silent_count表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert into table gmall.ads_silent_count
select
'$do_date',
count(*)
from
(SELECT
mid_id
FROM gmall.dws_uv_detail_day
where dt<='$do_date'
group by mid_id
having count(mid_id)=1 and min(dt)<date_sub('$do_date',7)) tmp
"
hive -e "$sql"
8、需求五:本周回流用户数
本周回流=本周活跃-本周新增-上周活跃
8.1、DWS层
使用日活明细表dws_uv_detail_day作为DWS层数据
8.2、ADS层
1)建表语句
hive (gmall)>
drop table if exists ads_back_count;
create external table ads_back_count(
`dt` string COMMENT '统计日期',
`wk_dt` string COMMENT '统计日期所在周',
`wastage_count` bigint COMMENT '回流设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_back_count';
8.3、编写脚本
[atguigu@hadoop103 myscripts]$ vim ads_back_log.sh
#!/bin/bash
#统计本周回流用户数,导入ads_back_count表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
with t1 as
(select
mid_id
from dws_uv_detail_wk
where wk_dt =concat(date_sub(next_day('$do_date','mo'),7),'-',date_sub(next_day('$do_date','mo'),1))),
t2 as
(select
mid_id
from dws_new_mid_day
where create_date BETWEEN date_sub(next_day('$do_date','mo'),7) and '$do_date'),
t3 as
(select
mid_id
from dws_uv_detail_wk
where wk_dt =concat(date_sub(next_day('$do_date','mo'),14),'-',date_sub(next_day('$do_date','mo'),8)))
insert into table gmall.ads_back_count
select
'$do_date',
concat(date_sub(next_day('$do_date','mo'),7),'-',date_sub(next_day('$do_date','mo'),1)),
count(*)
from
t1 left join t2 on t1.mid_id=t2.mid_id
left join t3 on t1.mid_id=t3.mid_id
where t2.mid_id is null and t3.mid_id is null
"
hive -e "$sql"
9、需求六:流失用户数
流失用户:最近7天未登录应用的用户,我们称之为流失用户
9.1、DWS层
使用日活明细表dws_uv_detail_day作为DWS层数据
9.2、ADS层
1)建表语句
hive (gmall)>
drop table if exists ads_wastage_count;
create external table ads_wastage_count(
`dt` string COMMENT '统计日期',
`wastage_count` bigint COMMENT '流失设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_wastage_count';
9.3、编写脚本
[atguigu@hadoop103 myscripts]$ vim ads_wastage_log.sh
#!/bin/bash
#统计流失用户数,导入ads_wastage_count表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert into table gmall.ads_wastage_count
select
'$do_date',
count(*)
from
(select
mid_id
from gmall.dws_uv_detail_day
where dt<='$do_date'
group by mid_id
having max(dt) < date_sub('$do_date',7)) tmp
"
hive -e "$sql"
10、需求七:最近连续三周活跃用户数
最近3周连续活跃的用户:通常是周一对前3周的数据做统计,该数据一周计算一次。
10.1、DWS层
使用周活明细表dws_uv_detail_wk作为DWS层数据
10.2、ADS 层
1)建表语句
hive (gmall)>
drop table if exists ads_continuity_wk_count;
create external table ads_continuity_wk_count(
`dt` string COMMENT '统计日期,一般用结束周周日日期,如果每天计算一次,可用当天日期',
`wk_dt` string COMMENT '持续时间',
`continuity_count` bigint
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_continuity_wk_count';
11、需求八:最近七天内连续三天活跃用户数
11.1、DWS层
使用日活明细表dws_uv_detail_day作为DWS层数据
11.2、ADS 层
1)建表语句
hive (gmall)>
drop table if exists ads_continuity_uv_count;
create external table ads_continuity_uv_count(
`dt` string COMMENT '统计日期',
`wk_dt` string COMMENT '最近7天日期',
`continuity_count` bigint
) COMMENT '连续活跃设备数'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_continuity_uv_count';
11.3、编写脚本
[atguigu@hadoop103 myscripts]$ vim ads_continuity_log.sh
#!/bin/bash
#统计最近七天内连续三天活跃的用户数,导入ads_continuity_uv_count表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert into table ads_continuity_uv_count
select
'$do_date',
concat(date_sub('$do_date',7),'-','$do_date' ),
count(DISTINCT mid_id)
from
(select
mid_id
from
(SELECT
mid_id,dt,ROW_NUMBER() OVER(PARTITION by mid_id order by dt )rn,
date_sub(dt,ROW_NUMBER() OVER(PARTITION by mid_id order by dt))diff
from dws_uv_detail_day
where dt BETWEEN date_sub('$do_date',7) and '$do_date') tmp
group by mid_id,diff
having count(*)>=3) tmp2
"
hive -e "$sql"
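说明:date_sub(dt,rn)的作用是把连续的日期映射成同一个值。例如某设备在2021-08-01、2021-08-02、2021-08-03三天都活跃(示例数据):
dt            rn    date_sub(dt,rn)
2021-08-01    1     2021-07-31
2021-08-02    2     2021-07-31
2021-08-03    3     2021-07-31
三行的diff相同,按mid_id,diff分组后count(*)>=3,即可判定该设备最近七天内连续三天活跃。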
12、需求九:每个用户累计访问次数
12.1、DWS层
~1.建表语句
hive (gmall)>
drop table if exists dws_user_total_count_day;
create external table dws_user_total_count_day(
`mid_id` string COMMENT '设备id',
`subtotal` bigint COMMENT '每日登录小计'
)
partitioned by(`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dws/dws_user_total_count_day';
~2.数据导入脚本
[atguigu@hadoop103 myscripts]$ vim dws_user_total_count_day.sh
#!/bin/bash
#统计每个设备每日登录次数,导入dws_user_total_count_day表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert overwrite table dws_user_total_count_day PARTITION(dt='$do_date')
SELECT
mid_id,
count(*) subtotal
FROM dwd_start_log
where dt='$do_date'
group by mid_id
;
"
hive -e "$sql"
12.2、ADS 层
~1.建表语句
hive (gmall)>
drop table if exists ads_user_total_count;
create external table ads_user_total_count(
`mid_id` string COMMENT '设备id',
`subtotal` bigint COMMENT '每日登录小计',
`total` bigint COMMENT '登录次数总计'
)
partitioned by(`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_total_count';
~2.数据导入脚本
[atguigu@hadoop103 myscripts]$ vim ads_user_total_count.sh
#!/bin/bash
#统计每个设备当日及累计登录次数,导入ads_user_total_count表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert overwrite table gmall.ads_user_total_count PARTITION(dt='$do_date')
select
t1.mid_id,
t1.subtotal,
t2.total
FROM
(select mid_id,subtotal
from gmall.dws_user_total_count_day
where dt='$do_date') t1
join
(select mid_id,sum(subtotal) total
from gmall.dws_user_total_count_day
where dt<='$do_date'
group by mid_id) t2
on t1.mid_id=t2.mid_id
"
hive -e "$sql"
13、需求十:新收藏用户数
新收藏用户:指的是在某天首次添加收藏的用户
13.1、DWS层建立用户日志行为宽表
~1.建表语句
考虑到后面的多个需求会同时用到多张表中的数据,如果每次都做join操作,会影响查询的效率,可以先提前做一张宽表,提高其他查询的执行效率。
宽表中记录每个用户对每个商品的点击次数、点赞次数、收藏次数。
hive (gmall)>
drop table if exists dws_user_action_wide_log;
CREATE EXTERNAL TABLE dws_user_action_wide_log(
`mid_id` string COMMENT '设备id',
`goodsid` string COMMENT '商品id',
`display_count` string COMMENT '点击次数',
`praise_count` string COMMENT '点赞次数',
`favorite_count` string COMMENT '收藏次数')
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_action_wide_log/'
TBLPROPERTIES('parquet.compression'='lzo');
~2.数据导入脚本
[atguigu@hadoop103 myscripts]$ vim dws_user_action_wide_log.sh
#!/bin/bash
#汇总每个设备对每个商品的点击、点赞、收藏次数,导入dws_user_action_wide_log宽表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert overwrite table gmall.dws_user_action_wide_log PARTITION(dt='$do_date')
select
mid_id,
goodsid,
sum(display_count),
sum(praise_count),
sum(favorite_count)
from
(select
mid_id,goodsid ,count(*) display_count,0 praise_count,0 favorite_count
from gmall.dwd_display_log
where dt='$do_date'
group by mid_id ,goodsid
union all
select
mid_id,target_id ,0 display_count,count(*) praise_count,0 favorite_count
FROM gmall.dwd_praise_log
where dt='$do_date'
group by mid_id,target_id
union all
select
mid_id,course_id ,0 display_count,0 praise_count,count(*) favorite_count
FROM gmall.dwd_favorites_log
where dt='$do_date'
group by mid_id,course_id) tmp
group by mid_id,goodsid
"
hive -e "$sql"
13.2、DWS层
使用日志数据用户行为宽表作为DWS层表
13.3、ADS 层
~1.建表语句
drop table if exists ads_new_favorites_mid_day;
create external table ads_new_favorites_mid_day(
`dt` string COMMENT '日期',
`favorites_users` bigint COMMENT '新收藏用户数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_new_favorites_mid_day';
~2.数据导入脚本
[atguigu@hadoop103 myscripts]$ vim ads_new_favorites_mid_day.sh
#!/bin/bash
#统计新收藏用户数,导入ads_new_favorites_mid_day表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert into table ads_new_favorites_mid_day
select
'$do_date',
count(*)
from
(select
mid_id
from dws_user_action_wide_log
where favorite_count >0
group by mid_id
having min(dt)='$do_date') tmp
"
hive -e "$sql"
14、需求十一:各个商品总点击次数top3的用户
14.1、DWS层
使用日志数据用户行为宽表作为DWS层表
14.2、ADS 层
~1.建表语句
drop table if exists ads_goods_count;
create external table ads_goods_count(
`dt` string COMMENT '统计日期',
`goodsid` string COMMENT '商品',
`user_id` string COMMENT '用户',
`goodsid_user_count` bigint COMMENT '商品用户点击次数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_goods_count';
~2.数据导入脚本
[atguigu@hadoop103 myscripts]$ vim ads_goods_count.sh
#!/bin/bash
#统计各商品点击次数top3的用户,导入ads_goods_count表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert into table ads_goods_count
select
'$do_date', goodsid,mid_id,totalCount
from
(select goodsid,mid_id,totalCount,
RANK() over(PARTITION by goodsid order by totalCount desc )rn
from
(SELECT
goodsid,mid_id,sum(display_count) totalCount
from dws_user_action_wide_log
where dt<='$do_date' and display_count>0
GROUP by goodsid,mid_id) t1) t2
where rn<=3
"
hive -e "$sql"
15、需求十二:统计每日各类别下点击次数top10的商品
15.1、DWS层
使用点击日志表作为DWS层数据源
15.2、ADS 层
~1.建表语句
drop table if exists ads_goods_display_top10;
create external table ads_goods_display_top10 (
`dt` string COMMENT '日期',
`category` string COMMENT '品类',
`goodsid` string COMMENT '商品id',
`goods_count` string COMMENT '商品点击次数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_goods_display_top10';
~2.数据导入脚本
[atguigu@hadoop103 myscripts]$ vim ads_goods_display_top10.sh
#!/bin/bash
#统计每日各类别下点击次数top10的商品,导入ads_goods_display_top10表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert into table ads_goods_display_top10
select
'$do_date',category,goodsid,goods_count
from
(select
category,goodsid,goods_count,
RANK() over(PARTITION by category order by goods_count desc)rn
from
(select
category,goodsid,count(*) goods_count
from dwd_display_log
where dt='$do_date'
group by category,goodsid) t1) t2
where rn <=10
"
hive -e "$sql"
16、需求十三:总点击次数最多的10个用户点击的各个商品次数
16.1、DWS层
使用日志数据用户行为宽表作为DWS层表
16.2、ADS 层
~1.建表语句
drop table if exists ads_goods_user_count;
create external table ads_goods_user_count(
`dt` string COMMENT '统计日期',
`mid_id` string COMMENT '用户id',
`u_ct` string COMMENT '用户总点击次数',
`goodsid` string COMMENT '商品id',
`d_ct` string COMMENT '各个商品点击次数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_goods_user_count';
~2.数据导入脚本
[atguigu@hadoop103 myscripts]$ vim ads_goods_user_count.sh
#!/bin/bash
#统计总点击次数最多的10个用户点击的各商品次数,导入ads_goods_user_count表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert into TABLE ads_goods_user_count
select
'$do_date',t1.mid_id,u_ct,goodsid,d_ct
from
(select
mid_id,sum(display_count) u_ct
from dws_user_action_wide_log
where dt<='$do_date'
GROUP by mid_id
order by u_ct desc
limit 10) t1
join
(select
mid_id,goodsid,sum(display_count) d_ct
from dws_user_action_wide_log
where dt<='$do_date'
group by mid_id,goodsid) t2
on t1.mid_id=t2.mid_id
where d_ct>0
"
hive -e "$sql"
17、需求十四:月活跃率
17.1、DWS层
使用DWS层月活表以及ADS新增用户表作为DWS层
17.2、ADS 层
~1.建表语句
drop table if exists ads_mn_ratio_count;
create external table ads_mn_ratio_count(
`dt` string COMMENT '统计日期',
`mn` string COMMENT '统计月活跃率的月份',
`ratio` string COMMENT '活跃率'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_mn_ratio_count';
~2.数据导入脚本
[atguigu@hadoop103 myscripts]$ vim ads_mn_ratio_count.sh
#!/bin/bash
#统计月活跃率,导入ads_mn_ratio_count表
#接受要导入数据的日期参数;-n判断后面的参数是否非空:非空返回true,为空返回false
if [ -n "$1" ]
then
#默认日期为昨天的
do_date=$1
else
do_date=$(date -d yesterday +%F)
fi
echo ===日志日期为 $do_date===
sql="
use gmall;
insert into table ads_mn_ratio_count
select
'$do_date',
date_format('$do_date','yyyy-MM'),
cast(mn_count/totalCount * 100 as decimal(10,2))
from
(SELECT
mn_count
from ads_uv_count
where dt='$do_date' ) t1
join
(SELECT
sum(new_mid_count) totalCount
from ads_new_mid_count
where create_date <= '$do_date') t2
"
hive -e "$sql"