首先感谢以下作者的著作:https://blog.youkuaiyun.com/qq_36039236/article/details/108219895
https://zhuanlan.zhihu.com/p/99139680
https://zhuanlan.zhihu.com/p/373033198
排版可能是不太好,因为我先记在我的语雀笔记里的,但想到也是优快云教我这么多,回馈优快云,因此二次发在这里。
我这边对语句进行了更新,话不多说,开始吧。
1.建表语句
• 源数据,文件中的字段以英文逗号(,)分隔
• 这里数据重复了,但不要去重哈
id,date
A,2018-09-04
B,2018-09-04
C,2018-09-04
A,2018-09-05
A,2018-09-05
C,2018-09-05
A,2018-09-06
B,2018-09-06
C,2018-09-06
A,2018-09-04
B,2018-09-04
C,2018-09-04
A,2018-09-05
A,2018-09-05
C,2018-09-05
A,2018-09-06
B,2018-09-06
C,2018-09-06
• 插入语句
-- Seed the demo data (duplicates kept on purpose — the queries below must dedupe).
-- String literals changed from double to single quotes: in ANSI SQL double quotes
-- denote identifiers, so single quotes are the portable/correct literal syntax
-- (and Hive accepts both).
insert into table tb_use values
('A','2018-09-04')
,('B','2018-09-04')
,('C','2018-09-04')
,('A','2018-09-05')
,('A','2018-09-05')
,('C','2018-09-05')
,('A','2018-09-06')
,('B','2018-09-06')
,('C','2018-09-06')
,('A','2018-09-04')
,('B','2018-09-04')
,('C','2018-09-04')
,('A','2018-09-05')
,('A','2018-09-05')
,('C','2018-09-05')
,('A','2018-09-06')
,('B','2018-09-06')
,('C','2018-09-06');
• 设置本地模式
-- Let Hive run small jobs in local (in-process) mode instead of submitting to the cluster
set hive.exec.mode.local.auto=true;
• 在hive中创建表
-- Recreate the demo table. IF EXISTS makes the script idempotent, and the two
-- original CREATE statements (an annotated template plus a "used" copy) are
-- merged into one: running both would fail with "table already exists", and the
-- template used the reserved keyword `date` as a column name while every query
-- below references `datee`.
drop table if exists tb_use;
create table tb_use(
id string,
datee string -- "datee" because `date` is a reserved keyword in Hive
)
-- partitioned by (daystr string) -- optional partition spec; must precede the row format clauses
row format delimited fields terminated by ',' -- FIELD (column) separator
lines terminated by '\n' -- LINE (row) separator; '\n' is the default
-- NOTE(review): the original comments on the two lines above were swapped
-- (fields terminated = column separator, lines terminated = row separator).
stored as textfile -- storage format; textfile is the default (others: SequenceFile, RCFile, ORCFile)
-- location 'hdfs_path' -- optional explicit HDFS location
;
• 从本地加载数据到表中
-- Load the comma-separated file from the local filesystem;
-- OVERWRITE replaces any data already in tb_use
load data local inpath '/export/data/test.txt' overwrite into table tb_use;
2.基础知识
• 将日期格式的字符串转为日期格式
-- Convert a date-formatted string into the DATE type
select cast('2018-09-05' as date) ;
-- 结果:
2018-09-05
• 日期增加函数:date_add(string startdate, int days)
-- Add 10 days to the given date
select date_add('2016-12-08',10);
-- 结果:
2016-12-18
• 日期减少函数:date_sub (string startdate, int days)
-- Subtract 10 days from the given date
select date_sub('2016-12-08',10);
-- 结果:
2016-11-28
• lead函数(第三个参数是查不到后续记录时返回的默认值,这里设为-1;若不指定,默认为null)
-- LEAD with the default (NULL) when there is no following row in the partition
select id ,datee,
lead(datee,1) over(partition by id order by datee desc ) as date1
from tb_use ;
-- LEAD with an explicit default of -1 when there is no following row;
-- GROUP BY id, datee dedupes repeated logins on the same day
select id ,datee,
lead(datee,1,-1) over(partition by id order by datee desc ) as date1
from tb_use
group by id,datee ;
3.解法1:lead搭配date_sub函数
• 展现连续登陆两天的用户信息(用窗口函数分析方法)
-- Solution 1: users who logged in on two consecutive days.
-- With the partition ordered DESC, date1 is the user's previous login day,
-- so "consecutive" means datee minus one day equals date1.
select distinct t.id as c1
from (
    select
        id,
        datee,
        lead(datee, 1, -1) over (partition by id order by datee desc) as date1
    from tb_use
    group by id, datee -- dedupe multiple logins on the same day
) t
where date_sub(cast(t.datee as date), 1) = cast(t.date1 as date);
• 展现连续登陆两天的用户id(用窗口函数分析方法)
-- Hive version: COUNT of users having at least one pair of consecutive login days
select count(distinct t.id) as c1
from (
    select
        id,
        datee,
        lead(datee, 1, -1) over (partition by id order by datee desc) as date1
    from tb_use
    group by id, datee -- 1. dedupe here to guard against repeated same-day logins
) t
where date_sub(cast(t.datee as date), 1) = cast(t.date1 as date);
-- MySQL version: list the qualifying user ids.
-- DISTINCT added: without it a user shows up once per consecutive pair
-- (e.g. A with streaks 09-04→09-05 and 09-05→09-06 was listed twice),
-- matching the count(distinct ...) used by the Hive version above.
select distinct
b.id as c1
from
(
select id ,datee,
lead(datee,1,-1) over(partition by id order by datee desc ) as date1
from tb_use as a
group by id,datee -- dedupe multiple logins on the same day
) as b
where date_sub(cast(b.datee as date),1)=cast(b.date1 as date);
4.解法2:row_number搭配date_sub函数
解法网址:https://www.zhihu.com/people/chen-zhi-13-51
感谢这位作者,托他的福我才学会的
-- Solution 2 (MySQL): consecutive days via row_number.
-- DISTINCT added so a user with several separate streaks is listed once.
SELECT DISTINCT id
FROM (
    -- step3: subtract the row number from the login date; every row of the
    -- same consecutive streak collapses onto the same anchor date ds
    SELECT *
        ,DATE_SUB(datee,INTERVAL rn DAY) AS ds
    FROM (
        -- step2: number each user's deduped login days in date order
        SELECT *
            ,row_number() over(PARTITION BY id ORDER BY datee) AS rn
        FROM (
            -- step1: dedupe per (id, day) since a user can log in several times a day;
            -- grouped columns listed explicitly (SELECT * with GROUP BY is
            -- rejected under ONLY_FULL_GROUP_BY)
            SELECT id
                ,datee
            FROM tb_use
            GROUP BY id
                ,datee
        ) tmp1
    ) tmp2
) tmp3
GROUP BY id
    ,ds
-- streak length of at least 2 days; the original `> 2` required 3+ consecutive
-- days, inconsistent with the "two consecutive days" problem solved in section 3
HAVING count(*) >= 2;