2.2 连续3天登录用户
1. lead(2,9999)over()
user_id | date | lead2 | date_diff |
1001 | 2024/1/1 | 2024/1/3 | 2 |
1001 | 2024/1/2 | 2024/1/5 | 3 |
1001 | 2024/1/3 | 2024/1/6 | 3 |
1001 | 2024/1/5 | 9999/12/31 | xx |
1002 | 2024/1/6 | 9999/12/31 | xx |
where date_diff=3 筛选连续3天
distinct uiser_id
2. rank over()
user_id | date | rank | date_diff |
1001 | 2024/1/1 | 1 | 2023/12/31 |
1001 | 2024/1/2 | 2 | 2023/12/31 |
1001 | 2024/1/3 | 3 | 2023/12/31 |
1001 | 2024/1/5 | 4 | 2024/1/1 |
1002 | 2024/1/6 | 5 | 2024/1/1 |
distinct user_id
sum() over(date_diff) >=3
3. count() over( range betetween 1d preceding and 1 d following)
count 出前后在1d 范围内的数据
user_id | date | count |
1001 | 2024/1/1 | 2 |
1001 | 2024/1/2 | 3 |
1001 | 2024/1/3 | 3 |
1001 | 2024/1/5 | 2 |
1002 | 2024/1/6 | 2 |
distinct user_id
count>=3
2.3 查询各品类销售商品的种类数及销量最高的商品
品类 | 种类id | rank over(sum)(销量) | count over(销量) |
1 | 1 | 1 | 2 |
1 | 2 | 2 | 2 |
2 | 3 | 1 | 3 |
2 | 4 | 2 | 3 |
2 | 5 | 3 | 3 |
count over 将每个品类下的销售种类数添加在每一行后
select category_id,
category_name,
sku_id,
name,
order_num,
sku_cnt
from (
select od.sku_id,
sku.name,
sku.category_id,
cate.category_name,
order_num,
rank() over (partition by sku.category_id order by order_num desc) rk,
count(distinct od.sku_id) over (partition by sku.category_id) sku_cnt
from (
select sku_id,
sum(sku_num) order_num
from order_detail
group by sku_id
) od
left join
sku_info sku
on od.sku_id = sku.sku_id
left join
category_info cate
on sku.category_id = cate.category_id
) t1
where rk = 1;
2.14 向用户推荐朋友收藏的商品
现需要请向所有用户推荐其朋友收藏但是用户自己未收藏的商品,请从好友关系表(friendship_info)和收藏表(favor_info)中查询出应向哪位用户推荐哪些商品。期望结果如下
1. 查找朋友关系;union
2. left join 朋友喜欢的 friend_favor
3. left join 自己喜欢的 user_favor
4. 筛选自己喜欢是null的数据
select
distinct t1.user_id,
friend_favor.sku_id
from
(
select
user1_id user_id,
user2_id friend_id
from friendship_info
union
select
user2_id,
user1_id
from friendship_info
)t1
left join favor_info friend_favor
on t1.friend_id=friend_favor.user_id
left join favor_info user_favor
on t1.user_id=user_favor.user_id
and friend_favor.sku_id=user_favor.sku_id
where user_favor.sku_id is null;
2.40 同时在线最多的人数
-- 登录标记1 下线标记-1
select
login_ts l_time,
1 flag
from
user_login_detail
union
select
logout_ts l_time,
-1 flag
from
user_login_detail
-- 按照时间求和
select
sum(flag)over(order by t1.l_time) sum_l_time
from
(
select
login_ts l_time,
1 flag
from
user_login_detail
union
select
logout_ts l_time,
-1 flag
from
user_login_detail
)t1
-- 拿到最大值 就是同时在线最多人数
select
max(sum_l_time)
from
(
select
sum(flag)over(order by t1.l_time) sum_l_time
from
(
select
login_ts l_time,
1 flag
from
user_login_detail
union
select
logout_ts l_time,
-1 flag
from
user_login_detail
)t1
)t2
题目
2.10 查询销售件数高于品类平均数的商品
从订单明细表(order_detail)中查询累积销售件数高于其所属品类平均数的商品
select sku_id,
name,
sum_num,
cate_avg_num
from (
select od.sku_id,
category_id,
name,
sum_num,
avg(sum_num) over (partition by category_id) cate_avg_num
from (
select sku_id,
sum(sku_num) sum_num
from order_detail
group by sku_id
) od
left join
(
select sku_id,
name,
category_id
from sku_info
) sku
on od.sku_id = sku.sku_id
) t1
where sum_num > cate_avg_num;
2.15 查询所有用户的连续登录两天及以上的日期区间
select user_id,
min(login_date) start_date,
max(login_date) end_date
from (
select user_id,
login_date,
date_sub(login_date, rn) flag
from (
select user_id,
login_date,
row_number() over (partition by user_id order by login_date) rn
from (
select user_id,
date_format(login_ts, 'yyyy-MM-dd') login_date
from user_login_detail
group by user_id, date_format(login_ts, 'yyyy-MM-dd')
) t1
) t2
) t3
group by user_id, flag
having count(*) >= 2;
2.18 购买过商品1和商品2但是没有购买商品3的顾客
select user_id
from (
select user_id,
collect_set(sku_id) skus
from order_detail od
left join
order_info oi
on od.order_id = oi.order_id
group by user_id
) t1
where array_contains(skus, '1')
and array_contains(skus, '2')
and !array_contains(skus, '3');
2.22 查询相同时刻多地登陆的用户
select
distinct t2.user_id
from
(
select
t1.user_id,
if(t1.max_logout is null ,2,if(t1.max_logout<t1.login_ts,1,0)) flag
from
(
select
user_id,
login_ts,
logout_ts,
max(logout_ts)over(partition by user_id order by login_ts rows between unbounded preceding and 1 preceding) max_logout
from
user_login_detail
)t1
)t2
where
t2.flag=0
2.29 求出商品连续售卖的时间区间--同2.15可用groupby
Sku_id(商品id) |
Start_date(起始时间) |
End_date(结束时间) |
1 |
2021-09-27 |
2021-09-27 |
1 |
2021-09-30 |
2021-10-01 |
1 |
2021-10-03 |
2021-10-08 |
10 |
2021-10-02 |
2021-10-03 |
10 |
2021-10-05 |
2021-10-08 |
-- 每个商品售卖的日期以及拿到按排序后日期的差值
select
sku_id,
create_date,
date_sub(create_date,rank()over(partition by sku_id order by create_date)) ddrk
from
order_detail
group by
sku_id,create_date
-- 拿到每次售卖的区间
select
distinct
sku_id,
first_value(t1.create_date)over(partition by t1.sku_id,t1.ddrk order by t1.create_date rows between unbounded preceding and unbounded following) start_date,
last_value(t1.create_date)over(partition by t1.sku_id,t1.ddrk order by t1.create_date rows between unbounded preceding and unbounded following) end_date
from
(
select
sku_id,
create_date,
date_sub(create_date,rank()over(partition by sku_id order by create_date)) ddrk
from
order_detail
group by
sku_id,create_date
)t1
2.33 查看每件商品的售价涨幅情况
-- 对每个商品按照修改日期倒序排序 并求出差值
select
sku_id,
new_price-lead(new_price,1,0)over(partition by sku_id order by change_date desc) price_change,
rank()over(partition by sku_id order by change_date desc) rk
from
sku_price_modify_detail t1
-- 最近一次修改的价格
select
t1.sku_id,
t1.price_change
from
(
select
sku_id,
new_price-lead(new_price,1,0)over(partition by sku_id order by change_date desc) price_change,
rank()over(partition by sku_id order by change_date desc) rk
from
sku_price_modify_detail
)t1
where
rk=1
order by
t1.price_change
2.34 销售订单首购和次购分析
select
distinct oi.user_id,
first_value(od.create_date)over(partition by oi.user_id order by od.create_date rows between unbounded preceding and unbounded following ) first_date,
last_value(od.create_date)over(partition by oi.user_id order by od.create_date rows between unbounded preceding and unbounded following ) last_date,
count(*)over(partition by oi.user_id order by od.create_date rows between unbounded preceding and unbounded following) cn
from
order_info oi
join
order_detail od
on
oi.order_id=od.order_id
join
sku_info si
on
od.sku_id=si.sku_id
where
si.name in('xiaomi 10','apple 12','xiaomi 13')
2.38 连续签到领金币数
用户每天签到可以领1金币,并可以累计签到天数,连续签到的第3、7天分别可以额外领2和6金币。
每连续签到7天重新累积签到天数。
从用户登录明细表中求出每个用户金币总数,并按照金币总数倒序排序
-- 求连续并标志是连续的第几天
select
t1.user_id,
t1.login_date,
date_sub(t1.login_date,t1.rk) login_date_rk,
count(*)over(partition by t1.user_id, date_sub(t1.login_date,t1.rk) order by t1.login_date) counti_cn
from
(
select
user_id,
date_format(login_ts,'yyyy-MM-dd') login_date,
rank()over(partition by user_id order by date_format(login_ts,'yyyy-MM-dd')) rk
from
user_login_detail
group by
user_id,date_format(login_ts,'yyyy-MM-dd')
)t1
--求出金币数量,以及签到奖励的金币数量
select
t2.user_id,
max(t2.counti_cn)+sum(if(t2.counti_cn%3=0,2,0))+sum(if(t2.counti_cn%7=0,6,0)) coin_cn
from
(
select
t1.user_id,
t1.login_date,
date_sub(t1.login_date,t1.rk) login_date_rk,
count(*)over(partition by t1.user_id, date_sub(t1.login_date,t1.rk) order by t1.login_date) counti_cn
from
(
select
user_id,
date_format(login_ts,'yyyy-MM-dd') login_date,
rank()over(partition by user_id order by date_format(login_ts,'yyyy-MM-dd')) rk
from
user_login_detail
group by
user_id,date_format(login_ts,'yyyy-MM-dd')
)t1
)t2
group by
t2.user_id,t2.login_date_rk
-- 求出每个用户的金币总数
select
t3.user_id,
sum(t3.coin_cn) sum_coin_cn
from
(
select
t2.user_id,
max(t2.counti_cn)+sum(if(t2.counti_cn%3=0,2,0))+sum(if(t2.counti_cn%7=0,6,0)) coin_cn
from
(
select
t1.user_id,
t1.login_date,
date_sub(t1.login_date,t1.rk) login_date_rk,
count(*)over(partition by t1.user_id, date_sub(t1.login_date,t1.rk) order by t1.login_date) counti_cn
from
(
select
user_id,
date_format(login_ts,'yyyy-MM-dd') login_date,
rank()over(partition by user_id order by date_format(login_ts,'yyyy-MM-dd')) rk
from
user_login_detail
group by
user_id,date_format(login_ts,'yyyy-MM-dd')
)t1
)t2
group by
t2.user_id,t2.login_date_rk
)t3
group by
t3.user_id
order by
sum_coin_cn desc
高级题目
2 会话划分问题
user_id |
page_id |
view_timestamp |
100 |
home |
1659950435 |
100 |
good_search |
1659950446 |
100 |
good_list |
1659950457 |
规定若同一用户的相邻两次访问记录时间间隔小于60s,则认为两次浏览记录属于同一会话。现有如下需求,为属于同一会话的访问记录增加一个相同的会话id字段,期望结果
user_id |
page_id |
view_timestamp |
session_id |
100 |
home |
1659950435 |
100-1 |
100 |
good_search |
1659950446 |
100-1 |
100 |
good_list |
1659950457 |
100-1 |
100 |
home |
1659950541 |
100-2 |
100 |
good_detail |
1659950552 |
100-2 |
select user_id,
page_id,
view_timestamp,
concat(user_id, '-', sum(session_start_point) over (partition by user_id order by view_timestamp)) session_id
from (
select user_id,
page_id,
view_timestamp,
if(view_timestamp - lagts >= 60, 1, 0) session_start_point
from (
select user_id,
page_id,
view_timestamp,
lag(view_timestamp, 1, 0) over (partition by user_id order by view_timestamp) lagts
from page_view_events
) t1
) t2;
3 间断连续登录用户问题(炸裂/会话)
现要求统计各用户最长的连续登录天数,间断一天也算作连续
3.1 补充为array然后炸裂
[d1,d2] |
d1 d2 |
-- 把日期炸裂开
SELECT
user_id
,login_data
,new_login_date
from(
-- 中间差一天的构成array
select
user_id
,login_data
,if (datediff(next_login_date,login_date)=2,array(login_data,date_add(login_data,1)),array(login_data)) as arr
from (
select
user_id
,login_data
,lead(login_data,1,'9999-12-31') over (partition by user_id order by login_data) next_login_date
from table
)t2
)t3 lateral view explode(arr) temp as new_login_date
3.2 打flag,时间求差
select
user_id,
max(recent_days) max_recent_days --求出每个用户最大的连续天数
from
(
select
user_id,
user_flag,
datediff(max(login_date),min(login_date)) + 1 recent_days --按照分组求每个用户每次连续的天数(记得加1)
-- 连续登录的天数
from
(
select
user_id,
login_date,
lag1_date,
concat(user_id,'_',flag_sum) user_flag --拼接用户和标签分组
from
(
select
user_id,
login_date,
lag1_date,
sum(flag) over(partition by user_id order by login_date) flag_sum
from(
select
user_id,
login_date,
lag1_date,
if(datediff(login_date,lag1_date)>2,1,0)) flag --获取大于2的标签
from
(
select
user_id,
login_date,
lag(login_date,1,'1970-01-01') over(partition by user_id order by login_date) lag1_date --获取上一次登录日期
from table
)t1
)t2
)t3
)t4
group by user_id,user_flag
)t5
group by user_id;
4 日期交叉问题
现要求统计每个品牌的优惠总天数,若某个品牌在同一天有多个优惠活动,则只按一天计算。
4.1 获取剔除重复的date,求差
select
brand,
sum(datediff(end_date,start_date)+1) promotion_day_count
-- 统计时间区间的长度
from
(
select
brand,
max_end_date,
if(max_end_date is null or start_date>max_end_date,start_date,date_add(max_end_date,1)) start_date,
-- 前一条结束时间在开始时间之后 或是第一条
end_date
from
(
select
brand,
start_date,
end_date,
max(end_date) over(partition by brand order by start_date rows between unbounded preceding and 1 preceding) max_end_date
-- 前一条的结束时间
from promotion_info
)t1
)t2
where end_date>start_date
group by brand;
4.2 扩充时间后炸裂去重
-- 日期去重
select
brand
,count(distinct event_date) as cnt
from(
-- 形成时间长度表
select
id
,brand
,start_date
,end_date
,date_add(start_date,pos) event_date
from (
-- 炸裂,形成时间长度表
select
id
,brand
,start_date
,end_date
,diff
,arr
,item -- 空
,pos -- 下标(rank)
from(
-- 获取需要炸裂行数的array,eg ['','','']
select
id
,brand
,start_date
,end_date
,datediff(end_date,start_date) + 1 as diff
, split(repeat(',',diff),',') -- 重复diff遍 按照,分割
from (
-- 获取连续日期的长度
select
id
,brand
,start_date
,end_date
,datediff(end_date,start_date) + 1 as diff
from table
) t1
)t2 lateral view posexplode(arr) tmp as pos,item
)t3
)t4
group by brand
其他
留存率
数据收缩:断点分组 拼接
炸裂函数
‘5-13’的起始和终止-5,3
select
min(t1.b)
,max(t1.b)
from xxx
lateral view explode(split(a,'-'))t1 as b
炸裂补全
根据某列(diff)数值炸裂补全 a
lateral view posexplode(split(repeate(',' cast(diff as int)),','))tem as pos,val
-- cast 转类型;
层级结构、状态标记;同2.18
select
oid
,sum(if array_contains(origin,old),1,0) nums
-- 是否在array内
from(
select
id
,oid
,collect_set(if(oid=0, id ,null)) over() as origin
-- 构成原创文章array
from table
)t1
where oid <>0
函数
regexp_replace:替换;regexp_replace('2020/02/14','/','-')
unix_timestamp:转为时间戳(s),由于datediff仅date类型加减,用timestamp进行运算后返回date_format