1.
我们有如下的用户访问数据
userId | visitDate | visitCount |
---|---|---|
u01 | 2017/1/21 | 5 |
u02 | 2017/1/23 | 6 |
u03 | 2017/1/22 | 8 |
u04 | 2017/1/20 | 3 |
u01 | 2017/1/23 | 6 |
u01 | 2017/2/21 | 8 |
u02 | 2017/1/23 | 6 |
u01 | 2017/2/22 | 4 |
要求使用SQL统计出每个用户的累积访问次数,如下表所示:
用户id | 月份 | 小计 | 累积 |
---|---|---|---|
u01 | 2017-01 | 11 | 11 |
u01 | 2017-02 | 12 | 23 |
u02 | 2017-01 | 12 | 12 |
u03 | 2017-01 | 8 | 8 |
u04 | 2017-01 | 3 | 3 |
数据:
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
u02 2017/1/23 6
u01 2017/2/22 4
1)创建表
create table action
(userId string,
visitDate string,
visitCount int)
row format delimited fields terminated by "\t";
load data local inpath '/opt/module/hive/datas/action' into table action
思路一:
a. 先处理日期格式
SELECT userId,
DATE_FORMAT(regexp_replace(visitDate,"/","-"),'yyyy-MM') visitDate , visitCount
FROM ACTION ==>t1
b. 根据用户id和月份分组,求小计
SELECT t1.userId, t1.visitDate, SUM(visitCount) xj
FROM t1
GROUP BY userId,visitDate ==>t2
组合:
SELECT t1.userId, t1.visitDate, SUM(visitCount) xj
FROM (SELECT userId,
DATE_FORMAT(regexp_replace(visitDate,"/","-"),'yyyy-MM') visitDate , visitCount
FROM ACTION)t1
GROUP BY userId,visitDate
+------------+---------------+-----+--+
| t1.userid | t1.visitdate | xj |
+------------+---------------+-----+--+
| u01 | 2017-01 | 11 |
| u01 | 2017-02 | 12 |
| u02 | 2017-01 | 12 |
| u03 | 2017-01 | 8 |
| u04 | 2017-01 | 3 |
+------------+---------------+-----+--+
c. 开窗,通过userId分区,visitData排序, 求累计
SELECT t2.userId,t2.visitDate, t2.xj ,
SUM(t2.xj) over(PARTITION BY t2.userId ORDER BY t2.visitDate) lj
FROM t2
组合:
SELECT t2.userId,t2.visitDate, t2.xj ,
SUM(t2.xj) over(PARTITION BY t2.userId ORDER BY t2.visitDate) lj
FROM (SELECT t1.userId, t1.visitDate, SUM(visitCount) xj
FROM (SELECT userId,
DATE_FORMAT(regexp_replace(visitDate,"/","-"),'yyyy-MM') visitDate , visitCount
FROM ACTION)t1
GROUP BY userId,visitDate)t2
+------------+---------------+--------+-----+--+
| t2.userid | t2.visitdate | t2.xj | lj |
+------------+---------------+--------+-----+--+
| u01 | 2017-01 | 11 | 11 |
| u01 | 2017-02 | 12 | 23 |
| u02 | 2017-01 | 12 | 12 |
| u03 | 2017-01 | 8 | 8 |
| u04 | 2017-01 | 3 | 3 |
+------------+---------------+--------+-----+--+
思路二:
SELECT tmp.userId, tmp.mon,
sum(visitCount) over(PARTITION by tmp.userid,tmp.mon ORDER BY tmp.mon) s_m
from(
SELECT userId, SUBSTR(regexp_replace(visitDate, '(\/)', '-'),0,6) mon,visitCount
from action
) tmp
+-------------+----------+------+
| tmp.userid | tmp.mon | s_m |
+-------------+----------+------+
| u01 | 2017-1 | 11 |
| u01 | 2017-1 | 11 |
| u01 | 2017-2 | 12 |
| u01 | 2017-2 | 12 |
| u02 | 2017-1 | 12 |
| u02 | 2017-1 | 12 |
| u03 | 2017-1 | 8 |
| u04 | 2017-1 | 3 |
+-------------+----------+------+
SELECT DISTINCT tmp.userId, tmp.mon,
sum(visitCount) over(PARTITION by tmp.userid,tmp.mon ORDER BY tmp.mon) s_m
from(
SELECT userId, SUBSTR(regexp_replace(visitDate, '(\/)', '-'),0,6) mon,visitCount
from action
) tmp ;
+-------------+----------+------+
| tmp.userid | tmp.mon | s_m |
+-------------+----------+------+
| u01 | 2017-1 | 11 |
| u01 | 2017-2 | 12 |
| u02 | 2017-1 | 12 |
| u03 | 2017-1 | 8 |
| u04 | 2017-1 | 3 |
+-------------+----------+------+
SELECT DISTINCT tmp.userId, tmp.mon,
sum(visitCount) over(PARTITION by tmp.userid,tmp.mon ORDER BY tmp.mon) s_m,
sum(visitCount) over(PARTITION by userid ORDER BY mon) s_t
from(
SELECT userId, SUBSTR(regexp_replace(visitDate, '(\/)', '-'),0,6) mon,visitCount
from action
) tmp ;
+-------------+----------+------+------+
| tmp.userid | tmp.mon | s_m | s_t |
+-------------+----------+------+------+
| u01 | 2017-1 | 11 | 11 |
| u01 | 2017-2 | 12 | 23 |
| u02 | 2017-1 | 12 | 12 |
| u03 | 2017-1 | 8 | 8 |
| u04 | 2017-1 | 3 | 3 |
+-------------+----------+------+------+
5 rows selected (5.437 seconds)
SELECT DISTINCT userId, mon,
sum(visitCount) over(PARTITION by userid,mon ORDER BY mon) s_m,
sum(visitCount) over(PARTITION by userid ORDER BY mon) s_t
from(
SELECT userId, SUBSTR(regexp_replace(visitDate, '(\/)', '-'),0,6) mon,visitCount
from action
) tmp ;
+---------+---------+------+------+
| userid | mon | s_m | s_t |
+---------+---------+------+------+
| u01 | 2017-1 | 11 | 11 |
| u01 | 2017-2 | 12 | 23 |
| u02 | 2017-1 | 12 | 12 |
| u03 | 2017-1 | 8 | 8 |
| u04 | 2017-1 | 3 | 3 |
+---------+---------+------+------+
思路三:
select userid, Date_format(regexp_replace(visitdate,"/","-"),"yyyy-MM") visitdate,sum(visitcount) visitcount
from action
group by visitdate,userid
+---------+------------+-------------+
| userid | visitdate | visitcount |
+---------+------------+-------------+
| u01 | 2017-01 | 5 |
| u01 | 2017-01 | 6 |
| u01 | 2017-02 | 8 |
| u01 | 2017-02 | 4 |
| u02 | 2017-01 | 12 |
| u03 | 2017-01 | 8 |
| u04 | 2017-01 | 3 |
+---------+------------+-------------+
select userid,visitdate,sum(visitcount) sum_count
from(
select userid, Date_format(regexp_replace(visitdate,"/","-"),"yyyy-MM") visitdate,sum(visitcount) visitcount
from action
group by visitdate,userid
)t1
group by visitdate,userid
+---------+------------+------------+
| userid | visitdate | sum_count |
+---------+------------+------------+
| u01 | 2017-01 | 11 |
| u02 | 2017-01 | 12 |
| u03 | 2017-01 | 8 |
| u04 | 2017-01 | 3 |
| u01 | 2017-02 | 12 |
+---------+------------+------------+
select userid,visitdate,sum_count,
sum(sum_count) over(partition by userid order by visitdate) lj
from(
select userid,visitdate,sum(visitcount) sum_count
from(
select userid, Date_format(regexp_replace(visitdate,"/","-"),"yyyy-MM") visitdate,sum(visitcount) visitcount
from action
group by visitdate,userid
)t1
group by visitdate,userid
)t2
+---------+------------+------------+-----+
| userid | visitdate | sum_count | lj |
+---------+------------+------------+-----+
| u01 | 2017-01 | 11 | 11 |
| u01 | 2017-02 | 12 | 23 |
| u02 | 2017-01 | 12 | 12 |
| u03 | 2017-01 | 8 | 8 |
| u04 | 2017-01 | 3 | 3 |
+---------+------------+------------+-----+
5 rows selected (4.88 seconds)
2.京东
有50W个京东店铺,每个顾客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计:
u1 a
u2 b
u1 b
u1 a
u3 c
u4 b
u1 a
u2 c
u5 b
u4 b
u6 c
u2 c
u1 b
u2 a
u2 a
u3 a
u5 a
u5 a
u5 a
建表:
create table visit(user_id string,shop string) row format delimited fields terminated by ‘\t’;
1)每个店铺的UV(访客数)
--每个店铺的UV(访客数)
--思路1:不去重
select shop,count(user_id)
from visit
group by shop
+-------+------+
| shop | _c1 |
+-------+------+
| a | 9 |
| b | 6 |
| c | 4 |
+-------+------+
--思路2:去重
SELECT shop ,COUNT(DISTINCT user_id) uv
FROM visit
GROUP BY shop
+-------+-----+
| shop | uv |
+-------+-----+
| a | 4 |
| b | 4 |
| c | 3 |
+-------+-----+
2)每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
方法一:
--每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
select shop,user_id,count(*) c_u
from visit
group by shop,user_id
order by shop,c_u desc
+-------+----------+------+
| shop | user_id | c_u |
+-------+----------+------+
| a | u1 | 3 |
| a | u5 | 3 |
| a | u2 | 2 |
| a | u3 | 1 |
| b | u1 | 2 |
| b | u4 | 2 |
| b | u2 | 1 |
| b | u5 | 1 |
| c | u2 | 2 |
| c | u3 | 1 |
| c | u6 | 1 |
+-------+----------+------+
select shop,user_id,c_u,
rank() over(partition by shop order by c_u desc) rs
from (
select shop,user_id,count(*) c_u
from visit
group by shop,user_id
)tmp_1
+-------+----------+------+-----+
| shop | user_id | c_u | rs |
+-------+----------+------+-----+
| a | u1 | 3 | 1 |
| a | u5 | 3 | 1 |
| a | u2 | 2 | 3 |
| a | u3 | 1 | 4 |
| b | u1 | 2 | 1 |
| b | u4 | 2 | 1 |
| b | u2 | 1 | 3 |
| b | u5 | 1 | 3 |
| c | u2 | 2 | 1 |
| c | u3 | 1 | 2 |
| c | u6 | 1 | 2 |
+-------+----------+------+-----+
select tmp_3.s_u,tmp_3.t,tmp_3.rt
from(
select tmp_2.s_u,tmp_2.t,
rank() over(partition by tmp_2.shop order by tmp_2.t desc) rt
from (
select tmp_1.shop,tmp_1.s_u,count(s_u) t
from (
select shop,concat_ws("_",shop,user_id) s_u
from visit
) tmp_1
group by tmp_1.s_u,tmp_1.shop
) tmp_2
)tmp_3
where tmp_3.rt<=3;
+-------+----------+------+-----+
| shop | user_id | c_u | rs |
+-------+----------+------+-----+
| a | u5 | 3 | 1 |
| a | u1 | 3 | 1 |
| a | u2 | 2 | 3 |
| b | u4 | 2 | 1 |
| b | u1 | 2 | 1 |
| b | u5 | 1 | 3 |
| b | u2 | 1 | 3 |
| c | u2 | 2 | 1 |
| c | u6 | 1 | 2 |
| c | u3 | 1 | 2 |
+-------+----------+------+-----+
10 rows selected (5.017 seconds)
方法二:
select concat_ws("_",shop,user_id)
from visit
+-------+
| _c0 |
+-------+
| a_u1 |
| b_u2 |
| b_u1 |
| a_u1 |
| c_u3 |
| b_u4 |
| a_u1 |
| c_u2 |
| b_u5 |
| b_u4 |
| c_u6 |
| c_u2 |
| b_u1 |
| a_u2 |
| a_u2 |
| a_u3 |
| a_u5 |
| a_u5 |
| a_u5 |
+-------+
select shop,s_u,count(s_u) t
from (
select shop,concat_ws("_",shop,user_id) s_u
from visit
) tmp_1
group by s_u,shop
+-------+-------+----+
| shop | s_u | t |
+-------+-------+----+
| a | a_u1 | 3 |
| a | a_u2 | 2 |
| a | a_u3 | 1 |
| a | a_u5 | 3 |
| b | b_u1 | 2 |
| b | b_u2 | 1 |
| b | b_u4 | 2 |
| b | b_u5 | 1 |
| c | c_u2 | 2 |
| c | c_u3 | 1 |
| c | c_u6 | 1 |
+-------+-------+----+
select *
from(
select tmp_2.s_u,tmp_2.t,
rank() over(partition by tmp_2.shop order by tmp_2.t desc) rt
from (
select tmp_1.shop,tmp_1.s_u,count(s_u) t
from (
select shop,concat_ws("_",shop,user_id) s_u
from visit
) tmp_1
group by tmp_1.s_u,tmp_1.shop
) tmp_2
)tmp_3
where tmp_3.rt<=3;
+------------+----------+-----------+
| tmp_3.s_u | tmp_3.t | tmp_3.rt |
+------------+----------+-----------+
| a_u5 | 3 | 1 |
| a_u1 | 3 | 1 |
| a_u2 | 2 | 3 |
| b_u4 | 2 | 1 |
| b_u1 | 2 | 1 |
| b_u5 | 1 | 3 |
| b_u2 | 1 | 3 |
| c_u2 | 2 | 1 |
| c_u6 | 1 | 2 |
| c_u3 | 1 | 2 |
+------------+----------+-----------+
10 rows selected (6.32 seconds)
3.蚂蚁金服
以下表记录了用户每天的蚂蚁森林低碳生活领取的记录流水。
table_name:user_low_carbon
user_id data_dt low_carbon
用户 日期 减少碳排放(g)
蚂蚁森林植物换购表,用于记录申领环保植物所需要减少的碳排放量
table_name: plant_carbon
plant_id plant_name low_carbon
植物编号 植物名 换购植物所需要的碳
----题目
1.蚂蚁森林植物申领统计
问题:假设2017年1月1日开始记录低碳数据(user_low_carbon),假设2017年10月1日之前满足申领条件的用户都申领了一颗p004-胡杨,
剩余的能量全部用来领取“p002-沙柳” 。
统计在10月1日累计申领“p002-沙柳” 排名前10的用户信息;以及他比后一名多领了几颗沙柳。
得到的统计结果如下表样式:
user_id plant_count less_count(比后一名多领了几颗沙柳)
u_101 1000 100
u_088 900 400
u_103 500 …
2、蚂蚁森林低碳用户排名分析
问题:查询user_low_carbon表中每日流水记录,条件为:
用户在2017年,连续三天(或以上)的天数里,
每天减少碳排放(low_carbon)都超过100g的用户低碳流水。
需要查询返回满足以上条件的user_low_carbon表中的记录流水。
例如用户u_002符合条件的记录如下,因为2017/1/2~2017/1/5连续四天的碳排放量之和都大于等于100g:
seq(key) user_id data_dt low_carbon
xxxxx10 u_002 2017/1/2 150
xxxxx11 u_002 2017/1/2 70
xxxxx12 u_002 2017/1/3 30
xxxxx13 u_002 2017/1/3 80
xxxxx14 u_002 2017/1/4 150
xxxxx14 u_002 2017/1/5 101
备注:统计方法不限于sql、procedure、python,java等
提供的数据说明:
user_low_carbon:
u_001 2017/1/1 10
u_001 2017/1/2 150
u_001 2017/1/2 110
u_001 2017/1/2 10
u_001 2017/1/4 50
u_001 2017/1/4 10
u_001 2017/1/6 45
u_001 2017/1/6 90
u_002 2017/1/1 10
u_002 2017/1/2 150
u_002 2017/1/2 70
u_002 2017/1/3 30
u_002 2017/1/3 80
u_002 2017/1/4 150
u_002 2017/1/5 101
u_002 2017/1/6 68
…
plant_carbon:
p001 梭梭树 17
p002 沙柳 19
p003 樟子树 146
p004 胡杨 215
…
1.创建表
create table user_low_carbon(user_id String,data_dt String,low_carbon int) row format delimited fields terminated by ‘\t’;
create table plant_carbon(plant_id string,plant_name String,low_carbon int) row format delimited fields terminated by ‘\t’;
2.加载数据
load data local inpath “/opt/module/hive/datas/user_low_carbon.txt” into table user_low_carbon;
load data local inpath “/opt/module/hive/datas/plant_carbon.txt” into table plant_carbon;
3.设置本地模式
set hive.exec.mode.local.auto=true;
+--------------------------+--------------------------+-----------------------------+
| user_low_carbon.user_id | user_low_carbon.data_dt | user_low_carbon.low_carbon |
+--------------------------+--------------------------+-----------------------------+
| u_001 | 2017/1/1 | 10 |
| u_001 | 2017/1/2 | 150 |
| u_001 | 2017/1/2 | 110 |
| u_001 | 2017/1/2 | 10 |
| u_001 | 2017/1/4 | 50 |
| u_001 | 2017/1/4 | 10 |
| u_001 | 2017/1/6 | 45 |
| u_001 | 2017/1/6 | 90 |
| u_002 | 2017/1/1 | 10 |
| u_002 | 2017/1/2 | 150 |
| u_002 | 2017/1/2 | 70 |
| u_002 | 2017/1/3 | 30 |
| u_002 | 2017/1/3 | 80 |
| u_002 | 2017/1/4 | 150 |
| u_002 | 2017/1/5 | 101 |
| u_002 | 2017/1/6 | 68 |
+--------------------------+--------------------------+-----------------------------+
+------------------------+--------------------------+--------------------------+
| plant_carbon.plant_id | plant_carbon.plant_name | plant_carbon.low_carbon |
+------------------------+--------------------------+--------------------------+
| p001 | 梭梭树 | 17 |
| p002 | 沙柳 | 19 |
| p003 | 樟子树 | 146 |
| p004 | 胡杨 | 215 |
+------------------------+--------------------------+--------------------------+
– 需求一
1.蚂蚁森林植物申领统计
问题:假设2017年1月1日开始记录低碳数据(user_low_carbon),
假设2017年10月1日之前满足申领条件的用户都申领了一颗p004-胡杨,
剩余的能量全部用来领取“p002-沙柳” 。
统计在10月1日之前累计申领“p002-沙柳” 排名前10的用户信息;
以及他比后一名多领了几颗沙柳
方式一:
-- 1. 处理日期格式 2017/1/1 == > 2017-01-01
SELECT
user_id ,
DATE_FORMAT(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd") data_dt ,
low_carbon
FROM
user_low_carbon ==>t1
-- 2. 求出2017年10月1日之前,每个用户的总能量
SELECT
t1.user_id,
SUM(t1.low_carbon) sum_low_carbon
FROM
t1
WHERE t1.data_dt < '2017-10-01'
GROUP BY t1.user_id
ORDER BY sum_low_carbon DESC
LIMIT 11 ==>t2
-- 组合
SELECT
t1.user_id,
SUM(t1.low_carbon) sum_low_carbon
FROM
(SELECT
user_id ,
DATE_FORMAT(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd") data_dt ,
low_carbon
FROM
user_low_carbon)t1
WHERE t1.data_dt < '2017-10-01'
GROUP BY t1.user_id
ORDER BY sum_low_carbon DESC
LIMIT 11
+-------------+-----------------+
| t1.user_id | sum_low_carbon |
+-------------+-----------------+
| u_007 | 1470 |
| u_013 | 1430 |
| u_008 | 1240 |
| u_005 | 1100 |
| u_010 | 1080 |
| u_014 | 1060 |
| u_011 | 960 |
| u_009 | 930 |
| u_006 | 830 |
| u_002 | 659 |
| u_004 | 640 |
+-------------+-----------------+
-- 3. 求出领取胡杨和沙柳需要的能量
SELECT low_carbon FROM plant_carbon WHERE plant_id = 'p004' ==> t3
SELECT low_carbon FROM plant_carbon WHERE plant_id = 'p002' ==> t4
-- 4. 求出每个人领取的沙柳的数量
SELECT
t2.user_id ,
t2.sum_low_carbon,
FLOOR((t2.sum_low_carbon - t3.low_carbon) / t4.low_carbon) sl
FROM
t2 ,
t3 ,
t4 ==>t5
-- 组合
SELECT
t2.user_id ,
t2.sum_low_carbon,
FLOOR((t2.sum_low_carbon - t3.low_carbon) / t4.low_carbon) sl
FROM
(SELECT
t1.user_id,
SUM(t1.low_carbon) sum_low_carbon
FROM
(SELECT
user_id ,
DATE_FORMAT(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd") data_dt ,
low_carbon
FROM
user_low_carbon)t1
WHERE t1.data_dt < '2017-10-01'
GROUP BY t1.user_id
ORDER BY sum_low_carbon DESC
LIMIT 11)t2 ,
(SELECT low_carbon FROM plant_carbon WHERE plant_id = 'p004')t3 ,
(SELECT low_carbon FROM plant_carbon WHERE plant_id = 'p002')t4
+-------------+--------------------+-----+
| t2.user_id | t2.sum_low_carbon | sl |
+-------------+--------------------+-----+
| u_007 | 1470 | 66 |
| u_013 | 1430 | 63 |
| u_008 | 1240 | 53 |
| u_005 | 1100 | 46 |
| u_010 | 1080 | 45 |
| u_014 | 1060 | 44 |
| u_011 | 960 | 39 |
| u_009 | 930 | 37 |
| u_006 | 830 | 32 |
| u_002 | 659 | 23 |
| u_004 | 640 | 22 |
+-------------+--------------------+-----+
-- 5. 将下一行的数据提取到当前行
SELECT
t5.user_id ,
t5.sum_low_carbon,
t5.sl,
lead(t5.sl,1) over(ORDER BY t5.sl DESC ) next_sl
FROM
t5
LIMIT 10 ==>t6
-- 组合
SELECT
t5.user_id ,
t5.sum_low_carbon,
t5.sl,
lead(t5.sl,1) over(ORDER BY t5.sl DESC ) next_sl
FROM
(SELECT
t2.user_id ,
t2.sum_low_carbon,
FLOOR((t2.sum_low_carbon - t3.low_carbon) / t4.low_carbon) sl
FROM
(SELECT
t1.user_id,
SUM(t1.low_carbon) sum_low_carbon
FROM
(SELECT
user_id ,
DATE_FORMAT(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd") data_dt ,
low_carbon
FROM
user_low_carbon)t1
WHERE t1.data_dt < '2017-10-01'
GROUP BY t1.user_id
ORDER BY sum_low_carbon DESC
LIMIT 11)t2 ,
(SELECT low_carbon FROM plant_carbon WHERE plant_id = 'p004')t3 ,
(SELECT low_carbon FROM plant_carbon WHERE plant_id = 'p002')t4 )t5
LIMIT 10
+-------------+--------------------+--------+----------+
| t5.user_id | t5.sum_low_carbon | t5.sl | next_sl |
+-------------+--------------------+--------+----------+
| u_007 | 1470 | 66 | 63 |
| u_013 | 1430 | 63 | 53 |
| u_008 | 1240 | 53 | 46 |
| u_005 | 1100 | 46 | 45 |
| u_010 | 1080 | 45 | 44 |
| u_014 | 1060 | 44 | 39 |
| u_011 | 960 | 39 | 37 |
| u_009 | 930 | 37 | 32 |
| u_006 | 830 | 32 | 23 |
| u_002 | 659 | 23 | 22 |
+-------------+--------------------+--------+----------+
-- 计算比下个人多领取多少颗沙柳
SELECT
t6.user_id ,
t6.sum_low_carbon,
t6.sl,
t6.next_sl,
t6.sl-t6.next_sl sl_diff
FROM t6
-- 最终组合 :
SELECT
t6.user_id ,
t6.sum_low_carbon,
t6.sl,
t6.next_sl,
t6.sl-t6.next_sl sl_diff
FROM
(SELECT
t5.user_id ,
t5.sum_low_carbon,
t5.sl,
lead(t5.sl,1) over(ORDER BY t5.sl DESC ) next_sl
FROM
(SELECT
t2.user_id ,
t2.sum_low_carbon,
FLOOR((t2.sum_low_carbon - t3.low_carbon) / t4.low_carbon) sl
FROM
(SELECT
t1.user_id,
SUM(t1.low_carbon) sum_low_carbon
FROM
(SELECT
user_id ,
DATE_FORMAT(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd") data_dt ,
low_carbon
FROM
user_low_carbon)t1
WHERE t1.data_dt < '2017-10-01'
GROUP BY t1.user_id
ORDER BY sum_low_carbon DESC
LIMIT 11)t2 ,
(SELECT low_carbon FROM plant_carbon WHERE plant_id = 'p004')t3 ,
(SELECT low_carbon FROM plant_carbon WHERE plant_id = 'p002')t4 )t5
LIMIT 10)t6
+-------------+--------------------+--------+-------------+----------+
| t6.user_id | t6.sum_low_carbon | t6.sl | t6.next_sl | sl_diff |
+-------------+--------------------+--------+-------------+----------+
| u_007 | 1470 | 66 | 63 | 3 |
| u_013 | 1430 | 63 | 53 | 10 |
| u_008 | 1240 | 53 | 46 | 7 |
| u_005 | 1100 | 46 | 45 | 1 |
| u_010 | 1080 | 45 | 44 | 1 |
| u_014 | 1060 | 44 | 39 | 5 |
| u_011 | 960 | 39 | 37 | 2 |
| u_009 | 930 | 37 | 32 | 5 |
| u_006 | 830 | 32 | 23 | 9 |
| u_002 | 659 | 23 | 22 | 1 |
+-------------+--------------------+--------+-------------+----------+
方式二:
select user_id,sum(low_carbon)
from user_low_carbon
group by user_id;
+----------+-------+
| user_id | _c1 |
+----------+-------+
| u_001 | 475 |
| u_002 | 659 |
| u_003 | 620 |
| u_004 | 640 |
| u_005 | 1100 |
| u_006 | 830 |
| u_007 | 1470 |
| u_008 | 1240 |
| u_009 | 930 |
| u_010 | 1080 |
| u_011 | 960 |
| u_012 | 250 |
| u_013 | 1430 |
| u_014 | 1060 |
| u_015 | 290 |
+----------+-------+
select low_carbon p04
from plant_carbon
where plant_id='p004'
select user_id,
sum(low_carbon)-(select low_carbon from plant_carbon where plant_id='p004') s_l
from user_low_carbon
group by user_id
+----------+-------+
| user_id | s_l |
+----------+-------+
| u_001 | 260 |
| u_015 | 75 |
| u_014 | 845 |
| u_013 | 1215 |
| u_012 | 35 |
| u_011 | 745 |
| u_010 | 865 |
| u_009 | 715 |
| u_008 | 1025 |
| u_007 | 1255 |
| u_006 | 615 |
| u_005 | 885 |
| u_004 | 425 |
| u_003 | 405 |
| u_002 | 444 |
+----------+-------+
--1
select distinct user_id,
floor((sum(low_carbon) over(partition by user_id )-(select low_carbon from plant_carbon where plant_id='p004'))/
(select low_carbon p02 from plant_carbon where plant_id='p002' ) )plant_count
from user_low_carbon
order by plant_count desc
--2
select user_id,floor((t-(select low_carbon from plant_carbon where plant_id='p004'))/
(select low_carbon p02 from plant_carbon where plant_id='p002' ) )plant_count
from(
select distinct user_id,
sum(low_carbon) over(partition by user_id ) t
from user_low_carbon
) tmp_1
order by plant_count desc
--3
select user_id,
floor((sum(low_carbon)-(select low_carbon from plant_carbon where plant_id='p004'))/
(select low_carbon p02 from plant_carbon where plant_id='p002' )) plant_count
from user_low_carbon
group by user_id
order by plant_count desc
+----------+--------------+
| user_id | plant_count |
+----------+--------------+
| u_007 | 66 |
| u_013 | 63 |
| u_008 | 53 |
| u_005 | 46 |
| u_010 | 45 |
| u_014 | 44 |
| u_011 | 39 |
| u_009 | 37 |
| u_006 | 32 |
| u_002 | 23 |
| u_004 | 22 |
| u_003 | 21 |
| u_001 | 13 |
| u_015 | 3 |
| u_012 | 1 |
+----------+--------------+
select user_id,plant_count,
plant_count-(lead(plant_count,1) over(order by plant_count desc)) less_count
from(
select distinct user_id,
floor((sum(low_carbon) over(partition by user_id )-(select low_carbon from plant_carbon where plant_id='p004'))/
(select low_carbon p02 from plant_carbon where plant_id='p002' ) )plant_count
from user_low_carbon
) tmp
+----------+--------------+-------------+
| user_id | plant_count | less_count |
+----------+--------------+-------------+
| u_007 | 66 | 3 |
| u_013 | 63 | 10 |
| u_008 | 53 | 7 |
| u_005 | 46 | 1 |
| u_010 | 45 | 1 |
| u_014 | 44 | 5 |
| u_011 | 39 | 2 |
| u_009 | 37 | 5 |
| u_006 | 32 | 9 |
| u_002 | 23 | 1 |
| u_004 | 22 | 1 |
| u_003 | 21 | 8 |
| u_001 | 13 | 10 |
| u_015 | 3 | 2 |
| u_012 | 1 | NULL |
+----------+--------------+-------------+
select user_id,plant_count,
plant_count-(lead(plant_count,1) over(order by plant_count desc)) less_count
from(
select distinct user_id,
floor((sum(low_carbon) over(partition by user_id )-(select low_carbon from plant_carbon where plant_id='p004'))/
(select low_carbon p02 from plant_carbon where plant_id='p002' ) )plant_count
from user_low_carbon
) tmp
limit 10;
+----------+--------------+-------------+
| user_id | plant_count | less_count |
+----------+--------------+-------------+
| u_007 | 66 | 3 |
| u_013 | 63 | 10 |
| u_008 | 53 | 7 |
| u_005 | 46 | 1 |
| u_010 | 45 | 1 |
| u_014 | 44 | 5 |
| u_011 | 39 | 2 |
| u_009 | 37 | 5 |
| u_006 | 32 | 9 |
| u_002 | 23 | 1 |
+----------+--------------+-------------+
10 rows selected (13.457 seconds)
整理:
select user_id,date_format(regexp_replace(data_dt,'/','-'),"yyyy-MM-dd") dt,low_carbon
from user_low_carbon
select user_id,sum(low_carbon) low_carbon
from (
select user_id,date_format(regexp_replace(data_dt,'/','-'),"yyyy-MM-dd") dt,low_carbon
from user_low_carbon
)t1
where dt <'2017-10-01'
group by user_id
+----------+-------------+
| user_id | low_carbon |
+----------+-------------+
| u_001 | 475 |
| u_002 | 659 |
| u_003 | 620 |
| u_004 | 640 |
| u_005 | 1100 |
| u_006 | 830 |
| u_007 | 1470 |
| u_008 | 1240 |
| u_009 | 930 |
| u_010 | 1080 |
| u_011 | 960 |
| u_012 | 250 |
| u_013 | 1430 |
| u_014 | 1060 |
| u_015 | 290 |
+----------+-------------+
select user_id,
floor((sum(low_carbon)-(select low_carbon from plant_carbon where plant_id='p004'))/
(select low_carbon from plant_carbon where plant_id='p002')) sl
from (
select user_id,date_format(regexp_replace(data_dt,'/','-'),"yyyy-MM-dd") dt,low_carbon
from user_low_carbon
)t1
where dt <'2017-10-01'
group by user_id
order by sl desc
limit 11
+----------+-----+
| user_id | sl |
+----------+-----+
| u_007 | 66 |
| u_013 | 63 |
| u_008 | 53 |
| u_005 | 46 |
| u_010 | 45 |
| u_014 | 44 |
| u_011 | 39 |
| u_009 | 37 |
| u_006 | 32 |
| u_002 | 23 |
| u_004 | 22 |
+----------+-----+
select user_id,sl,
(sl-lead(sl,1) over(order by sl desc))gap
from(
select user_id,
floor((sum(low_carbon)-(select low_carbon from plant_carbon where plant_id='p004'))/
(select low_carbon from plant_carbon where plant_id='p002')) sl
from (
select user_id,date_format(regexp_replace(data_dt,'/','-'),"yyyy-MM-dd") dt,low_carbon
from user_low_carbon
)t1
where dt <'2017-10-01'
group by user_id
order by sl desc
limit 11
)t2
limit 10
+----------+-----+------+
| user_id | sl | gap |
+----------+-----+------+
| u_007 | 66 | 3 |
| u_013 | 63 | 10 |
| u_008 | 53 | 7 |
| u_005 | 46 | 1 |
| u_010 | 45 | 1 |
| u_014 | 44 | 5 |
| u_011 | 39 | 2 |
| u_009 | 37 | 5 |
| u_006 | 32 | 9 |
| u_002 | 23 | 1 |
+----------+-----+------+
10 rows selected (14.748 seconds)
– 需求二
2、蚂蚁森林低碳用户排名分析
问题:查询user_low_carbon表中每日流水记录,条件为:
用户在2017年,连续三天(或以上)的天数里,
每天减少碳排放(low_carbon)都超过100g的用户低碳流水。
需要查询返回满足以上条件的user_low_carbon表中的记录流水。
select user_id,regexp_replace(data_dt,"/","-") data_dt,sum(low_carbon) s
from user_low_carbon
group by user_id,data_dt
having s>=100
+----------+-----------+------+
| user_id | data_dt | s |
+----------+-----------+------+
| u_001 | 2017-1-2 | 270 |
| u_001 | 2017-1-6 | 135 |
| u_002 | 2017-1-2 | 220 |
| u_002 | 2017-1-3 | 110 |
| u_002 | 2017-1-4 | 150 |
| u_002 | 2017-1-5 | 101 |
| u_003 | 2017-1-2 | 160 |
| u_003 | 2017-1-3 | 160 |
| u_003 | 2017-1-5 | 120 |
| u_003 | 2017-1-7 | 120 |
| u_004 | 2017-1-1 | 110 |
| u_004 | 2017-1-3 | 120 |
| u_004 | 2017-1-6 | 120 |
| u_004 | 2017-1-7 | 130 |
| u_005 | 2017-1-2 | 130 |
| u_005 | 2017-1-3 | 180 |
| u_005 | 2017-1-4 | 190 |
| u_005 | 2017-1-6 | 280 |
| u_005 | 2017-1-7 | 160 |
| u_006 | 2017-1-2 | 180 |
| u_006 | 2017-1-3 | 220 |
| u_006 | 2017-1-7 | 290 |
| u_007 | 2017-1-1 | 130 |
| u_007 | 2017-1-2 | 360 |
| u_007 | 2017-1-4 | 530 |
| u_007 | 2017-1-6 | 230 |
| u_007 | 2017-1-7 | 160 |
| u_008 | 2017-1-1 | 160 |
| u_008 | 2017-1-2 | 120 |
| u_008 | 2017-1-4 | 260 |
| u_008 | 2017-1-5 | 360 |
| u_008 | 2017-1-6 | 160 |
| u_008 | 2017-1-7 | 120 |
| u_009 | 2017-1-2 | 140 |
| u_009 | 2017-1-3 | 170 |
| u_009 | 2017-1-4 | 270 |
| u_009 | 2017-1-7 | 140 |
| u_010 | 2017-1-2 | 180 |
| u_010 | 2017-1-4 | 170 |
| u_010 | 2017-1-5 | 180 |
| u_010 | 2017-1-6 | 190 |
| u_010 | 2017-1-7 | 180 |
| u_011 | 2017-1-1 | 110 |
| u_011 | 2017-1-2 | 200 |
| u_011 | 2017-1-3 | 120 |
| u_011 | 2017-1-4 | 100 |
| u_011 | 2017-1-5 | 100 |
| u_011 | 2017-1-6 | 100 |
| u_011 | 2017-1-7 | 230 |
| u_012 | 2017-1-2 | 130 |
| u_013 | 2017-1-2 | 200 |
| u_013 | 2017-1-3 | 150 |
| u_013 | 2017-1-4 | 550 |
| u_013 | 2017-1-5 | 350 |
| u_014 | 2017-1-1 | 220 |
| u_014 | 2017-1-2 | 140 |
| u_014 | 2017-1-5 | 250 |
| u_014 | 2017-1-6 | 120 |
| u_014 | 2017-1-7 | 290 |
| u_015 | 2017-1-7 | 140 |
+----------+-----------+------+
select user_id,data_dt, c
from(
select user_id,data_dt,
count(s) over (partition by user_id ) c
from (
select user_id,regexp_replace(data_dt,"/","-") data_dt,sum(low_carbon) s
from user_low_carbon
group by user_id,data_dt
having s>=100
) tmp_1
)tmp_2
where c>=3
+----------+-----------+----+
| user_id | data_dt | c |
+----------+-----------+----+
| u_002 | 2017-1-2 | 4 |
| u_002 | 2017-1-3 | 4 |
| u_002 | 2017-1-4 | 4 |
| u_002 | 2017-1-5 | 4 |
| u_003 | 2017-1-2 | 4 |
| u_003 | 2017-1-3 | 4 |
| u_003 | 2017-1-5 | 4 |
| u_003 | 2017-1-7 | 4 |
| u_004 | 2017-1-1 | 4 |
| u_004 | 2017-1-3 | 4 |
| u_004 | 2017-1-6 | 4 |
| u_004 | 2017-1-7 | 4 |
| u_005 | 2017-1-2 | 5 |
| u_005 | 2017-1-3 | 5 |
| u_005 | 2017-1-4 | 5 |
| u_005 | 2017-1-6 | 5 |
| u_005 | 2017-1-7 | 5 |
| u_006 | 2017-1-2 | 3 |
| u_006 | 2017-1-3 | 3 |
| u_006 | 2017-1-7 | 3 |
| u_007 | 2017-1-1 | 5 |
| u_007 | 2017-1-2 | 5 |
| u_007 | 2017-1-4 | 5 |
| u_007 | 2017-1-6 | 5 |
| u_007 | 2017-1-7 | 5 |
| u_008 | 2017-1-1 | 6 |
| u_008 | 2017-1-2 | 6 |
| u_008 | 2017-1-4 | 6 |
| u_008 | 2017-1-5 | 6 |
| u_008 | 2017-1-6 | 6 |
| u_008 | 2017-1-7 | 6 |
| u_009 | 2017-1-2 | 4 |
| u_009 | 2017-1-3 | 4 |
| u_009 | 2017-1-4 | 4 |
| u_009 | 2017-1-7 | 4 |
| u_010 | 2017-1-2 | 5 |
| u_010 | 2017-1-4 | 5 |
| u_010 | 2017-1-5 | 5 |
| u_010 | 2017-1-6 | 5 |
| u_010 | 2017-1-7 | 5 |
| u_011 | 2017-1-1 | 7 |
| u_011 | 2017-1-2 | 7 |
| u_011 | 2017-1-3 | 7 |
| u_011 | 2017-1-4 | 7 |
| u_011 | 2017-1-5 | 7 |
| u_011 | 2017-1-6 | 7 |
| u_011 | 2017-1-7 | 7 |
| u_013 | 2017-1-2 | 4 |
| u_013 | 2017-1-3 | 4 |
| u_013 | 2017-1-4 | 4 |
| u_013 | 2017-1-5 | 4 |
| u_014 | 2017-1-1 | 5 |
| u_014 | 2017-1-2 | 5 |
| u_014 | 2017-1-5 | 5 |
| u_014 | 2017-1-6 | 5 |
| u_014 | 2017-1-7 | 5 |
+----------+-----------+----+
select user_id,data_dt, c,
lead(data_dt,3) over(partition by user_id order by data_dt) ld
from(
select user_id,data_dt,
count(s) over (partition by user_id ) c
from (
select user_id,regexp_replace(data_dt,"/","-") data_dt,sum(low_carbon) s
from user_low_carbon
group by user_id,data_dt
having s>=100
) tmp_1
)tmp_2
where c>=3
+----------+-----------+----+-----------+
| user_id | data_dt | c | ld |
+----------+-----------+----+-----------+
| u_002 | 2017-1-2 | 4 | 2017-1-5 |
| u_002 | 2017-1-3 | 4 | NULL |
| u_002 | 2017-1-4 | 4 | NULL |
| u_002 | 2017-1-5 | 4 | NULL |
| u_003 | 2017-1-2 | 4 | 2017-1-7 |
| u_003 | 2017-1-3 | 4 | NULL |
| u_003 | 2017-1-5 | 4 | NULL |
| u_003 | 2017-1-7 | 4 | NULL |
| u_004 | 2017-1-1 | 4 | 2017-1-7 |
| u_004 | 2017-1-3 | 4 | NULL |
| u_004 | 2017-1-6 | 4 | NULL |
| u_004 | 2017-1-7 | 4 | NULL |
| u_005 | 2017-1-2 | 5 | 2017-1-6 |
| u_005 | 2017-1-3 | 5 | 2017-1-7 |
| u_005 | 2017-1-4 | 5 | NULL |
| u_005 | 2017-1-6 | 5 | NULL |
| u_005 | 2017-1-7 | 5 | NULL |
| u_006 | 2017-1-2 | 3 | NULL |
| u_006 | 2017-1-3 | 3 | NULL |
| u_006 | 2017-1-7 | 3 | NULL |
| u_007 | 2017-1-1 | 5 | 2017-1-6 |
| u_007 | 2017-1-2 | 5 | 2017-1-7 |
| u_007 | 2017-1-4 | 5 | NULL |
| u_007 | 2017-1-6 | 5 | NULL |
| u_007 | 2017-1-7 | 5 | NULL |
| u_008 | 2017-1-1 | 6 | 2017-1-5 |
| u_008 | 2017-1-2 | 6 | 2017-1-6 |
| u_008 | 2017-1-4 | 6 | 2017-1-7 |
| u_008 | 2017-1-5 | 6 | NULL |
| u_008 | 2017-1-6 | 6 | NULL |
| u_008 | 2017-1-7 | 6 | NULL |
| u_009 | 2017-1-2 | 4 | 2017-1-7 |
| u_009 | 2017-1-3 | 4 | NULL |
| u_009 | 2017-1-4 | 4 | NULL |
| u_009 | 2017-1-7 | 4 | NULL |
| u_010 | 2017-1-2 | 5 | 2017-1-6 |
| u_010 | 2017-1-4 | 5 | 2017-1-7 |
| u_010 | 2017-1-5 | 5 | NULL |
| u_010 | 2017-1-6 | 5 | NULL |
| u_010 | 2017-1-7 | 5 | NULL |
| u_011 | 2017-1-1 | 7 | 2017-1-4 |
| u_011 | 2017-1-2 | 7 | 2017-1-5 |
| u_011 | 2017-1-3 | 7 | 2017-1-6 |
| u_011 | 2017-1-4 | 7 | 2017-1-7 |
| u_011 | 2017-1-5 | 7 | NULL |
| u_011 | 2017-1-6 | 7 | NULL |
| u_011 | 2017-1-7 | 7 | NULL |
| u_013 | 2017-1-2 | 4 | 2017-1-5 |
| u_013 | 2017-1-3 | 4 | NULL |
| u_013 | 2017-1-4 | 4 | NULL |
| u_013 | 2017-1-5 | 4 | NULL |
| u_014 | 2017-1-1 | 5 | 2017-1-6 |
| u_014 | 2017-1-2 | 5 | 2017-1-7 |
| u_014 | 2017-1-5 | 5 | NULL |
| u_014 | 2017-1-6 | 5 | NULL |
| u_014 | 2017-1-7 | 5 | NULL |
+----------+-----------+----+-----------+
select distinct user_id
from(
select user_id,data_dt, c,
datediff(lead(data_dt,3) over(partition by user_id order by data_dt) ,data_dt) gap
from(
select user_id,data_dt,
count(s) over (partition by user_id ) c
from (
select user_id,regexp_replace(data_dt,"/","-") data_dt,sum(low_carbon) s
from user_low_carbon
group by user_id,data_dt
having s>=100
) tmp_1
)tmp_2
where c>=3
)tmp_3
where gap=3
+----------+
| user_id |
+----------+
| u_002 |
| u_008 |
| u_010 |
| u_011 |
| u_013 |
+----------+
select user_id,data_dt,low_carbon
from user_low_carbon
where user_id in
(
select distinct user_id
from(
select user_id,data_dt, c,
datediff(lead(data_dt,3) over(partition by user_id order by data_dt) ,data_dt) gap
from(
select user_id,data_dt,
count(s) over (partition by user_id ) c
from (
select user_id,regexp_replace(data_dt,"/","-") data_dt,sum(low_carbon) s
from user_low_carbon
group by user_id,data_dt
having s>=100
) tmp_1
)tmp_2
where c>=3
)tmp_3
where gap=3
)
order by user_id,data_dt;
+----------+-----------+-------------+
| u_002 | 2017/1/1 | 10 |
| u_002 | 2017/1/2 | 70 |
| u_002 | 2017/1/2 | 150 |
| u_002 | 2017/1/3 | 80 |
| u_002 | 2017/1/3 | 30 |
| u_002 | 2017/1/4 | 150 |
| u_002 | 2017/1/5 | 101 |
| u_002 | 2017/1/6 | 68 |
| u_008 | 2017/1/1 | 160 |
| u_008 | 2017/1/2 | 60 |
| u_008 | 2017/1/2 | 60 |
| u_008 | 2017/1/3 | 60 |
| u_008 | 2017/1/4 | 260 |
| u_008 | 2017/1/5 | 360 |
| u_008 | 2017/1/6 | 160 |
| u_008 | 2017/1/7 | 60 |
| u_008 | 2017/1/7 | 60 |
| u_010 | 2017/1/1 | 90 |
| u_010 | 2017/1/2 | 90 |
| u_010 | 2017/1/2 | 90 |
| u_010 | 2017/1/3 | 90 |
| u_010 | 2017/1/4 | 90 |
| u_010 | 2017/1/4 | 80 |
| u_010 | 2017/1/5 | 90 |
| u_010 | 2017/1/5 | 90 |
| u_010 | 2017/1/6 | 190 |
| u_010 | 2017/1/7 | 90 |
| u_010 | 2017/1/7 | 90 |
| u_011 | 2017/1/1 | 110 |
| u_011 | 2017/1/2 | 100 |
| u_011 | 2017/1/2 | 100 |
| u_011 | 2017/1/3 | 120 |
| u_011 | 2017/1/4 | 100 |
| u_011 | 2017/1/5 | 100 |
| u_011 | 2017/1/6 | 100 |
| u_011 | 2017/1/7 | 100 |
| u_011 | 2017/1/7 | 130 |
| u_013 | 2017/1/1 | 50 |
| u_013 | 2017/1/2 | 50 |
| u_013 | 2017/1/2 | 150 |
| u_013 | 2017/1/3 | 150 |
| u_013 | 2017/1/4 | 550 |
| u_013 | 2017/1/5 | 350 |
| u_013 | 2017/1/6 | 50 |
| u_013 | 2017/1/7 | 20 |
| u_013 | 2017/1/7 | 60 |
+----------+-----------+-------------+