手写HQL第一题
表名:score; 表结构:uid,subject_id,score
求:找出所有科目成绩都大于某一学科平均成绩的学生数据集如下
1001 | 01 | 90 |
1001 | 02 | 90 |
1001 | 03 | 90 |
1002 | 01 | 85 |
1002 | 02 | 85 |
1002 | 03 | 70 |
1003 | 01 | 70 |
1003 | 02 | 70 |
1003 | 03 | 85 |
数据
insert into score values('1001','01',90);
insert into score values('1001','02',90);
insert into score values('1001','03',90);
insert into score values('1002','01',85);
insert into score values('1002','02',85);
insert into score values('1002','03',70);
insert into score values('1003','01',70);
insert into score values('1003','02',70);
insert into score values('1003','03',85);
1) 建表语句
create table score (
uid string,
subject_id string,
score int)
row format delimited fields terminated by '\t';
2) 求出每个学科的平均成绩
select u_id, score, avg(score) over(partition by subject_id) avg_score
from score; t1
3) 根据是否大于平均成绩记录flag,大于则记为0否则记为1
select uid, if (score>avg_score,0,1) flag
from t1; t2
4) 根据学生id分组统计flag的和,和为0则是所有学科都大于平均成绩
select uid
from t2
group by uid
having sum(flag) = 0;
5) 最终SQL
select uid
from
(
select uid, if (score>avg_score,0,1) flag
from
(
select uid, score, avg(score) over(partition by subject_id) avg_score
from score
) t1
) t2
group by uid
having sum(flag) = 0;
手写HQL第二题
我们有如下的用户访问数据
userId |
visitDate |
visitCount |
u01 |
2017/1/21 |
5 |
u02 |
2017/1/23 |
6 |
u03 |
2017/1/22 |
8 |
u04 |
2017/1/20 |
3 |
u01 |
2017/1/23 |
6 |
u01 |
2017/2/21 |
8 |
u02 |
2017/1/23 |
6 |
u01 |
2017/2/22 |
4 |
要求使用SQL统计出每个用户的累积访问次数,如下表所示:
用户id |
月份 |
小计 |
累积 |
u01 |
2017-01 |
11 |
11 |
u01 |
2017-02 |
12 |
23 |
u02 |
2017-01 |
12 |
12 |
u03 |
2017-01 |
8 |
8 |
u04 |
2017-01 |
3 |
3 |
数据
insert into action values('u01','2017/1/21',5);
insert into action values('u02','2017/1/23',6);
insert into action values('u03','2017/1/22',8);
insert into action values('u04','2017/1/20',3);
insert into action values('u01','2017/1/23',6);
insert into action values('u01','2017/2/21',8);
insert into action values('u02','2017/1/23',6);
insert into action values('u01','2017/2/22',4);
1) 建表语句
create table action (
userId string,
visitDate string,
visitCount int)
row format delimited fields terminated by '\t';
2) 修改数据格式
select userId, date_format(regexp_replace(visitDate, '/', '-'), 'yyyy-MM') mn, visitCount
from action; t1
3) 计算每人单月访问量
select userId, mn, sum(visitCount) mn_count
from t1
group by userId, mn; t2
4) 按月累计访问量
select userId, mn, mn_count, sum(mn_count) over(partition by userId order by mn)
from t2;
5) 最终SQL
select userId, mn, mn_count, sum(mn_count) over(partition by userId order by mn)
from
(
select userId, mn, sum(visitCount) mn_count
from
(
select userId, date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn, visitCount
from action
) t1
group by userId, mn
) t2;
手写HQL第三题
有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计:
1)每个店铺的UV(访客数)
2)每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
数据
insert into visit values(3,'女装');
insert into visit values(4,'女装');
insert into visit values(5,'女装');
insert into visit values(6,'女装');
insert into visit values(7,'女装');
insert into visit values(8,'女装');
insert into visit values(8,'女装');
insert into visit values(9,'女装');
insert into visit values(10,'女装');
insert into visit values(11,'女装');
insert into visit values(1,'男装');
insert into visit values(1,'男装');
insert into visit values(1,'男装');
insert into visit values(1,'男装');
insert into visit values(2,'男装');
insert into visit values(3,'男装');
insert into visit values(4,'男装');
1) 建表语句
create table visit (
user_id string,
shop string)
row format delimited fields terminated by '\t';
2) 每个店铺的UV(访客数)
select shop, count(distinct user_id)
from visit
group by shop;
3) 每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
一个访客可以多次访问一个店铺
3.1) 查询每个店铺被每个用户访问的次数
select shop, user_id, count(*) as cnt
from visit
group by shop, user_id; t1
3.2) 计算每个店铺被用户访问的排名
select shop, user_id, cnt, rank() over (partition by shop order by cnt desc) rk
from t1; t2
3.3) 取每个店铺前三名
select shop, user_id, cnt
from
(
select shop, user_id, cnt, rank() over (partition by shop order by cnt desc) rk
from
(
select shop, user_id, count(*) cnt
from visit
group by shop, user_id
) t1
) t2
where rk <= 3;
手写HQL第四题
已知一个表STG.ORDER(order_tab),有如下字段: dt,order_id,user_id,amount。请给出sql进行统计:数据样例:2017-01-01,10029028,1000003251,33.57。
1)给出 2017年每个月的订单数、用户数、总成交金额。
2)给出2017年11月的新客数(指在11月才有第一笔订单)
数据
insert into order_tab values('2017-01-01', '10029028', '1000003251', 33.57);
1) 建表语句
create table order_tab (
dt string,
order_id string,
user_id string,
amount decimal(10,2))
row format delimited fields terminated by '\t';
2) 给出 2017年每个月的订单数、用户数、总成交金额。
订单号不会重复
select date_format(dt,'yyyy-MM'), count(order_id), count(distinct user_id), sum(amount)
from order_tab
where date_format(dt,'yyyy') = '2017'
group by date_format(dt,'yyyy-MM');
3) 给出2017年11月的新客数(指在11月才有第一笔订单)
select count(user_id)
from order_tab
group by user_id
having date_format(min(dt), 'yyyy-MM') = '2017-11'
手写HQL第五题
有一个5000万的用户文件(user_id,name,age) user_file,一个2亿记录的用户看电影的记录文件(user_id,url) film file,根据年龄段(每隔10岁算一个年龄段)观看电影的次数进行排序?
数据准备
INSERT INTO TABLE user_file VALUES ('001', 'u1', 10);
INSERT INTO TAB