1.sql:Hive实现按照指定格式输出每七天的消费平均数
输出格式:
2018-06-01~2018-06-07 12.29
2018-06-08~2018-06-14 12.29
...
2018-08-10~2018-08-16 80.67
数据如下:
2018/6/1,10
2018/6/2,11
2018/6/3,11
2018/6/4,12
2018/6/5,14
2018/6/6,15
2018/6/7,13
2018/6/8,37
2018/6/9,18
2018/6/10,19
2018/6/11,10
2018/6/12,11
2018/6/13,11
2018/6/14,12
2018/6/15,14
2018/6/16,15
2018/6/17,13
2018/6/18,17
2018/6/19,18
2018/6/20,19
2018/6/21,20
2018/6/22,21
2018/6/23,21
2018/6/24,22
2018/6/25,24
2018/6/26,25
2018/6/27,23
2018/6/28,27
2018/6/29,28
2018/6/30,29
2018/7/1,40
2018/7/2,41
2018/7/3,41
2018/7/4,42
2018/7/5,44
2018/7/6,45
2018/7/7,43
2018/7/8,47
2018/7/9,48
2018/7/10,49
2018/7/11,50
2018/7/12,51
2018/7/13,51
2018/7/14,52
2018/7/15,54
2018/7/16,55
2018/7/17,53
2018/7/18,57
2018/7/19,58
2018/7/20,59
2018/7/21,30
2018/7/22,31
2018/7/23,31
2018/7/24,32
2018/7/25,34
2018/7/26,35
2018/7/27,33
2018/7/28,37
2018/7/29,38
2018/7/30,39
2018/7/31,70
2018/8/1,71
2018/8/2,71
2018/8/3,72
2018/8/4,74
2018/8/5,75
2018/8/6,73
2018/8/7,77
2018/8/8,78
2018/8/9,79
2018/8/10,80
2018/8/11,81
2018/8/12,81
建表语句:
create table user_cost
(
date_time string,
cost string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
解决思路:
1、将日期数据以指定的格式拼接,拼接之前需要进行切分
select concat_ws('-',split(date_time,'/')),cost from user_cost;
2、我们主要的实现思路是如何将7天分成一组,只要7天分成了一组,就可以根据组进行group by,首先我们可以将每个日期与2018-06-01号相减,得到天数。
select concat_ws('-',split(date_time,'/')),datediff(concat_ws('-',split(date_time,'/')),'2018-6-1'),cost from user_
cost;
3、将天数与7做除法运算,但是hive sql中除法运算是有小数部分的,所以要下取整
select concat_ws('-',split(date_time,'/')),floor(datediff(concat_ws('-',split(date_time,'/')),'2018-6-1')/7),cost f
rom user_cost;
4、现在如何将7天为1组的问题已经解决了,新的问题是如何将前面的日期格式变成我们想要的开始和结束格式?
解决方案:拿2018-6-1与每次除出来的整数*7相加,得到每组的开始时间,结束时间是开始时间+6,然后使用字符串拼接的方法将开始时间与结束时间进行拼接,得到分组日期
select concat(date_add('2018-6-1',cast(floor(datediff(concat_ws('-',split(date_time,'/')),'2018-6-1')/7)*7 as int))
,'~',date_add(date_add('2018-6-1',cast(floor(datediff(concat_ws('-',split(date_time,'/')),'2018-6-1')/7)*7 as int)),6)),cost from user_cost;
5、将相同的分组日期进行分组,求平均值
select t1.date_time,avg(cost) as weekend_avg_cost from (select concat(date_add('2018-6-1',cast(floor(datediff(conca
t_ws('-',split(date_time,'/')),'2018-6-1')/7)*7 as int)),'~',date_add(date_add('2018-6-1',cast(floor(datediff(concat_ws('-',split(date_time,'/')),'2018-6-1')/7)*7 as int)),6)) as date_time,cost from user_cost) t1 group by t1.date_time;
将下列数据加载hive表。
员工信息表emp:
字段:员工id,员工名字,工作岗位,部门经理,受雇日期,薪水,奖金,部门编号
英文名:EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,BONUS,DEPTNO
create table emp(
EMPNO int
,ENAME string
,JOB string
,MGR int
,HIREDATE string
,SAL int
,COMM int
,DEPTNO int
)
row format delimited
fields terminated by ',';
7369,SMITH,CLERK,7902,1980-12-17,800,null,20
7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300,30
7521,WARD,SALESMAN,7698,1981-02-22,1250,500,30
7566,JONES,MANAGER,7839,1981-04-02,2975,null,20,
7654,MARTIN,SALESMAN,7698,1981-09-28,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-05-01,2850,null,30
7782,CLARK,MANAGER,7839,1981-06-09,2450,null,10
7788,SCOTT,ANALYST,7566,1987-04-19,3000,null,20
7839,KING,PRESIDENT,null,1981-11-17,5000,null,10
7844,TURNER,SALESMAN,7698,1981-09-08,1500,0,30
7876,ADAMS,CLERK,7788,1987-05-23,1100,null,20
7900,JAMES,CLERK,7698,1981-12-03,950,null,30
7902,FORD,ANALYST,7566,1981-12-03,3000,null,20
7934,MILLER,CLERK,7782,1982-01-23,1300,null,10
1600 1250 2850 1500 950
部门信息表dept:
字段:部门编号,部门名称,部门地点
英文名:DEPTNO,DEPTNAME,DEPTADDR
create table dept(
DEPTNO int
,DEPTNAME string
,DEPTADDR string
)
row format delimited
fields terminated by ',';
10,ACCOUNTING,NEW YORK
20,RESEARCH,DALLAS
30,SALES,CHICAGO
40,OPERATIONS,BOSTON
二:使用HQL完成下面需求:
1. 列出至少有2个员工的所有部门。
select deptno,count(1) as counts from emp group by deptno having counts>=2;
2. 列出薪金比“SMITH”多的所有员工。(sal+comm)(制造列连接)
select t1.ename from (select *,1 as conn from emp) t1 join (select sal+n
vl(comm,0) as xinjin,1 as conn from emp where ename='SMITH') t2 on (t1.conn =t2.conn) where t1.sal+nvl(t1.comm,0)>t2.xinjin;
3. 列出所有员工的姓名及其直接上级的姓名。 (使用自连接)
select t1.ename,t2.ename from emp t1 join emp t2 on (t1.mgr=t2.empno);
4. 列出受雇日期早于其直接上级的所有员工。
select t1.ename,t2.ename,t1.hiredate,t2.hiredate from emp t1 join emp t2 on (t1.mgr=t2.empno) where t1.hiredate<t2.hiredate;
5. 列出部门名称和这些部门的员工信息,同时列出那些没有员工的部门。
select t1.empno,t1.ename,t1.job,t1.mgr,t1.hiredate,t1.sal,t1.comm,t2.* from emp t1 right join dept t2 on (t1.deptno=t2.deptno);
6. 列出所有“CLERK”(办事员)的姓名及其部门名称。(自己做)
select ename,deptname from (select ename,t2.deptname,job from emp t1 join dept t2 on (t1.deptno=t2.deptno)) tt1 where tt1.job='CLERK';
7. 列出最低薪金大于1500的各种工作。
select job,min(sal+nvl(comm,0)) as min_xinjin from emp group by job having min_xinjin>1500;
8. 列出在部门“SALES”(销售部)工作的员工的姓名,假定不知道销售部的部门编号(借鉴上面第二题)
select t1.ename from (select *,1 as conn from emp) t1 join (select deptno,1 as conn from dept where deptname='SALES') t2 on (t1.conn=t2.conn) where t1.deptno=t2.deptno;
9. 列出薪金高于公司平均薪金的所有员工。
select t1.ename from (select ename,(sal+nvl(comm,0)) as xinjin,1 as conn from emp) t1 join (select avg(sal+nvl(comm,0)) as avg_xinjin,1 as conn from emp) t2 on (t1.conn=t2.conn) where xinjin>avg_xinjin;
10.列出与“SCOTT”从事相同工作的所有员工。(借鉴上面第二题)
select t1.ename from (select *,1 as conn from emp) t1 join (select job,1 as conn from emp where ename='SCOTT') t2 on (t1.conn =t2.conn) where t1.job=t2.job and t1.ename !='SCOTT';
11.列出薪金等于部门30中员工的薪金的所有员工的姓名和薪金。
select t1.ename from (select ename,sal+nvl(comm,0) as xinjin from emp where deptno !=30) t1 join (select sal+nvl(comm,0) as xinjin from emp where deptno=30) t2 on (t1.xinjin=t2.xinjin);
12.列出薪金高于在部门30工作的所有员工的薪金的员工姓名和薪金。
select t1.ename,t1.xinjin from (select *,sal+nvl(comm,0) as xinjin,1 as tmp_col from emp) t1 join (select ename,sal+nvl(comm,0) as xinjin,1 as
tmp_col from emp where deptno=30) t2 on (t1.tmp_col=t2.tmp_col) where t1.xinjin=t2.xinjin and t1.deptno<>30;
13.列出在每个部门工作的员工数量、平均工资和平均服务期限。
select deptno,count(1),avg(sal),avg(datediff(current_date(),date_format(hiredate,'yyyy-MM-dd'))) from emp group by deptno;
14.列出所有员工的姓名、部门名称和工资。
select t1.ename,t2.deptname,t1.sal from emp t1 join dept t2 on(t1.deptno=t2.deptno);
15.列出所有部门的详细信息和部门人数。
select t1.*,nvl(t2.counts,0) as counts from dept t1 left join (select deptno,count(1) as counts from emp group by deptno) t2 on (t1.deptno=t2.deptno);
16.列出各种工作的最低工资。
select job,min(sal) from emp group by job;
17.列出各个部门的MANAGER(经理)的最低薪金。
select deptno,min(sal) from emp group by deptno having job='MANAGER';
18.列出所有员工的年工资,按年薪从低到高排序。
select ename,(sal+nvl(comm,0))*12 as year_sal from emp order by year_sal desc;
19. 列出每个部门薪水前两名最高的人员名称以及薪水。(思考:用开窗函数解决)
select ename,xinshui,rank,deptno from (select ename,rank() over(partition by deptno order by t1.xinshui desc) as rank,t1.xinshui,deptno from (select ename,sal+nvl(comm,0) as xinshui,deptno from emp) t1) t2 where rank<=2;
20. 列出每个员工从受雇开始到2018-12-12 为止共受雇了多少天。
select ename,datediff('2018-12-12',hiredate) as day from emp;
连续登陆问题
在电商、物流和银行可能经常会遇到这样的需求:统计用户连续交易的总额、连续登陆天数、连续登陆开始和结束时间、间隔天数等
数据:
注意:每个用户每天可能会有多条记录
id datestr amount 1,2019-02-08,6214.23 1,2019-02-08,6247.32 1,2019-02-09,85.63 1,2019-02-09,967.36 1,2019-02-10,85.69 1,2019-02-12,769.85 1,2019-02-13,943.86 1,2019-02-14,538.42 1,2019-02-15,369.76 1,2019-02-16,369.76 1,2019-02-18,795.15 1,2019-02-19,715.65 1,2019-02-21,537.71 2,2019-02-08,6214.23 2,2019-02-08,6247.32 2,2019-02-09,85.63 2,2019-02-09,967.36 2,2019-02-10,85.69 2,2019-02-12,769.85 2,2019-02-13,943.86 2,2019-02-14,943.18 2,2019-02-15,369.76 2,2019-02-18,795.15 2,2019-02-19,715.65 2,2019-02-21,537.71 3,2019-02-08,6214.23 3,2019-02-08,6247.32 3,2019-02-09,85.63 3,2019-02-09,967.36 3,2019-02-10,85.69 3,2019-02-12,769.85 3,2019-02-13,943.86 3,2019-02-14,276.81 3,2019-02-15,369.76 3,2019-02-16,369.76 3,2019-02-18,795.15 3,2019-02-19,715.65 3,2019-02-21,537.71
建表语句
create table deal_tb( id string ,datestr string ,amount string )row format delimited fields terminated by ',';
计算逻辑
-
先按用户和日期分组求和,使每个用户每天只有一条数据
select id,datestr,sum(amount) as sum_smount from deal_tb group by id,datestr;
-
根据用户ID分组按日期排序,将日期和分组序号相减得到分组日期,如果开始日期相同说明连续登陆
select tt1.id as id,tt1.datestr as datestr,tt1.sum_amount as sum_amount,tt1.rn as rn,date_sub(tt1.datestr,rn) as gr p from (select t1.id as id,t1.datestr as datestr,t1.sum_amount as sum_amount,row_number() over(partition by t1.id order by t1.datestr) as rn from (select id,datestr,sum(amount) as sum_amount from deal_tb group by id,datestr) t1) tt1;
-
datediff(string end_date,string start_date); 等于0说明连续登录
select ttt1.id as id,round(sum(ttt1.sum_amount),2) as sum_amount,count(1) as continuous_days,min(ttt1.datestr) as s tart_time,max(ttt1.datestr) as end_time,datediff(ttt1.grp,lag(ttt1.grp,1) over(partition by ttt1.id order by ttt1.grp)) as interval_day from (select tt1.id as id,tt1.datestr as datestr,tt1.sum_amount as sum_amount,tt1.rn as rn,date_sub(tt1.datestr,rn) as grp from (select t1.id as id,t1.datestr as datestr,t1.sum_amount as sum_amount,row_number() over(partition by t1.id order by t1.datestr) as rn from (select id,datestr,sum(amount) as sum_amount from deal_tb group by id,datestr) t1) tt1) ttt1 group by ttt1.id,ttt1.grp;
-
统计用户连续交易的总额、连续登陆天数、连续登陆开始和结束时间、间隔天数
select ttt1.id, ttt1.grp, sum(ttt1.sum_amount) as sum_over_amount, count(1) as lianxu_days, min(ttt1.datestr) as start_date, max(ttt1.datestr) as end_date, datediff( ttt1.grp,( lag(ttt1.grp, 1) over( partition by ttt1.id order by ttt1.grp ) ) ) as interval_days from ( select tt1.id as id, tt1.datestr as datestr, tt1.sum_amount as sum_amount, date_sub(tt1.datestr, tt1.rn) as grp from ( select t1.id as id, t1.datestr as datestr, t1.sum_amount as sum_amount, row_number() over( partition by t1.id order by t1.datestr ) as rn from ( select id, datestr, sum(amount) as sum_amount from deal_tb group by id, datestr ) t1 ) tt1 ) ttt1 group by ttt1.id, ttt1.grp;