Window functions are evaluated last in a query, after WHERE, GROUP BY and HAVING, and just before the final ORDER BY.
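One practical consequence, sketched below against the buy_record table created in the next section: ORDER BY can sort by a window function's result, while WHERE cannot filter on it in the same query block (the alias user_total is just for illustration).
select user_name, price,
sum(price) over(partition by user_name) as user_total
from buy_record
-- ORDER BY runs after the window function, so it can sort by its alias;
-- a "WHERE user_total > 100" here would fail and would need a subquery instead.
order by user_total desc, price desc;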
Create two tables for testing:
USE sid;
CREATE TABLE buy_record(
id INT COMMENT 'purchase record id',
user_name STRING COMMENT 'user name',
buy_date STRING COMMENT 'purchase date',
goods_id INT COMMENT 'goods id',
price DOUBLE COMMENT 'goods price'
) COMMENT 'goods purchase records'
STORED AS PARQUET;
CREATE TABLE user_access_log(
user_name STRING COMMENT 'user name',
time STRING COMMENT 'access time',
url STRING COMMENT 'accessed URL'
) COMMENT 'user URL access log'
STORED AS PARQUET;
Insert test data:
INSERT INTO TABLE buy_record
VALUES (1, 'sid', '20180701',5,128),
(2, 'sid', '20180701',4,20),
(3, 'sid', '20180701',5,128),
(4, 'sid', '20180721',5,128),
(5, 'sid', '20180721',3,40),
(6, 'zhangsan', '20180701',5,128),
(7, 'zhangsan', '20180701',4,20),
(8, 'zhangsan', '20180721',3,40),
(9, 'lisi', '20180711',5,128),
(10, 'lisi', '20180711',4,20);
INSERT INTO TABLE user_access_log
VALUES ('sid', '2018-07-01 01:10:00','URL1'),
('sid', '2018-07-01 01:15:00','URL2'),
('sid', '2018-07-01 01:12:00','URL3'),
('sid', '2018-07-21 01:13:00','URL4'),
('sid', '2018-07-21 01:18:00','URL5'),
('zhangsan', '2018-07-01 01:20:00','URL1'),
('zhangsan', '2018-07-01 01:25:00','URL2'),
('zhangsan', '2018-07-21 01:22:00','URL3'),
('lisi', '2018-07-11 01:10:00','URL1'),
('lisi', '2018-07-11 01:10:40','URL2');
select * from buy_record;
1 sid 20180701 5 128.0
2 sid 20180701 4 20.0
3 sid 20180701 5 128.0
4 sid 20180721 5 128.0
5 sid 20180721 3 40.0
6 zhangsan 20180701 5 128.0
7 zhangsan 20180701 4 20.0
8 zhangsan 20180721 3 40.0
9 lisi 20180711 5 128.0
10 lisi 20180711 4 20.0
select * from user_access_log;
sid 2018-07-01 01:10:00 URL1
sid 2018-07-01 01:15:00 URL2
sid 2018-07-01 01:12:00 URL3
sid 2018-07-21 01:13:00 URL4
sid 2018-07-21 01:18:00 URL5
zhangsan 2018-07-01 01:20:00 URL1
zhangsan 2018-07-01 01:25:00 URL2
zhangsan 2018-07-21 01:22:00 URL3
lisi 2018-07-11 01:10:00 URL1
lisi 2018-07-11 01:10:40 URL2
1.Windowing functions
LEAD(col,n,DEFAULT)
Returns, within each partition after ordering, the value of column col from the row n rows after the current row. col is the column name, n is how many rows ahead to look (optional, default 1), and DEFAULT is the value returned when no row exists n rows ahead (NULL if not specified).
Requirement: from the user access log table, compute how long each user stayed on each URL.
select user_name,
time stime,
lead(time) over(partition by user_name order by time) etime,
UNIX_TIMESTAMP(lead(time) over(partition by user_name order by time),'yyyy-MM-dd HH:mm:ss')- UNIX_TIMESTAMP(time,'yyyy-MM-dd HH:mm:ss') period,
url
from user_access_log;
stime is the time the user started visiting the URL; etime is the time the user moved on to the next URL. etime uses an OVER window partitioned by user_name and ordered by time ascending, so lead(time) is the time at which the same user visited the next URL, and the difference of the two Unix timestamps is the dwell time in seconds.
lisi 2018-07-11 01:10:00 2018-07-11 01:10:40 40 URL1
lisi 2018-07-11 01:10:40 NULL NULL URL2
sid 2018-07-01 01:10:00 2018-07-01 01:12:00 120 URL1
sid 2018-07-01 01:12:00 2018-07-01 01:15:00 180 URL3
sid 2018-07-01 01:15:00 2018-07-21 01:13:00 1727880 URL2
sid 2018-07-21 01:13:00 2018-07-21 01:18:00 300 URL4
sid 2018-07-21 01:18:00 NULL NULL URL5
zhangsan 2018-07-01 01:20:00 2018-07-01 01:25:00 300 URL1
zhangsan 2018-07-01 01:25:00 2018-07-21 01:22:00 1727820 URL2
zhangsan 2018-07-21 01:22:00 NULL NULL URL3
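The query above only uses the one-argument form of LEAD. A minimal sketch of the three-argument form, looking two rows ahead and substituting a placeholder when no such row exists (the 'N/A' default and the alias next2_time are just for illustration):
select user_name,
time,
lead(time, 2, 'N/A') over(partition by user_name order by time) next2_time
from user_access_log;
-- lisi's two rows both get 'N/A', because neither has a row two positions ahead.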
LAG(col,n,DEFAULT)
Returns, within each partition after ordering, the value of column col from the row n rows before the current row. col is the column name, n is how many rows back to look (optional, default 1), and DEFAULT is the value returned when no row exists n rows back (NULL if not specified).
select user_name,
time time,
LAG(time) over(partition by user_name order by time) last1_time,
LAG(time,2) over(partition by user_name order by time) last2_time
from user_access_log;
Result (user_name, time, last1_time, last2_time):
lisi 2018-07-11 01:10:00 NULL NULL
lisi 2018-07-11 01:10:40 2018-07-11 01:10:00 NULL
sid 2018-07-01 01:10:00 NULL NULL
sid 2018-07-01 01:12:00 2018-07-01 01:10:00 NULL
sid 2018-07-01 01:15:00 2018-07-01 01:12:00 2018-07-01 01:10:00
sid 2018-07-21 01:13:00 2018-07-01 01:15:00 2018-07-01 01:12:00
sid 2018-07-21 01:18:00 2018-07-21 01:13:00 2018-07-01 01:15:00
zhangsan 2018-07-01 01:20:00 NULL NULL
zhangsan 2018-07-01 01:25:00 2018-07-01 01:20:00 NULL
zhangsan 2018-07-21 01:22:00 2018-07-01 01:25:00 2018-07-01 01:20:00
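As with the LEAD example, pairing LAG with UNIX_TIMESTAMP gives the gap between the current visit and the previous one; a minimal sketch (the alias since_last is just for illustration):
select user_name,
time,
UNIX_TIMESTAMP(time,'yyyy-MM-dd HH:mm:ss') - UNIX_TIMESTAMP(lag(time) over(partition by user_name order by time),'yyyy-MM-dd HH:mm:ss') since_last
from user_access_log;
-- The first visit of each user has no previous row, so since_last is NULL there.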
FIRST_VALUE
Returns the value of the given column from the first row of each partition after ordering.
LAST_VALUE
Returns the value of the given column from the last row of the window frame; with the default frame this is the last row up to and including the current row.
select user_name,
time time,
first_value(time) over(partition by user_name order by time) firsttime,
last_value(time) over(partition by user_name order by time) last_time
from user_access_log;
Result:
user_name time first_value last_value
lisi 2018-07-11 01:10:00 2018-07-11 01:10:00 2018-07-11 01:10:00
lisi 2018-07-11 01:10:40 2018-07-11 01:10:00 2018-07-11 01:10:40
sid 2018-07-01 01:10:00 2018-07-01 01:10:00 2018-07-01 01:10:00
sid 2018-07-01 01:12:00 2018-07-01 01:10:00 2018-07-01 01:12:00
sid 2018-07-01 01:15:00 2018-07-01 01:10:00 2018-07-01 01:15:00
sid 2018-07-21 01:13:00 2018-07-01 01:10:00 2018-07-21 01:13:00
sid 2018-07-21 01:18:00 2018-07-01 01:10:00 2018-07-21 01:18:00
zhangsan 2018-07-01 01:20:00 2018-07-01 01:20:00 2018-07-01 01:20:00
zhangsan 2018-07-01 01:25:00 2018-07-01 01:20:00 2018-07-01 01:25:00
zhangsan 2018-07-21 01:22:00 2018-07-01 01:20:00 2018-07-21 01:22:00
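Because the default frame ends at the current row, last_time above always equals the current row's own time. To get each user's true last access time on every row, the frame has to be widened explicitly; a minimal sketch (the alias partition_last_time is just for illustration):
select user_name,
time,
last_value(time) over(partition by user_name order by time rows between UNBOUNDED PRECEDING and UNBOUNDED FOLLOWING) partition_last_time
from user_access_log;
-- Every row of a user now carries that user's latest access time.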
2.The OVER clause
OVER with a PARTITION BY: see the examples above. It behaves much like GROUP BY, except that it does not collapse rows on the partition columns; every input row is kept.
OVER with PARTITION BY and ORDER BY: additionally orders the rows within each partition.
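A minimal sketch of the difference, using SUM on buy_record (the alias total_price is just for illustration): GROUP BY returns one row per user, while SUM(...) OVER (PARTITION BY ...) keeps every purchase row and attaches the per-user total to it.
-- GROUP BY: one row per user_name
select user_name, sum(price) as total_price
from buy_record
group by user_name;
-- OVER with PARTITION BY: every purchase row is kept, each carrying its user's total
select id, user_name, buy_date, price,
sum(price) over(partition by user_name) as total_price
from buy_record;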
3.Windowing clause
(ROWS | RANGE) BETWEEN (UNBOUNDED | [num]) PRECEDING AND ([num] PRECEDING | CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
(ROWS | RANGE) BETWEEN CURRENT ROW AND (CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
(ROWS | RANGE) BETWEEN [num] FOLLOWING AND (UNBOUNDED | [num]) FOLLOWING
If only PARTITION BY is used, without ORDER BY, the aggregate covers the whole partition.
If ORDER BY is used but no window clause is given, the default frame runs from the start of the partition to the current row (RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, so rows that tie on the ORDER BY key are aggregated together).
When several window functions appear in the same SELECT, they do not affect each other; each applies its own window.
PRECEDING: rows before the current row
FOLLOWING: rows after the current row
CURRENT ROW: the current row
UNBOUNDED: the partition boundary
UNBOUNDED PRECEDING: from the start of the partition
UNBOUNDED FOLLOWING: to the end of the partition
select id,user_name,buy_date,price,
sum(price) over() as sample1, -- sum over all rows
sum(price) over(partition by user_name) as sample2, -- sum within each user_name partition
sum(price) over(partition by user_name order by buy_date) as sample3, -- running sum per partition with the default RANGE frame, so rows tied on buy_date share the same value
sum(price) over(partition by user_name order by buy_date rows between UNBOUNDED PRECEDING and current row) as sample4, -- running sum from the partition start to the current row, counted row by row (differs from sample3 on tied buy_date values)
sum(price) over(partition by user_name order by buy_date rows between 1 PRECEDING and current row) as sample5, -- current row plus the previous row
sum(price) over(partition by user_name order by buy_date rows between 1 PRECEDING AND 1 FOLLOWING) as sample6, -- previous row, current row and next row
sum(price) over(partition by user_name order by buy_date rows between current row and UNBOUNDED FOLLOWING) as sample7 -- current row and all following rows
from buy_record;
id user_name buy_date price sample1 sample2 sample3 sample4 sample5 sample6 sample7
10 lisi 20180711 20.0 780.0 148.0 148.0 20.0 20.0 148.0 148.0
9 lisi 20180711 128.0 780.0 148.0 148.0 148.0 148.0 148.0 128.0
3 sid 20180701 128.0 780.0 444.0 276.0 128.0 128.0 148.0 444.0
2 sid 20180701 20.0 780.0 444.0 276.0 148.0 148.0 276.0 316.0
1 sid 20180701 128.0 780.0 444.0 276.0 276.0 148.0 188.0 296.0
5 sid 20180721 40.0 780.0 444.0 444.0 316.0 168.0 296.0 168.0
4 sid 20180721 128.0 780.0 444.0 444.0 444.0 168.0 168.0 128.0
7 zhangsan 20180701 20.0 780.0 188.0 148.0 20.0 20.0 148.0 188.0
6 zhangsan 20180701 128.0 780.0 188.0 148.0 148.0 148.0 188.0 168.0
8 zhangsan 20180721 40.0 780.0 188.0 188.0 188.0 168.0 168.0 40.0
Compared with lag/lead/first_value/last_value, which only pick a single column value from a nearby row, the window clause lets you aggregate over a range of rows before and after the current one.
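For example, a moving average over the current purchase and the two before it is written the same way as sample5/sample6 above, just with AVG instead of SUM; a minimal sketch (the alias moving_avg is just for illustration):
select id,user_name,buy_date,price,
avg(price) over(partition by user_name order by buy_date rows between 2 PRECEDING and current row) as moving_avg
from buy_record;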
4.Analytics functions
ROW_NUMBER(): assigns sequence numbers starting at 1 within each partition according to the ordering. The numbers never repeat; when the ordering values tie, rows are numbered in the order they appear in the table.
RANK(): ranks rows within each partition; tied rows share the same rank and leave gaps after them.
DENSE_RANK(): ranks rows within each partition; tied rows share the same rank and leave no gaps.
PERCENT_RANK(): (rank - 1) / (rows in partition - 1), the relative rank of the row within its partition; it appears as rn3 in the query below.
SELECT
user_name,
buy_date,
price,
RANK() OVER(PARTITION BY user_name ORDER BY price DESC) AS rn1,
DENSE_RANK() OVER(PARTITION BY user_name ORDER BY price DESC) AS rn2,
PERCENT_RANK() OVER(PARTITION BY user_name ORDER BY price DESC) AS rn3,
ROW_NUMBER() OVER(PARTITION BY user_name ORDER BY price DESC) AS rn4
FROM buy_record;
user_name buy_date price rn1 rn2 rn3 rn4
lisi 20180711 128.0 1 1 0.0 1
lisi 20180711 20.0 2 2 1.0 2
sid 20180721 128.0 1 1 0.0 1
sid 20180701 128.0 1 1 0.0 2
sid 20180701 128.0 1 1 0.0 3
sid 20180721 40.0 4 2 0.75 4
sid 20180701 20.0 5 3 1.0 5
zhangsan 20180701 128.0 1 1 0.0 1
zhangsan 20180721 40.0 2 2 0.5 2
zhangsan 20180701 20.0 3 3 1.0 3
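A common use of ROW_NUMBER is picking the top N rows per group. Because a window function's result cannot be filtered in the same query block, the ranking is wrapped in a subquery; a minimal sketch that keeps each user's single most expensive purchase (the aliases t and rn are just for illustration):
select user_name, buy_date, price
from (
select user_name, buy_date, price,
row_number() over(partition by user_name order by price desc) as rn
from buy_record
) t
where rn = 1;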
NTILE
Splits the rows of each partition into n roughly equal buckets and returns the bucket number of each row; for example NTILE(3) splits the partition into 3 buckets.
NTILE does not support ROWS BETWEEN,
for example NTILE(2) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) is not allowed.
If the rows cannot be split evenly, the earlier buckets receive the extra rows first.
select id,user_name,buy_date,price,
ntile(3) over() as sample1, -- split all rows into 3 buckets
ntile(3) over(partition by user_name) as sample2, -- split each user_name partition into 3 buckets
ntile(3) over(order by price) as sample3, -- order all rows by price ascending and split into 3 buckets
ntile(3) over(partition by user_name order by price) as sample4 -- within each user_name partition, order by price ascending and split into 3 buckets
from buy_record;
10 lisi 20180711 20.0 1 2 1 1
9 lisi 20180711 128.0 1 1 3 2
2 sid 20180701 20.0 3 1 1 1
5 sid 20180721 40.0 2 3 2 1
1 sid 20180701 128.0 3 1 3 2
3 sid 20180701 128.0 3 2 3 2
4 sid 20180721 128.0 2 2 2 3
7 zhangsan 20180701 20.0 1 2 1 1
8 zhangsan 20180721 40.0 1 3 1 2
6 zhangsan 20180701 128.0 2 1 2 3
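NTILE buckets are often filtered in a subquery, for example to keep only the top third of purchases by price; a minimal sketch (the aliases t and price_bucket are just for illustration):
select id, user_name, buy_date, price
from (
select id, user_name, buy_date, price,
ntile(3) over(order by price desc) as price_bucket
from buy_record
) t
where price_bucket = 1;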