SQL经典案例之窗口函数的计算使用
构造测试数据
-- drop table IF EXISTS t_windows;
drop table t_windows;
create table t_windows(tid int,tname varchar(10),tnum int);
insert into t_windows values(1,'ORACLE',72);
insert into t_windows values(2,'MYSQL',78);
insert into t_windows values(3,'POSTGRESQL',85);
insert into t_windows values(4,'GAUSSDB',85);
insert into t_windows values(5,'SQLSERVER',88);
insert into t_windows values(6,'DB2',92);
insert into t_windows values(7,'TIDB',95);
insert into t_windows values(8,'MONGODB',95);
insert into t_windows values(9,'GODB',100);
commit;
计算累计值
窗口函数 SUM OVER 能轻松地生成数据累计(移动)的总计
select tid,tname,tnum,
sum(tnum) over(order by tnum) as num_total1,
sum(tnum) over(order by tnum,tname) as num_total2
from t_windows
order by tnum;
当排序列 tnum 存在重复数据时会出现重复值提前相加问题,对此应保证窗口函数排序的值是唯一的

计算众数
众数:数据集中出现频率(次数)最高的那个数据(元素),可存在多个
dense_rank
基础通用写法
select tnum
from (
select tnum,dense_rank() over(order by cnt desc) as rnk
from (select tnum,count(*) as cnt from t_windows group by tnum) as x
) as y
where rnk = 1;

CTE
-- 当存在多个重复的众数时用limit无法确定提取个数
select tnum,count(*) from t_windows group by 1 order by 2 desc limit 1;
-- 按次数再按数值大小降序排序
select tnum,count(*) from t_windows group by 1 order by 2 desc,1 desc limit 1;
-- CTE 写法
with x as(
select tnum,count(*) as num from t_windows group by tnum),
y as(
select max(num) as tmax from x)
select tnum from y left join x on x.num = y.tmax;

keep
Oracle 实现的小细节
-- Oracle 子查询表不能用 as 显示指定为别名,group by 后面要写明具体字段名不能用数字代替
select tnum
from (
select tnum,dense_rank() over(order by cnt desc) rnk
from (select tnum,count(*) as cnt from t_windows group by tnum) x
) y
where rnk = 1;
-- 使用聚合函数MAX的KEEP扩展来查找众数只返回出现次数最多的最大值,等同于分组排序后的limit 1
select max(tnum) keep(dense_rank first order by cnt desc) tnum_max
from (
select tnum,count(*) as cnt from t_windows group by tnum
) ;
-- rownum 实现 limit 1
select tnum,count(*) from t_windows group by tnum order by count(*) desc,tnum desc;
select tnum from (select tnum,count(*) from t_windows group by tnum order by count(*) desc,tnum desc) x where rownum = 1;

计算中位数
中位数(中值):顺序排列的一组数据中居于中间位置的数,偶数个则为中间两位和的平均值
percentile_cont
-- postgresql/oracle
select percentile_cont(0.5) within group(order by tnum) from t_windows;
-- oracle的MEDIAN函数也能实现
select median(tnum) from t_windows;
-- sqlserver
select percentile_cont(0.5) within group(order by tnum) over() from t_windows;
percent_rank
mysql没有percentile_cont函数,可采用权变结合percent_rank和CTE来实现,找出百分位数不超过50的最大值和百分位数不低于50的最小值,再计算其平均值
- percent_rank 窗口函数无法放置于where后面,故采用CTE写法
- order by 排序处无法使用 union ,故多写了两个CTE
with rank_tab(tnum,rank_sal) as(
select tnum,percent_rank() over(order by tnum) as rank_sal from t_windows
),
inter_asc as(
select tnum,rank_sal from rank_tab where rank_sal>=0.5 order by tnum limit 1
),
inter_desc as(
select tnum,rank_sal from rank_tab where rank_sal<=0.5 order by tnum desc limit 1
),
inter as(
select tnum from inter_asc
union
select tnum from inter_desc
)select avg(tnum) as MedianSal from inter;

计算截尾均值
在计算平均值时,为了降低两端最值对整体均值的影响,会考虑剔除最大值和最小值后再统计平均值
基本通用写法,使用子查询和聚合函数将最大值和最小值排除在外:
select avg(tnum) from t_windows
where tnum not in (
(select min(tnum) from t_windows),
(select max(tnum) from t_windows)
);
使用窗口函数将最大值和最小值排除在外:
select avg(tnum) from(
select tnum,min(tnum) over() min_num,
max(tnum) over() max_num
from t_windows
) x
where tnum not in (min_num,max_num);

注意: 以上两种方式当存在多个相同的最大值或最小值时都会被排除,其次则是注意不同数据库返回的数据精度也有差别
-- 即多插入一条最大值计算的平均值还是一样的
insert into t_windows values(10,'GODB',100);
delete from t_windows where tid=10;
如果只是要排除单个最高值和最低值,可以从SUM聚合结果中减去它们再执行除法计算
select (sum(tnum)-min(tnum)-max(tnum))/(count(*)-2) from t_windows;

拓展
窗口排序函数
- rank:跳跃排序,1、2、2、4、5 . . .
- dense_rank:重复无间断排序,1、2、2、3、4 …
- row_number:递增排序,1、2、3、4、5 …
-- dense_rank
select tnum,dense_rank() over(order by cnt desc) as rnk
from (select tnum,count(*) as cnt from t_windows group by 1 ) as x;
-- rank
select tnum,rank() over(order by cnt desc) as rnk
from (select tnum,count(*) as cnt from t_windows group by 1 ) as x;
-- row_number
select tnum,row_number() over(order by cnt desc) as rnk
from (select tnum,count(*) as cnt from t_windows group by 1 ) as x;

窗口偏移函数
- LAG(字段,偏移量,无值时返回):向下偏移,获取上一条记录
- LEAD(字段,偏移量,无值时返回):向上偏移,获取下一条记录
- FIRST_VALUE:分区内排序后,获取第一个值和最后一个值
- LAST_VALUE:分区内排序后,获取最后一个值
insert into t_windows values(10,'GODB',200);
select *,
LAG(tnum) over(order by tnum desc) as lag_num,
LEAD(tnum) over(order by tnum desc) as lead_num,
LAG(tnum,2,0) over(order by tnum desc) as lag_num_tab,
LEAD(tnum,2,0) over(order by tnum desc) as lead_num_tab,
FIRST_VALUE (tnum) OVER(PARTITION BY tname ORDER BY tnum desc) AS FirstValue,
LAST_VALUE (tnum) OVER(PARTITION BY tname ORDER BY tnum desc ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS LastValue
from t_windows;

窗口统计函数
- cume_dist():0< cume_dist <=1,计算公式 = ((统计的列的种类)/(统计的列的总数))* 每个重复种类中的个数
- percent_rank():0<= percent_rank <=1,计算公式 = ((统计的列的种类)/(统计的列的总数))*100%,第一个总是从 0 开始的,重复几次那就会有几个 0,最后一个是 1
SELECT tid,tname,tnum,
cume_dist() OVER (ORDER BY tnum) as cume_dist,
percent_rank() over (order by tnum) as percent_rank
FROM t_windows;


670

被折叠的 条评论
为什么被折叠?



