from pyspark.sql import Window, SparkSession
import pyspark.sql.functions as F

# Placeholder DataFrame; data.columns = ['A', 'B', 'C', 'D']
# NOTE(review): `spark` is assumed to be an existing SparkSession
# (e.g. SparkSession.builder.getOrCreate()) — these are notebook-style notes.
data = spark.createDataFrame(...)

# Partition by column A, order each partition by B, and slide a window over
# the current row plus the two preceding rows; within each window compute
# the sum of column C ("c_s") and the mean of column D ("d_mean").
win = Window.partitionBy("A").orderBy(data['B']).rowsBetween(-2, Window.currentRow)
# Fixed typo: `withCOlumn` -> `withColumn` (the original would raise
# AttributeError at runtime since DataFrame has no `withCOlumn` method).
data.withColumn("c_s", F.sum("C").over(win)).withColumn("d_mean", F.mean("D").over(win))
# 2. Use a sliding window to fetch the previous row's value
from pyspark.sql import Window, SparkSession
import pyspark.sql.functions as F

# Placeholder DataFrame; data.columns = ['A', 'B', 'C', 'D']
# NOTE(review): assumes an existing SparkSession bound to `spark`.
data = spark.createDataFrame(...)

# Group rows by A and sort each group by B, then attach the previous row's C
# value as a new column "C_lag" via a lag window (NULL for each group's first row).
win = Window.partitionBy("A").orderBy(F.col("B"))
prev_c = F.lag("C", 1).over(win)
data.withColumn("C_lag", prev_c)
# 3. Sort, and add the rank value as a new column
import pyspark.sql.functions as F
from pyspark.sql import Window

# Rank every row by column A in descending order and expose the dense rank
# as a new "rank" column, then display the result.
# NOTE(review): an unpartitioned window moves all rows to a single partition —
# acceptable for small/notebook data, costly at scale.
rank_window = Window.orderBy(F.desc("A"))
ranked = data.withColumn("rank", F.dense_rank().over(rank_window))
ranked.show()