在Spark DataFrame中可以用Pivot函数将多行转成多列。
样本数据
每个用户对每部电影的评分。
数据格式:userID(用户ID), movieID(电影ID), rating(评分)
15,399,2
15,1401,5
15,1608,4
15,20,4
18,100,3
18,1401,3
18,399,1
Spark DataFrame 多行转多列
package com.bigData.spark
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
/**
* Author: Wang Pei
* License: Copyright(c) Pei.Wang
* Summary:
* 多行转多列
*
*/
object MultiLineToOneLine {

  /**
   * Demo: pivot long-format movie ratings (one row per user/movie pair)
   * into wide format (one row per user, one column per movie).
   */
  def main(args: Array[String]): Unit = {
    // Suppress Spark's noisy INFO logging; keep warnings and errors.
    Logger.getLogger("org").setLevel(Level.WARN)

    // Local Spark session with 3 threads, named after this class.
    val spark = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[3]").getOrCreate()
    import spark.implicits._

    // Sample ratings: userID, movieID, rating — each user/movie pair appears once.
    val df = spark.sparkContext.parallelize(Array(
      (15, 399, 2),
      (15, 1401, 5),
      (15, 1608, 4),
      (15, 20, 4),
      (18, 100, 3),
      (18, 1401, 3),
      (18, 399, 1)
    )).toDF("userID", "movieID", "rating")

    // Pivot: group by user, one output column per distinct movieID.
    // sum("rating") is the aggregate; since each (user, movie) pair occurs
    // exactly once in this data, the "sum" is simply that rating.
    // Users who never rated a movie get null there — filled with -1.
    val resultDF = df.groupBy($"userID").pivot("movieID").sum("rating").na.fill(-1)

    resultDF.show(false)
    /*
    Expected output:
    +------+---+---+---+----+----+
    |userID|20 |100|399|1401|1608|
    +------+---+---+---+----+----+
    |15    |4  |-1 |2  |5   |4   |
    |18    |-1 |3  |1  |3   |-1  |
    +------+---+---+---+----+----+
    */

    // Fix: release Spark resources explicitly instead of relying on the
    // JVM shutdown hook (the original never stopped the session).
    spark.stop()
  }
}