import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
/**
 * Prints, for every (departmentId, classId) group, the top three rows per
 * subject (language, math, english) using a Spark SQL window function.
 *
 * Technique:
 * `row_number() OVER (PARTITION BY col1 ORDER BY col2) rank` partitions the
 * rows by `col1`, orders each partition by `col2`, and numbers the rows of
 * every partition starting at 1. The query is built from two parts:
 *   1. an inner SELECT (aliased `tmp`) that adds the `rank` column, partitioned
 *      by departmentId, classId and ordered by the subject score descending;
 *   2. an outer SELECT that keeps only rows with `rank <= 3`.
 *
 * NOTE(review): `row_number()` assigns distinct numbers to tied scores, so
 * which of several equal scores makes the cut is not deterministic.
 */
object TestSqlGroupByOrder {

  /** Score file read when no path is supplied on the command line. */
  private val DefaultScoresPath = "/Users/wangpei/Desktop/scores2.txt"

  /** Name of the temporary view the score DataFrame is registered under. */
  private val ScoresView = "scoresTable"

  /** Number of rows kept per (departmentId, classId) group. */
  private val TopN = 3

  /** Subject column paired with the header printed above its result table. */
  private val Subjects: Seq[(String, String)] = Seq(
    "language" -> "语文前3",
    "math" -> "数学前3",
    "english" -> "外语前3"
  )

  /**
   * Builds the "top N per group" query for one subject column.
   *
   * @param subject score column to order by (one of the columns of the view)
   * @return SQL text selecting departmentId, classId, the subject score and
   *         studentId for the rows numbered 1..TopN inside each group
   */
  private def topNQuery(subject: String): String =
    s"SELECT departmentId,classId,$subject,studentId FROM (SELECT *, row_number() OVER (PARTITION BY departmentId,classId ORDER BY $subject DESC) rank FROM $ScoresView ) tmp WHERE rank <= $TopN"

  /**
   * Entry point: loads the score file, registers it as a temporary view and
   * prints the top three rows per group for each subject.
   *
   * @param args optional; `args(0)` overrides the default score file path
   */
  def main(args: Array[String]): Unit = {
    // Set the log level: silence everything below WARN from "org.*" loggers.
    Logger.getLogger("org").setLevel(Level.WARN)

    val inputPath = args.headOption.getOrElse(DefaultScoresPath)

    // SparkSession was introduced in Spark 2.0 as the single entry point
    // (SparkSession = SQLContext + HiveContext).
    val sparkSession = SparkSession.builder()
      .appName("SparkSqlGroup")
      .master("local[6]")
      .getOrCreate()

    try {
      // Brings the encoders and the toDF conversion into scope.
      import sparkSession.implicits._

      // Each line is comma-separated; field 0 is not used, fields 1..6 map to
      // the columns named in toDF below.
      val scoreInfo = sparkSession.read.textFile(inputPath)
        .filter(line => line.trim.nonEmpty) // blank lines would fail on item(1)
        .map(_.split(","))
        .map { item =>
          (item(1), item(2).trim.toInt, item(3).trim.toInt, item(4).trim.toInt, item(5), item(6))
        }
        .toDF("studentId", "language", "math", "english", "classId", "departmentId")

      // Register the DataFrame as a temporary view so it can be queried by name.
      scoreInfo.createOrReplaceTempView(ScoresView)

      Subjects.foreach { case (column, title) =>
        println(s"############# $title ##############")
        sparkSession.sql(topNQuery(column)).show()
      }
    } finally {
      // Release the local Spark context even when loading or querying fails.
      sparkSession.stop()
    }
  }
}