1. Creating a DataFrame from a txt file that has a header row:
Use mapPartitionsWithIndex to drop the header line from the first partition:
val teacherRdd = sc.textFile("src/test/teacher.txt")
// Drop the first line (the header) of partition 0 only,
// then parse each remaining line into a teacher case class.
val teacherRddSchema = teacherRdd
  .mapPartitionsWithIndex((idx, iter) => if (idx == 0) iter.drop(1) else iter)
  .map(row => row.split(" "))
  .map(field => teacher(field(0).toInt, field(1), field(2)))
val teacherDF = teacherRddSchema.toDF()
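If the txt file is consistently space-delimited, an alternative is to let the CSV reader consume the header itself; a minimal sketch, assuming a SparkSession named spark and a single-space separator:
// Sketch: read the header-bearing txt file directly with the CSV reader.
val teacherDF2 = spark.read
  .option("header", "true")      // first line is the header
  .option("sep", " ")            // assumes a single-space delimiter
  .option("inferSchema", "true") // let Spark infer column types
  .csv("src/test/teacher.txt")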
2. When creating a DataFrame from a CSV file, supply a case class to define the schema:
import org.apache.spark.sql.Encoders

case class student(id: Int, name: String, course: String, score: Int)

// Derive a StructType schema from the case class via its product encoder.
val schema = Encoders.product[student].schema
val studentDf = spark.read
  .format("csv")
  .option("header", true)
  .schema(schema)
  .load("src/test/student.csv")
  .as[student]
studentDf.printSchema()
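The same schema can also be written out by hand as a StructType; a minimal sketch mirroring the student case class above (nullability may differ slightly from the encoder-derived schema):
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Sketch: hand-written schema roughly equivalent to Encoders.product[student].schema.
val manualSchema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("name", StringType),
  StructField("course", StringType),
  StructField("score", IntegerType)
))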
Complete program:
import org.apache.log4j._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Encoders

case class teacher(id: Int, name: String, course: String)
case class student(id: Int, name: String, course: String, score: Int)
case class result(name: String, course: String, score: Int)
object test {
  def main(args: Array[String]): Unit = {
    // Keep the console readable by silencing Spark's INFO logging.
    Logger.getLogger("org").setLevel(Level.ERROR)

    // Build the SparkSession once and take the SparkContext from it,
    // rather than constructing a separate SparkConf/SparkContext.
    val spark = SparkSession.builder().appName("test").master("local").getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // Read the CSV with a schema derived from the student case class.
    val schema = Encoders.product[student].schema
    val studentDf = spark.read
      .format("csv")
      .option("header", true)
      .schema(schema)
      .load("src/test/student.csv")
      .as[student]
    studentDf.printSchema()
    // Build the teacher DataFrame from a txt file, dropping the header line
    // of the first partition before parsing each row.
    val teacherRdd = sc.textFile("src/test/teacher.txt")
    val teacherRddSchema = teacherRdd
      .mapPartitionsWithIndex((idx, iter) => if (idx == 0) iter.drop(1) else iter)
      .map(row => row.split(" "))
      .map(field => teacher(field(0).toInt, field(1), field(2)))
    val teacherDF = teacherRddSchema.toDF()
    teacherDF.printSchema()
    teacherDF.show()
    // Rank students within each course by score (dense_rank keeps ties),
    // then keep only the top-ranked row(s) per course.
    val rankSpec = Window.partitionBy("course").orderBy(studentDf("score").desc)
    val rank = studentDf.withColumn("rank", dense_rank().over(rankSpec))
    val result = rank.select("*").where($"rank" <= 1).drop("rank")
    println("result:")
    result.show()
    /*
     * result:
     * +---+------+--------+-----+
     * | id|  name|  course|score|
     * +---+------+--------+-----+
     * |  5|Kelvin|    Math|   99|
     * |  2|  Lucy| English|   88|
     * |  3| Sandy|Computer|   95|
     * |  7| Lucas| Physics|   99|
     * +---+------+--------+-----+
     */
println("result2")
val result2 = studentDf.groupBy("course").agg(Map("score" -> "max"))
result2.show()
    /*
     * result2
     * +--------+----------+
     * |  course|max(score)|
     * +--------+----------+
     * |    Math|        99|
     * | English|        88|
     * |Computer|        95|
     * | Physics|        99|
     * +--------+----------+
     */
    // Print a readable line for each per-course top scorer.
    result.as[result].foreach(row => println(s"${row.name} in ${row.course} got the highest score ${row.score}"))
    studentDf.show()

    // Rows matching the single highest score across all courses.
    val result1 = studentDf.select("*").where($"score" === studentDf.agg(max("score")).first.get(0))
    result1.show()

    // Several equivalent ways to write per-course aggregations.
    studentDf.groupBy("course").avg("score").show()
    studentDf.groupBy("course").agg(avg("score")).show()
    studentDf.groupBy("course").agg(Map("score" -> "max")).show()
    studentDf.groupBy("course").agg(Map("score" -> "min")).show()
    studentDf.groupBy("course").agg(("score", "min")).show()
    studentDf.groupBy("course").agg(Map("score" -> "sum")).show()
  }
}
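For comparison, the same top-scorer-per-course query can also be expressed in Spark SQL once studentDf is registered as a temp view; a minimal sketch, assuming the spark session and studentDf defined above:
// Sketch: the same dense_rank window query, written in SQL.
studentDf.createOrReplaceTempView("student")
spark.sql(
  """SELECT id, name, course, score
    |FROM (SELECT *,
    |             dense_rank() OVER (PARTITION BY course ORDER BY score DESC) AS rnk
    |      FROM student) t
    |WHERE rnk = 1""".stripMargin).show()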