import java.text.SimpleDateFormat
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
/**
* @Author: Administrator and wind
* @Version: 2019/11/19 & 1.0
*
* Spark SQL example queries.
*
* Student fields:
* sno       - student number
* sname     - name
* ssex      - gender
* sbirthday - birthday
* sclass    - class
*/
case class Student(sno:String,sname:String,ssex:String,sbirthday:String,sclass:String)
// tno is the teacher (staff) number
case class Course(cno:String,cname:String,tno:String)
// sno: student number, cno: course number, degree: score
case class Score(sno:String,cno:String,degree:String)
// tprof: professional title, tdepart: teacher's department
case class Teacher(tno:String,tname:String,tsex:String,tbirthday:String,tprof:String,tdepart:String)
object SparkSQLExample {
def main(args: Array[String]): Unit = {
// Hadoop environment variable so the program runs without errors (needed on Windows)
//System.setProperty("hadoop.home.dir","E:\\hadoop-2.8.4")
// Reduce log output
Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
//Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
/**
* Formats the current system time with the given pattern, e.g. "yyyy" or "yyyy-MM-dd".
*/
def getDate(time:String) = {
val now : Long = System.currentTimeMillis()
val df : SimpleDateFormat = new SimpleDateFormat(time)
df.format(now)
}
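// Example: getDate("yyyy") returns the current year as a string (e.g. "2019"); it is used in the age query at the end.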
// Spark session, running in local mode
val spark = SparkSession.builder()
.master("local")
.appName("SparkSQLExample")
.getOrCreate()
// Load the source files and register each as a temporary view
import spark.implicits._
spark.sparkContext
.textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Student.csv")
.map(_.split(","))
.map(x => Student(x(0),x(1),x(2),x(3),x(4)) )
.toDF()
.createOrReplaceTempView("Student")
spark.sparkContext
.textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Course.csv")
.map(_.split(","))
.map(x => Course(x(0),x(1),x(2)))
.toDF()
.createOrReplaceTempView("Course")
spark.sparkContext
.textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Score.csv")
.map(_.split(","))
.map(x => Score(x(0),x(1),x(2)))
.toDF()
.createOrReplaceTempView("Score")
spark.sparkContext
.textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Teacher.csv")
.map(_.split(","))
.map(x => Teacher(x(0),x(1),x(2),x(3),x(4),x(5)))
.toDF()
.createOrReplaceTempView("Teacher")
//spark.sql("select * from Teacher").show()
// Query the sname, ssex and sclass columns of all records in the Student table
//spark.sql("select sname,ssex,sclass from Student").show()
// Query the distinct tdepart values in the Teacher table
//spark.sql("select distinct tdepart from Teacher").show(false)
//spark.sql("select tdepart from Teacher group by tdepart").show(false)
// Query all records in the Score table with a degree between 60 and 80
//spark.sql("select * from score where degree >=60 and degree <=80").show()
//spark.sql("select * from score where degree between 60 and 80").show()
// Query records in the Score table whose degree is 85, 86 or 88
//spark.sql("select * from score where degree = '85' or degree = '86' OR degree = '88'").show()
// Query students ordered by sclass descending, then ascending
//spark.sql("select * from student order by sclass desc").show()
//spark.sql("select * from student order by sclass").show()
// Query the Score table ordered by cno ascending and degree descending
//spark.sql("select * from score t order by t.cno asc, t.degree desc").show()
// Query the sno and cno of the student with the highest degree in the Score table
//spark.sql("select * from score order by cast(degree as int) desc limit 1").show()
//spark.sql("select * from score order by cast(degree as int) desc").show()
// Query the average degree of each course; the numeric aggregate casts the string column automatically
//spark.sql("select cno, avg(degree) from score group by cno").show()
// Query the average degree of courses taken by at least five students and whose course number starts with 3
//spark.sql("select cno,avg(degree) from score where cno like '3%' group by cno having count(cno) >= 5").show()
// Query sname, cname and degree for all students
// spark.sql("select s.sname, c.cname, t.degree from score t " +
// "join student s on t.sno = s.sno " +
// "join course c on c.cno = t.cno").show(false)
//spark.sql("select s.sname, c.cname, t.degree from score t, student s, course c where t.sno = s.sno and c.cno = t.cno").show(false)
// Query Score records of students taking more than one course, excluding the highest degree
// spark.sql("select * from score where " +
// "sno in (select sno from score t group by t.sno having count(cno) > 1) " +
// "and degree != (select max(degree) from score)").show()
// Query sno, sname and sbirthday of all students born in the same year as student 108
// spark.sql("select sno, sname, sbirthday from student where substring(sbirthday,0,4) = " +
// "(select substring(sbirthday,0,4) from student where sno = '108')").show()
// Query the names of teachers whose course is taken by more than five students
// spark.sql("select tname from teacher t, " +
// "(select cno from score group by cno having count(cno) > 5) s, " +
// "course c where " +
// "t.tno = c.tno and c.cno = s.cno").show()
// spark.sql("select tname from teacher t " +
// "join course c on t.tno = c.tno " +
// "join (select cno from score group by cno having count(cno) > 5) s on s.cno = c.cno").show()
// Query Score records whose degree is lower than the average degree of the same course
//spark.sql("select * from score s where s.degree < (select avg(degree) from score c where c.cno = s.cno)").show()
// Query the tname and tdepart of all teachers who have not taught any class
//spark.sql("select tname, tdepart from teacher t where t.tno not in (select tno from course c where c.cno in (select cno from score))").show(false)
// Query the class numbers that have at least two male students
//spark.sql("select sclass from student t where ssex = 'male' group by sclass having count(ssex) >= 2").show()
// Query the records of students whose surname is not Wang
//spark.sql("select * from student t where sname not like 'Wang%'").show()
// Query each student's name and age from the Student table
//spark.sql("select sname, (cast(" + getDate("yyyy") + " as int) - cast(substring(sbirthday,0,4) as int)) as age from student s").show()
spark.close()
}
}