1. 准备数据:
score.csv数据如下:
01,01,80
01,02,90
01,03,99
02,01,70
02,02,60
02,03,80
03,01,80
03,02,80
03,03,80
04,01,50
04,02,30
04,03,20
05,01,76
05,02,87
06,01,31
06,03,34
07,02,89
07,03,98
student.csv 数据如下:
01,赵雷,1990-01-01,男
02,钱电,1990-12-21,男
03,孙风,1990-05-20,男
04,李云,1990-08-06,男
05,周梅,1991-12-01,女
06,吴兰,1992-03-01,女
07,郑竹,1989-07-01,女
08,王菊,1990-01-20,女
teacher.csv 数据如下:
01,张三
02,李四
03,王五
course.csv 数据如下:
01,语文,02
02,数学,01
03,英语,03
2.将数据导入hive中:
package com.spark.self
import org.apache.spark.sql.SparkSession
object buildRegionIntoHive {
def main(args: Array[String]): Unit = {
val warehouseLocation = "spark-warehouse路劲"
val spark = SparkSession
.builder()
.appName("Spark Hive Example")
.config("spark.sql.warehouse.dir", warehouseLocation)
.master("local[*]")
.enableHiveSupport()
.getOrCreate()
import spark.sql
sql("drop table student")
sql("create table if not exists student(s_id string,s_name string,s_birth string,s_sex string)row format delimited fields terminated by ','")
sql("load data local inpath 'in/student.csv' into table student")
sql("SELECT * FROM student").show()
sql("drop table course")
sql("create table if not exists course(c_id string,c_name string,t_id string)row format delimited fields terminated by ','")
sql("load data local inpath 'in/course.csv' into table course")
sql("SELECT * FROM course").show()
sql("drop table teacher")
sql("create table if not exists teacher(t_id string,t_name string)row format delimited fields terminated by ','")
sql("load data local inpath 'in/teacher.csv' into table teacher")
sql("SELECT * FROM teacher").show()
sql("drop table score")
sql("create table if not exists score(s_id string,c_id string,s_score int)row format delimited fields terminated by ','")
sql("load data local inpath 'in/score.csv' into table score")
sql("SELECT * FROM score").show()
}
}
3.进行hive sql的编写:代码如下:
package com.spark.self
import org.apache.spark.sql.SparkSession
/**
* student : s_id ,s_name ,s_birth ,s_sex
* course : c_id ,c_name ,t_id
* teacher : t_id ,t_name
* score : s_id ,c_id ,s_score
*/
object HiveSqlTest {
def main(args: Array[String]): Unit = {
val warehouseLocation = "spark-warehouse路劲"
val spark = SparkSession
.builder()
.appName("Spark Hive Example")
.config("spark.sql.warehouse.dir", warehouseLocation)
.master("local[*]")
.enableHiveSupport()
.getOrCreate()
import spark.sql
//查询"01"课程比"02"课程成绩高的学生的信息及课程分数
sql("select student.*,a.s_score as score01,b.s_score as score02 from student " +
"join score a on student.s_id = a.s_id and a.c_id='01' " +
"join score b on student.s_id = b.s_id and b.c_id='02' " +
"where a.s_score>b.s_score").show()
//查询平均成绩大于等于60分的同学的学生编号和学生姓名和平均成绩
sql("select student.s_id,student.s_name,score_avg.avgScore from student " +
"join (select s_id,avg(s_score) as avgScore from score group by s_id) as score_avg " +
"on score_avg.s_id = student.s_id " +
"where score_avg.avgScore>=60").show()
//查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩
sql("select student.s_id,student.s_name,score1.count,score1.sum from student " +
"join (select s_id,count(s_id) as count,sum(s_score) as sum from score group by s_id) as score1 " +
"where score1.s_id=student.s_id").show()
//查询"李"姓老师的数量
// sql("select count(*) from teacher where t_name like '李%'").show()
//查询学过"张三"老师授课的同学的信息
sql("select student.* from student " +
"join score on student.s_id =score.s_id " +
"join course on course.c_id=score.c_id " +
"join teacher on course.t_id=teacher.t_id and t_name='张三'").show()
//查询没学过"张三"老师授课的同学的信息
sql("select student.* from student " +
"left join (select s_id from score " +
"join course on course.c_id=score.c_id " +
"join teacher on course.t_id=teacher.t_id and t_name='张三')tmp " +
"on student.s_id =tmp.s_id " +
"where tmp.s_id is null").show()
//第一步:得到所有学过张三老师课的同学
sql("select score.s_id,score.c_id,c.c_name,c.t_id,t.t_name from score " +
"join course c on c.c_id = score.c_id " +
"join teacher t on t.t_id = c.t_id and t_name='张三'").show()
//得到结果如下:
/**
* +----+----+------+----+------+
* |s_id|c_id|c_name|t_id|t_name|
* +----+----+------+----+------+
* | 01| 02| 数学| 01| 张三|
* | 02| 02| 数学| 01| 张三|
* | 03| 02| 数学| 01| 张三|
* | 04| 02| 数学| 01| 张三|
* | 05| 02| 数学| 01| 张三|
* | 07| 02| 数学| 01| 张三|
* +----+----+------+----+------+
*/
//第二步:使用学生表与 第一步查询出的结果做left join
sql("select student.*,tmp.* from student " +
"left join (select score.s_id,score.c_id,c.c_name,c.t_id,t.t_name from score " +
"join course c on c.c_id = score.c_id " +
"join teacher t on t.t_id = c.t_id and t_name='张三') tmp " +
"on tmp.s_id = student.s_id").show()
//得到结果如下:
/**
* +----+------+----------+-----+----+----+------+----+------+
* |s_id|s_name| s_birth|s_sex|s_id|c_id|c_name|t_id|t_name|
* +----+------+----------+-----+----+----+------+----+------+
* | 01| 赵雷|1990-01-01| 男| 01| 02| 数学| 01| 张三|
* | 02| 钱电|1990-12-21| 男| 02| 02| 数学| 01| 张三|
* | 03| 孙风|1990-05-20| 男| 03| 02| 数学| 01| 张三|
* | 04| 李云|1990-08-06| 男| 04| 02| 数学| 01| 张三|
* | 05| 周梅|1991-12-01| 女| 05| 02| 数学| 01| 张三|
* | 06| 吴兰|1992-03-01| 女|null|null| null|null| null|
* | 07| 郑竹|1989-07-01| 女| 07| 02| 数学| 01| 张三|
* | 08| 王菊|1990-01-20| 女|null|null| null|null| null|
* +----+------+----------+-----+----+----+------+----+------+
*/
//第三步;
sql("select student.* from student " +
"left join (select score.s_id,score.c_id,c.c_name,c.t_id,t.t_name from score " +
"join course c on c.c_id = score.c_id " +
"join teacher t on t.t_id = c.t_id and t_name='张三') tmp " +
"on tmp.s_id = student.s_id " +
"where tmp.s_id is null").show()
//得到结果如下:
/**
* +----+------+----------+-----+
* |s_id|s_name| s_birth|s_sex|
* +----+------+----------+-----+
* | 06| 吴兰|1992-03-01| 女|
* | 08| 王菊|1990-01-20| 女|
* +----+------+----------+-----+
*/
//查询学过编号为"01"并且也学过编号为"02"的课程的同学的信息
//方法一:同时学过01和02的课程找出来,计算数量
sql("select student.*,tmp.* from student " +
"left join (select s_id,count(*) as number from score where c_id='01' or c_id='02' group by s_id) tmp " +
"on student.s_id=tmp.s_id " +
"where tmp.number=2").show()
//方法二:分别查出01课程和02课程然后做join
sql(" select student.* from student " +
"join (select s_id from score where c_id ='01' )tmp1 " +
"on student.s_id=tmp1.s_id " +
"join (select s_id from score where c_id ='02' )tmp2 " +
"on student.s_id=tmp2.s_id"
).show()
//查询学过编号为"01"但是没有学过编号为"02"的课程的同学的信息:
sql("select * from score where c_id='01'").show()
sql("select * from score where c_id='02'").show()
sql("select student.*,tmp1.c_id as c_id_01 ,tmp2.c_id as c_id_02 from student " +
"left join (select * from score where c_id='01') tmp1 " +
"on student.s_id=tmp1.s_id " +
"left join (select * from score where c_id='02') tmp2 " +
"on student.s_id=tmp2.s_id " +
"where tmp1.c_id='01' and tmp2.c_id is null")
.show()
//查询没有学全所有课程的同学的信息
//1. 先查询有多少们课程
sql("select count(*) from course").show()
//2. 查询课程数量不等于课程总数的学生
sql("select s_id,count(*) from score group by s_id having count(c_id)=(select count(*) from course)").show()
//3.查询学生信息
sql("select student.* from student " +
"join (select s_id,count(*) from score group by s_id having count(c_id)=(select count(*) from course)) tmp " +
"on tmp.s_id=student.s_id").show()
//查询和"01"号的同学学习的课程完全相同的其他同学的信息 concat_ws
sql("select concat_ws('|',collect_set(c_id)) as c_id_con from score where s_id='01'").show()
sql("select s_id,concat_ws('|',collect_set(c_id)) as c_id_con from score group by s_id having c_id_con=(select concat_ws('|',collect_set(c_id)) as c_id_con from score where s_id='01')").show()
sql("select student.* from student " +
"join (select s_id,concat_ws('|',collect_set(c_id)) as c_id_con from score group by s_id having c_id_con=(select concat_ws('|',collect_set(c_id)) as c_id_con from score where s_id='01')) tmp " +
"on tmp.s_id=student.s_id").show()
// 按平均成绩从高到低显示所有学生的所有课程的成绩以及平均成绩:
sql("select a.s_id,tmp1.s_score as chinese,tmp2.s_score as math,tmp3.s_score as english,round(avg (a.s_score),2) as avgScore from score a " +
"left join (select s_id,s_score from score s1 where c_id='01')tmp1 on tmp1.s_id=a.s_id " +
"left join (select s_id,s_score from score s2 where c_id='02')tmp2 on tmp2.s_id=a.s_id " +
"left join (select s_id,s_score from score s3 where c_id='03')tmp3 on tmp3.s_id=a.s_id " +
"group by a.s_id,tmp1.s_score,tmp2.s_score,tmp3.s_score order by avgScore desc").show()
//查询本月过生日得学生
sql("select * from student where MONTH(CURRENT_DATE)= MONTH(s_birth)").show()
//查询12月份过生日得学生 substring(),截取字符串,(第一个参数:需要截取得字段,从第几位开始截取,截取几位)
sql("select s_name,s_sex,s_birth from student where substring(s_birth,6,2)='12'").show()
sql("select student.*,substring(s_birth,1,4) as birthday from student where substring(s_birth,6,2)='12'").show()
}
}