具体练习数据可从Hive sql数据获取
#0.创建数据库
create database traindb;
use database triandb;
#1.建表 student表
create table student(sid varchar(10),sname varchar(10),sbirth date,ssex varchar(10))
partitioned by (sclass int)
row format delimited fields terminated by ','
lines terminated by '\n';
#1.1导入数据
#法1. load data local(默认为hdfs) inpath '数据地址' into(overwrite into) table 表名;
#into是追加到表尾 ,overwrite into是覆盖
#法2. 先用hadoop将数据传到hdfs表的存储地址下,然后修复表;
hdfs hfs -put '本地地址' 'hdfs:///'
msck repair table 表名;
load data local inpath '/home/sparknode/桌面/student1.txt' into table student partition(sclass=1);
load data local inpath '/home/sparknode/桌面/student2.txt' into table student partition(sclass=2);
load data local inpath '/home/sparknode/桌面/student3.txt' into table student partition(sclass=3);
# 表2 course表
create table course(cid varchar(10),cname varchar(10),tid varchar(10))
row format delimited fields terminated by ','
lines terminated by '\n';
load data local inpath '/home/sparknode/桌面/course.txt' into table course;
# 表3 teacher表
create table teacher(tid varchar(10),tname varchar(10))
row format delimited fields terminated by ','
lines terminated by '\n';
#直接用hadoop将数据put到hdfs上
hdfs dfs -put '/home/sparknode/桌面/teacher.txt' 'hdfs:///home/sparknode/hive/traindb.db/teacher/';
#此时只有hdfs中有数据,而mysql的元数据中并无该信息,因此在hive中修复一下表
msck repair table teacher;
# 表4 score表
create table score(sid varchar(10),cid varchar(10),score int)
row format delimited fields terminated by ','
lines terminated by '\n';
load data local inpath '/home/sparknode/桌面/score.txt' overwrite into table score;
hive 查询 基础篇,该部分与mysql一致,可参考sql面试50题
#***********练习*****基础版*******************
#一、模糊查询
#1.查询 李姓 学生信息
select sid,sname,sbirth,ssex,sclass
from student
where sname like '李%';
#2.查询姓名中带有 四 的学生信息
select sid,sname,sbirth,ssex,sclass
from student
where sname like '%四';
#3.查询姓名第二位为 四 的学生信息
select sid,sname,sbirth,ssex,sclass
from student
where sname like '_四%';
#二、汇总分析
#1.查询课程编号为“2”的总成绩
select sum(score) as sumScore
from score
where cid='2';
#2.查询选了课程的学生人数
select count(distinct(sid)) from score;
#3.查询课程编号为“2”的最高分
select max(score)
from score
where cid = '2';
#三、分组
#1.查询各科成绩的最高及最低分
select cid,max(score) as maxScore,min(score) as minScore
from score
group by cid;
#2.查询每门课程被选修的学生数
select cid, count(1) as num from score group by cid;
#3.查询男、女生人数
select ssex,count(1)
from student
group by ssex;
#四、分组+条件过滤
#1.查询平均成绩大于60分学生的学号和平均成绩
select sid,avg(score) as avgScore
from score
group by sid
having avgScore > 60;
#2.查询选修三门课的学生学号
select sid
from score
group by sid
having count(*) = 3;
#3.查询同名学生名单,并统计同名人数
select sname,count(*) as num
from student
group by sname
having num >1;
#四、复杂查询
#1.查询存在成绩小于60分的学生的学号、姓名
select sid,sname from student
where sid in
(select distinct(sid) from score where score<60);
select s.sid,s.sname
from student s
right join (select distinct(sid) from score where score<60) as tmp
on s.sid = tmp.sid;
#2.查询所有成绩均小于60的学生的学号,姓名(所有课全不及格的学生)
select s.sid,s.sname from student s right join (
select sid from score
group by sid
having sum(case when score>60 then 1 else 0 end) = 0 ) tmp
on s.sid =tmp.sid;
#3.查询没有学全所有课的学生的学号、姓名
select s.sid,s.sname from student s right join
(select sid from score
group by sid
having count(*) < 5 ) as tmp
on s.sid = tmp.sid;
分组 top-N问题 ****重点
#查询各科成绩前两名的记录(top-N问题)
#情景0.仅能用于没有重复的情况
select t1.cid,t1.score from score t1 left join
score t2
on t1.cid = t2.cid
where t1.score <= t2.score
group by t1.cid,t1.score
having count(t1.score)<=2;
#情景1.有并列 相同分数算并列
select t1.cid,t1.score from score t1 left join
score t2
on t1.cid = t2.cid and t1.score < t2.score
group by t1.cid,t1.score
having count(t2.score) <2;
#用on t1.cid = t2.cid 然后通过where过滤 where t1.score < t2.score,会将最高分直接过滤没
#而on t1.cid = t2.cid and t1.score < t2.score ,最高分则会连接到Null,
#或使用dense_rank()、rank()、row_number() + 窗口函数 over()
#1. dense_rank() 相当于并列
select cid,score,t.rank from
(select cid,score,dense_rank()
over(partition by cid order by score desc)
as rank
from score) t
where t.rank <=2;
#结果
cid score t.rank
1 100 1
1 99 2
2 100 1
2 99 2
3 100 1
3 100 1
3 100 1
3 100 1
3 98 2
4 97 1
4 90 2
5 100 1
5 96 2
#2.rank() 也并列,但是并列的人占了不止一个名额
select cid,score,t.rank from
(select cid,score,rank()
over(partition by cid order by score desc)
as rank
from score) t
where t.rank <=2;
#结果
cid score t.rank
1 100 1
1 99 2
2 100 1
2 99 2
3 100 1
3 100 1
3 100 1
3 100 1
4 97 1
4 90 2
5 100 1
5 96 2
#3.row_number() 无并列
select cid,score,t.rank from
(select cid,score,row_number()
over(partition by cid order by score desc)
as rank
from score) t
where t.rank <=2;
#结果
cid score t.rank
1 100 1
1 99 2
2 100 1
2 99 2
3 100 1
3 100 2
4 97 1
4 90 2
5 100 1
5 96 2
行列互换问题
#1.某个字段的多行合并为该字段一行
#统计每个学生选择的课程,并存为一张表(每个学生占用一行)
create table elective_info(sid varchar(10),sname varchar(10),cname varchar(100));
insert into table elective_info
select sid,sname,concat_ws("|",collect_set(cname)) as course from (select s.sid,st.sname,c.cname from score s left join
course c on
s.cid = c.cid
left join
student st
on s.sid = st.sid) t
group by sid,sname;
concat(字段1,...,字段n) 可拼接多个字段(必须为string类型),也可拼接任何string类型
concat_ws(分隔符,字段1,...,字段n)
concat和concat_ws的区别:如用'|'为分隔符,拼接字段'a'、'b'、'c',操作为
concat(a,'|',b,'|')
concat_ws('|',a,b,c)
且concat_ws可拼接array中的字段
collect_set(字段),可将字段去重,然后将结果已array形式返回
结合collect_set+concat_ws+group by 就可完成按key将同key中其它字段合并为一行的操作
elective_info.sid elective_info.sname elective_info.cname
1 赵雷 语文|数学|英语|政治
10 李四 语文|数学|英语|政治
11 李四 语文
12 赵六 语文
13 孙七 数学
14 郑双 语文|数学|英语|政治|历史
15 王一 语文|数学|英语|政治|历史
16 冯二 语文|数学|英语|政治|历史
17 陈三 语文|数学|英语|政治|历史
2 钱电 语文|数学|英语|历史
3 孙风 语文|数学|英语|政治|历史
4 李云 语文|数学|英语|政治|历史
5 周梅 语文|数学|英语|历史
6 吴兰 语文|数学|历史
7 郑竹 语文|数学|英语|政治
8 梅梅 数学|英语|政治
9 张三 语文|数学|政治
#2.某个字段拆分为多行
#将上述表格还原为 类似
sid sname t.cname
1 赵雷 语文
1 赵雷 数学
1 赵雷 英语
1 赵雷 政治
10 李四 语文
10 李四 数学
10 李四 英语
10 李四 政治
11 李四 语文
select sid,sname, t.cname
from elective_info
lateral view explode(split(cname,"\\|")) t as cname;
explode(col) 其中col为array类型,可将每一行的array合并,并以列的形式返回
lateral view 为侧写
待续...