hive sql练习

最新推荐文章于 2024-08-27 11:35:12 发布

原创最新推荐文章于 2024-08-27 11:35:12 发布 · 616 阅读

9 ·

CC 4.0 BY-SA版权

文章标签：

#hive #sql #hive练习

Hive 专栏收录该内容

5 篇文章

订阅专栏

具体练习数据可从Hive sql数据获取

#0.创建数据库
create database traindb;

use database triandb;

#1.建表 student表
create table student(sid varchar(10),sname varchar(10),sbirth date,ssex varchar(10))
partitioned by (sclass int)
row format delimited fields terminated by ','
lines terminated by '\n';

#1.1导入数据
#法1. load data local(默认为hdfs) inpath '数据地址' into(overwrite into) table 表名;
#into是追加到表尾 ，overwrite into是覆盖
#法2. 先用hadoop将数据传到hdfs表的存储地址下，然后修复表;
     hdfs hfs -put '本地地址' 'hdfs:///'
     msck repair table 表名;
load data local inpath '/home/sparknode/桌面/student1.txt' into table student partition(sclass=1);
load data local inpath '/home/sparknode/桌面/student2.txt' into table student partition(sclass=2);
load data local inpath '/home/sparknode/桌面/student3.txt' into table student partition(sclass=3);

#  表2 course表
create table course(cid varchar(10),cname varchar(10),tid varchar(10))
row format delimited fields terminated by ','
lines terminated by '\n';

load data local inpath '/home/sparknode/桌面/course.txt' into table course;
  
#  表3 teacher表
create table teacher(tid varchar(10),tname varchar(10))
row format delimited fields terminated by ','
lines terminated by '\n';
#直接用hadoop将数据put到hdfs上
hdfs dfs -put '/home/sparknode/桌面/teacher.txt' 'hdfs:///home/sparknode/hive/traindb.db/teacher/';
#此时只有hdfs中有数据，而mysql的元数据中并无该信息，因此在hive中修复一下表
msck repair table teacher;

#  表4 score表
create table score(sid varchar(10),cid varchar(10),score int)
row format delimited fields terminated by ','
lines terminated by '\n';

load data local inpath '/home/sparknode/桌面/score.txt' overwrite into table score;

hive 查询基础篇，该部分与mysql一致，可参考sql面试50题

#***********练习*****基础版*******************

#一、模糊查询
#1.查询 李姓 学生信息
select sid,sname,sbirth,ssex,sclass 
from student 
where sname like '李%';

#2.查询姓名中带有 四 的学生信息
select sid,sname,sbirth,ssex,sclass 
from student 
where sname like '%四';

#3.查询姓名第二位为 四 的学生信息
select sid,sname,sbirth,ssex,sclass 
from student 
where sname like '_四%';

#二、汇总分析
#1.查询课程编号为“2”的总成绩
select sum(score) as sumScore 
from score 
where cid='2';
#2.查询选了课程的学生人数
select count(distinct(sid)) from score;
#3.查询课程编号为“2”的最高分
select max(score) 
from score 
where cid = '2';

#三、分组
#1.查询各科成绩的最高及最低分
select cid,max(score) as maxScore,min(score) as minScore 
from score 
group by cid;

#2.查询每门课程被选修的学生数
select cid, count(1) as num from score group by cid;

#3.查询男、女生人数
select ssex,count(1) 
from student
group by ssex;


#四、分组+条件过滤
#1.查询平均成绩大于60分学生的学号和平均成绩
select sid,avg(score) as avgScore 
from score
group by sid
having avgScore > 60;

#2.查询选修三门课的学生学号
select sid 
from score
group by sid
having count(*) = 3;

#3.查询同名学生名单，并统计同名人数
select sname,count(*) as num
from student
group by sname
having num >1;

#四、复杂查询
#1.查询存在成绩小于60分的学生的学号、姓名
select sid,sname from student 
where sid in 
(select distinct(sid) from score where score<60);

select s.sid,s.sname 
from student s 
right join (select distinct(sid) from score where score<60) as tmp 
on s.sid = tmp.sid;

#2.查询所有成绩均小于60的学生的学号，姓名(所有课全不及格的学生)
select s.sid,s.sname from student s right join (
select sid from score
group by sid
having  sum(case when score>60 then 1 else 0 end) = 0 ) tmp
on s.sid =tmp.sid;

#3.查询没有学全所有课的学生的学号、姓名
select s.sid,s.sname from student s right join 
(select sid from score
group by sid
having count(*) < 5 ) as tmp
on s.sid = tmp.sid;

分组 top-N问题 ****重点

#查询各科成绩前两名的记录(top-N问题)

#情景0.仅能用于没有重复的情况
select t1.cid,t1.score from score t1 left join 
score t2 
on t1.cid = t2.cid
where t1.score <= t2.score
group by t1.cid,t1.score
having count(t1.score)<=2;

#情景1.有并列  相同分数算并列
select t1.cid,t1.score from score t1 left join 
score t2 
on t1.cid = t2.cid and t1.score < t2.score
group by t1.cid,t1.score
having count(t2.score) <2;

#用on t1.cid = t2.cid  然后通过where过滤 where t1.score < t2.score,会将最高分直接过滤没
#而on t1.cid = t2.cid and t1.score < t2.score ,最高分则会连接到Null,



#或使用dense_rank()、rank()、row_number()  + 窗口函数 over()
#1. dense_rank()  相当于并列

select cid,score,t.rank from 
(select cid,score,dense_rank() 
over(partition by cid order by score desc)  
as rank 
from score) t 
where t.rank <=2;


#结果
cid	score	t.rank
1	100	1
1	99	2
2	100	1
2	99	2
3	100	1
3	100	1
3	100	1
3	100	1
3	98	2
4	97	1
4	90	2
5	100	1
5	96	2

#2.rank()  也并列，但是并列的人占了不止一个名额

select cid,score,t.rank from 
(select cid,score,rank() 
over(partition by cid order by score desc)  
as rank 
from score) t 
where t.rank <=2;

#结果
cid	score	t.rank
1	100	1
1	99	2
2	100	1
2	99	2
3	100	1
3	100	1
3	100	1
3	100	1
4	97	1
4	90	2
5	100	1
5	96	2

#3.row_number()  无并列
select cid,score,t.rank from 
(select cid,score,row_number() 
over(partition by cid order by score desc)  
as rank 
from score) t 
where t.rank <=2;

#结果
cid	score	t.rank
1	100	1
1	99	2
2	100	1
2	99	2
3	100	1
3	100	2
4	97	1
4	90	2
5	100	1
5	96	2

行列互换问题

#1.某个字段的多行合并为该字段一行
#统计每个学生选择的课程，并存为一张表(每个学生占用一行)
create table elective_info(sid varchar(10),sname varchar(10),cname varchar(100));
insert into table elective_info 
select sid,sname,concat_ws("|",collect_set(cname)) as course from (select s.sid,st.sname,c.cname from score s left join 
course c on
s.cid = c.cid
left join 
student st 
on s.sid = st.sid) t
group by sid,sname;

concat(字段1,...,字段n)  可拼接多个字段（必须为string类型），也可拼接任何string类型
concat_ws(分隔符,字段1,...,字段n）

concat和concat_ws的区别：如用'|'为分隔符，拼接字段'a'、'b'、'c'，操作为
                        concat(a,'|',b,'|')
                        concat_ws('|',a,b,c)
                       且concat_ws可拼接array中的字段
collect_set(字段)，可将字段去重，然后将结果已array形式返回

结合collect_set+concat_ws+group by 就可完成按key将同key中其它字段合并为一行的操作

elective_info.sid	elective_info.sname	elective_info.cname
1	赵雷	语文|数学|英语|政治
10	李四	语文|数学|英语|政治
11	李四	语文
12	赵六	语文
13	孙七	数学
14	郑双	语文|数学|英语|政治|历史
15	王一	语文|数学|英语|政治|历史
16	冯二	语文|数学|英语|政治|历史
17	陈三	语文|数学|英语|政治|历史
2	钱电	语文|数学|英语|历史
3	孙风	语文|数学|英语|政治|历史
4	李云	语文|数学|英语|政治|历史
5	周梅	语文|数学|英语|历史
6	吴兰	语文|数学|历史
7	郑竹	语文|数学|英语|政治
8	梅梅	数学|英语|政治
9	张三	语文|数学|政治

#2.某个字段拆分为多行
#将上述表格还原为 类似
sid	sname	t.cname
1	赵雷	语文
1	赵雷	数学
1	赵雷	英语
1	赵雷	政治
10	李四	语文
10	李四	数学
10	李四	英语
10	李四	政治
11	李四	语文

select sid,sname, t.cname 
from elective_info 
lateral view explode(split(cname,"\\|")) t as cname;

explode(col) 其中col为array类型，可将每一行的array合并，并以列的形式返回
lateral view 为侧写

待续...