hive的执行模式
1、 客户端(也就是黑窗口)
2、 hive -e ‘hql’
3、 hive -f sql.hql(指定一个脚本执行)
hive -S -f sql.hql 没有提示信息 (静默模式)
hive-help (linux下的命令,查看hive的帮助文档)
-- linux下的命令,查看hive的使用帮助
hive -help
usage: hive
-d,--define <key=value> Variable subsitution to apply to hive
commands. e.g. -d A=B or --define A=B
--database <databasename> Specify the database to use
-e <quoted-query-string> SQL from command line
-f <filename> SQL from files
-H,--help Print help information
--hiveconf <property=value> Use value for given property
--hivevar <key=value> Variable subsitution to apply to hive
commands. e.g. --hivevar A=B
-i <filename> Initialization SQL file
-S,--silent Silent mode in interactive shell
-v,--verbose Verbose mode (echo executed SQL to the
console)
查询语法
语句执行顺序
查询规则
-- 规则
-- 尽量不使用子查询、不使用in not in exists not exists
SELECT
*
from t1
where id in (select id from t2 where .... )
;
SELECT
*
from t1
where exists (select 1 as column_1 from t2 where t2.id = t1.id )
;
-- 查询尽量避免join,但是无可避免
-- 中间表
-- 查询永远都是小表驱动大表
--- 学生选修了所有课程
123
course
score
-- 学生在成绩表里存在每一门课程的成绩
exists
select
*
from stu
where exists (select 1 from course left join score on cid = cid and sid = sid )
select id from score where sid =''
;
join
-- JOIN
-- 实体和关系
-- 连接JOIN
-- 表的连接分为:
-- 内连接 INNER JOIN
-- 外连接 OUTER JOIN
-- 左连接 LEFT JOIN
-- 右连接 RIGHT JOIN
-- 全连接 FULL JOIN
-- 笛卡尔积 CROSS JOIN
-- 特殊类型
-- LEFT SEMI JOIN
-- JOIN的类型
-- LEFT JOIN \LEFT OUTER JOIN \ RIGHT JOIN \ RIGHT OUTER JOIN \ INNER JOIN \ FULL OUTER JOIN
-- 测试数据
a表
1,a1
2,a2
3,a3
4,a4
5,a5
6,a6
7,a7
8,a8
b表
2,liushuai
4,gaoyuanyuan
5,jiajingwen
7,gaogao
9,hetu
CREATE TABLE IF NOT EXISTS sz2002.u1(
id int,
name string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
;
CREATE TABLE IF NOT EXISTS sz2002.u2(
id int,
name string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
;
-- 加载数据
LOAD DATA LOCAL INPATH '/hivedata/u1.dat' INTO TABLE sz2002.u1;
LOAD DATA LOCAL INPATH '/hivedata/u2.dat' INTO TABLE sz2002.u2;
SELECT
*
FROM sz2002.u2
;
-- 内连接
-- INNER JOIN
-- ,
-- JOIN
-- 以上3种格式,结果都差不多,必须相互间能连接上才能返回结果
set hive.exec.mode.local.auto=true;
SELECT
*
FROM sz2002.u1
INNER JOIN sz2002.u2
ON u1.id = u2.id
;
SELECT
*
FROM sz2002.u1,sz2002.u2
WHERE u1.id = u2.id
;
-- JOIN 不加任何ON条件或者WHERE条件时,称之为笛卡尔积
-- LEFT JOIN \ LEFT OUTER JOIN
-- 数据以左表为基准,左表存在的数据都能查询出,右表的数据如果能连接上就能查询出,如果连接不上则以NULL来代替
SELECT
*
FROM sz2002.u1
LEFT OUTER JOIN sz2002.u2
ON u1.id = u2.id
;
-- 场景:A表有结果,但是B表为Null
SELECT
*
FROM sz2002.u1
LEFT OUTER JOIN sz2002.u2
ON u1.id = u2.id
WHERE u2.id IS NULL
;
SELECT
*
FROM sz2002.u1
WHERE NOT EXISTS (SELECT 1 FROM sz2002.u2 WHERE u2.id =u1.id)
;
-- RIGHT JOIN \ RIGHT OUTER JOIN
-- 以右表为准,右表的结果全都能查询出,然后匹配左表的信息,如果左表能匹配上,则返回左表的信息,否则以NULL来代替左表
SELECT
u1.*,u2.*
FROM sz2002.u1
RIGHT OUTER JOIN sz2002.u2
ON u1.id = u2.id
;
SELECT
u1.*,u2.*
FROM sz2002.u2
LEFT OUTER JOIN sz2002.u1
ON u1.id = u2.id
;
-- FULL OUTER JOIN
-- 相互进行连接,左表累加右表,如果某一张表上的数据连接不上,则使用NULL来代替
SELECT
u1.*,u2.*
FROM sz2002.u2
FULL OUTER JOIN sz2002.u1
ON u1.id = u2.id
;
-- LEFT SEMI JOIN
-- semi jojn 也叫半开连接,通常是left join的一种优化,只能查询出左表的信息,主要用来判断左表是否存在
SELECT
*
FROM sz2002.u1
LEFT SEMI JOIN sz2002.u2
ON u1.id = u2.id
;
SELECT
u1.*
FROM sz2002.u1
INNER JOIN sz2002.u2
ON u1.id = u2.id
;
-- exists
SELECT
*
FROM sz2002.u1
WHERE EXISTS (SELECT 1 FROM sz2002.u2 WHERE U2.ID = U1.ID)
;
-- hive提供小表标识:使用STREAMTABLE(小表的别名)
SELECT
/*+STREAMTABLE(d)*/
*
FROM sz2002.u1 e
INNER JOIN sz2002.u2 d
ON e.id = d.id
;
-- 子查询:hive对子查询支持不是很友好,特别是=问题很多
SELECT
*
FROM sz2002.u1
WHERE id = (SELECT id FROM sz2002.u2 WHERE id = 3)
;
-- INNER JOIN 和 OUTER JOIN的区别
-- 分区字段对OUTER JOIN 的ON条件无效,对INNER JOIN 中的ON条件有效
-- 有INNER JOIN 但是没有FULL INNER JOIN
-- 有FULL OUTER JOIN 但是没有OUTER JOIN
-- 所有的JOIN连接,都只支持等值连接(=和and) 不支持!= > < ... or
-- map-side join
-- 会将小表缓存到内存中,然后在map端就进行关联查找。hive在map查找时会减少查询量,减少大量的shuffle过程和数据传输的时间
set hive.auto.convert.join = true;
-- 以前的版本,可以添加/*+MAPJOIN(小表别名)*/来标识map端JOIN
-- 到底多大的表才能称为小表
set hive.mapjoin.smalltable.filesize=25000000;
-- 约等于23.8M
-- WHERE:后面通常是表达式,还可以是非聚合函数表达式,不能跟聚合函数
SELECT
COUNT(1)
FROM sz2002.u1
HAVING COUNT(1) > 2
;
-- GROUP BY 分组,通常与聚合函数一起搭配使用
-- 查询的字段要么出现在GROUP BY之后,要么出现在聚合函数之内
SELECT id,name,count(1)
FROM sz2002.u1
GROUP BY id ,name
;
-- HAVING:对分组之后的聚合的结果进行过滤
-- 可以使用别名,是别名所代表的计算只能是一层的计算
-- HAVING之后的过滤字段必须出现在SELECT列表中
SELECT
id,count(1)
FROM sz2002.u1
GROUP BY id
HAVING id = 1
;
-- ORDER BY :全局排序,保证所有reduce中的数据有序
-- SORT BY :局部排序,只保证单个reduce中的数据有序
-- 如果reduce的个数只有1个,两者之间无差别
-- 两者都通常和ASC|DESC搭配,默认使用ASC
-- 设置于reduce的数量
set mapreduce.job.reduces=-1;
-- -1默认是根据最终的数据量来确定reduce的数量
-- 256M
-- 只要使用order by,reduce的个数永远是一个
-- LIMIT :从结果集中取出数据的条数
-- UNION : 将多个结果集合并,去重加排序
-- UNION ALL : 将多个结果集合并,不去重不排序
-- 如果数据集中能确定没有重复的数据,尽量使用UNION ALL
--- 练习
CREATE TABLE IF NOT EXISTS sz2002.course(
c_id VARCHAR(20),
c_name VARCHAR(20),
t_id VARCHAR(20))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
;
CREATE TABLE IF NOT EXISTS sz2002.student(
s_id VARCHAR(20),
s_name VARCHAR(20),
s_birth VARCHAR(20),
s_sex VARCHAR(10)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
;
CREATE TABLE IF NOT EXISTS sz2002.teacher(
t_id VARCHAR(20),
t_name VARCHAR(20)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
;
CREATE TABLE IF NOT EXISTS sz2002.score(
s_id VARCHAR(20),
c_id VARCHAR(20),
s_score int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
;
-- 加载数据
LOAD DATA LOCAL INPATH '/hivedata/course.txt' INTO TABLE sz2002.course;
LOAD DATA LOCAL INPATH '/hivedata/stu.txt' INTO TABLE sz2002.student;
LOAD DATA LOCAL INPATH '/hivedata/teacher.txt' INTO TABLE sz2002.teacher;
LOAD DATA LOCAL INPATH '/hivedata/score.txt' INTO TABLE sz2002.score;
-- 查看数据
SELECT
*
FROM sz2002.score
;
-- 输入目录 -- 表
-- 输出字段 -- 需求
-- 条件
-- 1、查询"01"课程比"02"课程成绩高的学生的信息及课程分数
SELECT
stu.s_id,
stu.s_name,
stu.s_birth,
stu.s_sex,
cs.c_name,
sc.s_score
FROM
(
SELECT
*
FROM sz2002.score
WHERE c_id = '01'
) sc01
INNER JOIN
(
SELECT
*
FROM sz2002.score
WHERE c_id = '02'
) sc02
ON sc01.s_id = sc02.s_id
JOIN sz2002.student stu
ON stu.s_id = sc01.s_id
JOIN sz2002.score sc
ON sc.s_id = stu.s_id
JOIN sz2002.course cs
ON cs.c_id = sc.c_id
WHERE sc01.s_score > sc02.s_score
;
-- 去掉子查询
SELECT
stu.s_id,
stu.s_name,
stu.s_birth,
stu.s_sex,
cs.c_name,
sc.s_score
FROM sz2002.student stu
JOIN sz2002.score sc01
ON sc01.c_id = '01' AND stu.s_id = sc01.s_id
LEFT JOIN sz2002.score sc02
ON sc02.c_id = '02' AND stu.s_id = sc02.s_id
JOIN sz2002.score sc
ON sc.s_id = stu.s_id
JOIN sz2002.course cs
ON cs.c_id = sc.c_id
WHERE sc01.s_score > sc02.s_score OR sc02.s_score IS NULL
;
SELECT
FROM
JOIN
SELECT
*
FROM sz2002.student stu
JOIN sz2002.score sc01
ON sc01.c_id = '01' AND stu.s_id = sc01.s_id
LEFT JOIN sz2002.score sc02
ON sc02.c_id = '02' AND stu.s_id = sc02.s_id
WHERE sc01.s_score > sc02.s_score OR sc02.s_score IS NULL
;
SELECT
sc01.s_id
FROM sz2002.score sc01
WHERE c_id = '01' AND EXISTS
(
SELECT 1
FROM sz2002.score sc02
WHERE sc02.c_id = '02' AND sc02.s_id = sc01.s_id
) AND sc01.s_score = sc02.s_score
;
-- 查询"01"课程比"02"课程成绩低的学生的信息及课程分数:
SELECT
stu.s_id,
stu.s_name,
stu.s_birth,
stu.s_sex,
cs.c_name,
sc.s_score
FROM sz2002.student stu
JOIN sz2002.score sc01
ON sc01.c_id = '01' AND stu.s_id = sc01.s_id
RIGHT JOIN sz2002.score sc02
ON sc02.c_id = '02' AND stu.s_id = sc02.s_id
JOIN sz2002.score sc
ON sc.s_id = stu.s_id
JOIN sz2002.course cs
ON cs.c_id = sc.c_id
WHERE sc01.s_score < sc02.s_score OR sc01.s_score IS NULL
;
-- 查询平均成绩大于等于60分的同学的学生编号和学生姓名和平均成绩:
SELECT
stu.s_id,
stu.s_name,
avg(sc.s_score) avg_score
FROM sz2002.student stu
JOIN sz2002.score sc
ON sc.s_id = stu.s_id
GROUP BY
stu.s_id,
stu.s_name
HAVING avg_score >= 60
;
-- 查询平均成绩小于60分的同学的学生编号和学生姓名和平均成绩:
SELECT
stu.s_id,
stu.s_name,
avg(sc.s_score) avg_score
FROM sz2002.student stu
JOIN sz2002.score sc
ON sc.s_id = stu.s_id
GROUP BY
stu.s_id,
stu.s_name
HAVING avg_score < 60
UNION ALL
-- 学生没有选修课程的情况
SELECT
stu.s_id,
stu.s_name,
NULL avg_score
FROM sz2002.student stu
WHERE NOT EXISTS
(SELECT 1 FROM sz2002.score sc WHERE sc.s_id = stu.s_id)
;
SELECT
stu.s_id,
stu.s_name,
NULL avg_score
FROM sz2002.student stu
LEFT JOIN sz2002.score sc
ON sc.s_id = stu.s_id
WHERE sc.s_id IS NULL
;
SELECT
stu.s_id,
stu.s_name,
avg(nvl(sc.s_score,0)) avg_score
FROM sz2002.student stu
LEFT JOIN sz2002.score sc
ON sc.s_id = stu.s_id
GROUP BY
stu.s_id,
stu.s_name
HAVING avg_score < 60
;
-- 查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩:
SELECT
stu.s_id,
stu.s_name,
count(*) as cnt1,
count(sc.c_id) as cnt2,
count(1) cnt3,
sum(nvl(sc.s_score,0)) sum_score
FROM sz2002.student stu
LEFT JOIN sz2002.score sc
ON sc.s_id = stu.s_id
GROUP BY
stu.s_id,
stu.s_name
;
SELECT
*,
1 as custom_column
FROM sz2002.student stu
LEFT JOIN sz2002.score sc
ON sc.s_id = stu.s_id
;
-- 查询学过"张三"老师授课的同学的信息:
SELECT
stu.s_id,
stu.s_name
FROM sz2002.student stu
JOIN sz2002.score sc
ON sc.s_id = stu.s_id
JOIN sz2002.course cs
ON sc.c_id = cs.c_id
JOIN sz2002.teacher tea
ON cs.t_id = tea.t_id
WHERE tea.t_name = '张三'
GROUP BY
stu.s_id,
stu.s_name
;
-- EXISTS
SELECT
stu.*
FROM sz2002.student stu
WHERE EXISTS
(SELECT 1
FROM sz2002.score sc
JOIN sz2002.course cs
ON sc.c_id = cs.c_id
JOIN sz2002.teacher tea
ON tea.t_name = '张三' AND tea.t_id = cs.t_id
WHERE sc.s_id = stu.s_id)
;
-- 查询学过"张三"老师所有课程的同学的信息:
-- 先假设学生选修了所有的张三的课程
-- 求证在成绩表中是否能找到学生的 选修成绩
-- 综合条件
-- 反向统计 -- 如果成绩是NULL,计1,否则,计0
-- 统计的结果=0
SELECT
stu.s_id,
stu.s_name
FROM sz2002.student stu
JOIN sz2002.course cs
JOIN sz2002.teacher tea
ON cs.t_id = tea.t_id
LEFT JOIN sz2002.score sc
ON sc.s_id = stu.s_id AND sc.c_id = cs.c_id
WHERE tea.t_name = '张三'
GROUP BY
stu.s_id,
stu.s_name
HAVING sum(case when sc.s_score IS NULL then 1 else 0 end) = 0
;
-- 条件