hive语法

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

hive的执行模式

1、 客户端(也就是黑窗口)
2、 hive -e ‘hql’
3、 hive -f sql.hql(指定一个脚本执行)
hive -S -f sql.hql 没有提示信息 (静默模式)

hive-help (linux下的命令,查看hive的帮助文档)

-- linux下的命令,查看hive的使用帮助

hive -help

usage: hive
 -d,--define <key=value>          Variable subsitution to apply to hive
                                  commands. e.g. -d A=B or --define A=B
    --database <databasename>     Specify the database to use
 -e <quoted-query-string>         SQL from command line
 -f <filename>                    SQL from files
 -H,--help                        Print help information
    --hiveconf <property=value>   Use value for given property
    --hivevar <key=value>         Variable subsitution to apply to hive
                                  commands. e.g. --hivevar A=B
 -i <filename>                    Initialization SQL file
 -S,--silent                      Silent mode in interactive shell
 -v,--verbose                     Verbose mode (echo executed SQL to the
                                  console)

查询语法

在这里插入图片描述

语句执行顺序

在这里插入图片描述
在这里插入图片描述

查询规则

-- 规则
-- 尽量不使用子查询、不使用in not in  exists not exists
SELECT
*
from t1
where id in (select id from t2 where .... )
;


SELECT
*
from t1
where exists (select 1 as column_1 from t2 where t2.id = t1.id )
;

-- 查询尽量避免join,但是无可避免

-- 中间表

-- 查询永远都是小表驱动大表

--- 学生选修了所有课程

123
course

score
-- 学生在成绩表里存在每一门课程的成绩

exists
select
*
from stu
where exists (select 1 from course left join score on cid = cid and sid = sid )

select id from score where sid =''
;

join

-- JOIN 
-- 实体和关系
-- 连接JOIN
-- 表的连接分为:
-- 内连接 INNER JOIN
-- 外连接 OUTER JOIN 
-- 左连接 LEFT JOIN 
-- 右连接 RIGHT JOIN
-- 全连接 FULL JOIN
-- 笛卡尔积 CROSS JOIN

-- 特殊类型
-- LEFT SEMI JOIN 

-- JOIN的类型
-- LEFT JOIN \LEFT OUTER JOIN \ RIGHT JOIN \ RIGHT OUTER JOIN \ INNER JOIN \ FULL OUTER JOIN 

-- 测试数据
a表
1,a1
2,a2
3,a3
4,a4
5,a5
6,a6
7,a7
8,a8

b表
2,liushuai
4,gaoyuanyuan
5,jiajingwen
7,gaogao
9,hetu

CREATE TABLE IF NOT EXISTS sz2002.u1(
id int,
name string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
;

CREATE TABLE IF NOT EXISTS sz2002.u2(
id int,
name string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
;

-- 加载数据
LOAD DATA LOCAL INPATH '/hivedata/u1.dat' INTO TABLE sz2002.u1;
LOAD DATA LOCAL INPATH '/hivedata/u2.dat' INTO TABLE sz2002.u2;


SELECT 
*
FROM sz2002.u2
;


-- 内连接
-- INNER JOIN 
-- ,
-- JOIN
-- 以上3种格式,结果都差不多,必须相互间能连接上才能返回结果

set hive.exec.mode.local.auto=true;

SELECT
*
FROM sz2002.u1
INNER JOIN sz2002.u2 
ON u1.id = u2.id
;

SELECT 
*
FROM sz2002.u1,sz2002.u2
 WHERE u1.id = u2.id
;

-- JOIN 不加任何ON条件或者WHERE条件时,称之为笛卡尔积

-- LEFT JOIN \ LEFT OUTER JOIN 
-- 数据以左表为基准,左表存在的数据都能查询出,右表的数据如果能连接上就能查询出,如果连接不上则以NULL来代替

SELECT
*
FROM sz2002.u1 
LEFT OUTER JOIN sz2002.u2
ON u1.id = u2.id
;

-- 场景:A表有结果,但是B表为Null

SELECT
*
FROM sz2002.u1 
LEFT OUTER JOIN sz2002.u2
ON u1.id = u2.id
WHERE u2.id IS NULL
;


SELECT 
*
FROM sz2002.u1
WHERE NOT EXISTS (SELECT 1 FROM sz2002.u2 WHERE u2.id =u1.id)
;

-- RIGHT JOIN \ RIGHT OUTER JOIN 
-- 以右表为准,右表的结果全都能查询出,然后匹配左表的信息,如果左表能匹配上,则返回左表的信息,否则以NULL来代替左表
SELECT
u1.*,u2.*
FROM sz2002.u1 
RIGHT OUTER JOIN sz2002.u2
ON u1.id = u2.id
;

SELECT
u1.*,u2.*
FROM sz2002.u2 
LEFT OUTER JOIN sz2002.u1
ON u1.id = u2.id
;


-- FULL OUTER JOIN 
-- 相互进行连接,左表累加右表,如果某一张表上的数据连接不上,则使用NULL来代替

SELECT
u1.*,u2.*
FROM sz2002.u2 
FULL OUTER JOIN sz2002.u1
ON u1.id = u2.id
;

-- LEFT SEMI JOIN 
-- semi jojn 也叫半开连接,通常是left join的一种优化,只能查询出左表的信息,主要用来判断左表是否存在

SELECT
*
FROM sz2002.u1
LEFT SEMI JOIN sz2002.u2
ON u1.id = u2.id
;

SELECT
u1.*
FROM sz2002.u1
INNER JOIN sz2002.u2
ON u1.id = u2.id
;

-- exists

SELECT
*
FROM sz2002.u1
WHERE EXISTS (SELECT 1 FROM sz2002.u2 WHERE U2.ID = U1.ID)
;

-- hive提供小表标识:使用STREAMTABLE(小表的别名)

SELECT
/*+STREAMTABLE(d)*/
*
FROM sz2002.u1 e
INNER JOIN sz2002.u2 d
ON e.id = d.id
;

-- 子查询:hive对子查询支持不是很友好,特别是=问题很多

SELECT
*
FROM sz2002.u1
WHERE id = (SELECT id FROM sz2002.u2 WHERE id = 3)
;

-- INNER JOIN 和 OUTER JOIN的区别
-- 分区字段对OUTER JOIN 的ON条件无效,对INNER JOIN 中的ON条件有效
-- 有INNER JOIN 但是没有FULL INNER JOIN 
-- 有FULL OUTER JOIN 但是没有OUTER JOIN

-- 所有的JOIN连接,都只支持等值连接(=和and) 不支持!= > < ... or 

-- map-side join
-- 会将小表缓存到内存中,然后在map端就进行关联查找。hive在map查找时会减少查询量,减少大量的shuffle过程和数据传输的时间

set hive.auto.convert.join = true;

-- 以前的版本,可以添加/*+MAPJOIN(小表别名)*/来标识map端JOIN

-- 到底多大的表才能称为小表

set hive.mapjoin.smalltable.filesize=25000000;

-- 约等于23.8M


-- WHERE:后面通常是表达式,还可以是非聚合函数表达式,不能跟聚合函数

SELECT
COUNT(1)
FROM sz2002.u1
HAVING COUNT(1) > 2
;

-- GROUP BY 分组,通常与聚合函数一起搭配使用
-- 查询的字段要么出现在GROUP BY之后,要么出现在聚合函数之内
SELECT id,name,count(1)
FROM sz2002.u1
GROUP BY id ,name
;

-- HAVING:对分组之后的聚合的结果进行过滤
-- 可以使用别名,是别名所代表的计算只能是一层的计算
-- HAVING之后的过滤字段必须出现在SELECT列表中


SELECT
id,count(1)
FROM sz2002.u1
GROUP BY id 
HAVING id = 1
;

-- ORDER BY :全局排序,保证所有reduce中的数据有序
-- SORT BY :局部排序,只保证单个reduce中的数据有序
-- 如果reduce的个数只有1个,两者之间无差别
-- 两者都通常和ASC|DESC搭配,默认使用ASC

-- 设置于reduce的数量

set mapreduce.job.reduces=-1;

-- -1默认是根据最终的数据量来确定reduce的数量
-- 256M

-- 只要使用order by,reduce的个数永远是一个

-- LIMIT :从结果集中取出数据的条数
-- UNION : 将多个结果集合并,去重加排序
-- UNION ALL : 将多个结果集合并,不去重不排序
-- 如果数据集中能确定没有重复的数据,尽量使用UNION ALL

 

--- 练习

CREATE TABLE IF NOT EXISTS sz2002.course(
c_id VARCHAR(20),
c_name VARCHAR(20),
t_id VARCHAR(20))
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t'
;

CREATE TABLE IF NOT EXISTS sz2002.student(
s_id VARCHAR(20),
s_name VARCHAR(20),
s_birth VARCHAR(20),
s_sex VARCHAR(10)
)
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t'
;

CREATE TABLE IF NOT EXISTS sz2002.teacher(
t_id VARCHAR(20),
t_name VARCHAR(20)
)
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t'
;

CREATE TABLE IF NOT EXISTS sz2002.score(
s_id VARCHAR(20),
c_id VARCHAR(20),
s_score int
)
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t'
;


-- 加载数据
LOAD DATA LOCAL INPATH '/hivedata/course.txt' INTO TABLE sz2002.course;
LOAD DATA LOCAL INPATH '/hivedata/stu.txt' INTO TABLE sz2002.student;
LOAD DATA LOCAL INPATH '/hivedata/teacher.txt' INTO TABLE sz2002.teacher;
LOAD DATA LOCAL INPATH '/hivedata/score.txt' INTO TABLE sz2002.score;


-- 查看数据

SELECT 
*
FROM sz2002.score
;

-- 输入目录 -- 表
-- 输出字段 -- 需求
-- 条件

--  1、查询"01"课程比"02"课程成绩高的学生的信息及课程分数

SELECT
stu.s_id,
stu.s_name,
stu.s_birth,
stu.s_sex,
cs.c_name,
sc.s_score
FROM
(
SELECT
*
FROM sz2002.score
WHERE c_id = '01'
) sc01
INNER JOIN 
(
SELECT
*
FROM sz2002.score
WHERE c_id = '02'
) sc02 
ON sc01.s_id = sc02.s_id
JOIN sz2002.student stu
ON stu.s_id = sc01.s_id
JOIN sz2002.score sc 
ON sc.s_id = stu.s_id
JOIN sz2002.course cs 
ON cs.c_id = sc.c_id
WHERE sc01.s_score > sc02.s_score
;

-- 去掉子查询

SELECT
stu.s_id,
stu.s_name,
stu.s_birth,
stu.s_sex,
cs.c_name,
sc.s_score
FROM sz2002.student stu
JOIN sz2002.score sc01 
ON sc01.c_id = '01' AND stu.s_id = sc01.s_id
LEFT JOIN sz2002.score sc02
ON sc02.c_id = '02' AND stu.s_id = sc02.s_id 
JOIN sz2002.score sc 
ON sc.s_id = stu.s_id
JOIN sz2002.course cs 
ON cs.c_id = sc.c_id
WHERE sc01.s_score > sc02.s_score OR sc02.s_score IS NULL
;

SELECT
FROM 
JOIN 


SELECT
*
FROM sz2002.student stu
JOIN sz2002.score sc01 
ON sc01.c_id = '01' AND stu.s_id = sc01.s_id
LEFT JOIN sz2002.score sc02
ON sc02.c_id = '02' AND stu.s_id = sc02.s_id 
WHERE sc01.s_score > sc02.s_score OR sc02.s_score IS NULL
;

SELECT 
sc01.s_id
FROM sz2002.score sc01 
WHERE c_id = '01' AND EXISTS 
(
SELECT 1 
FROM sz2002.score sc02
WHERE sc02.c_id = '02' AND sc02.s_id = sc01.s_id
) AND sc01.s_score = sc02.s_score 
;


-- 查询"01"课程比"02"课程成绩低的学生的信息及课程分数:

SELECT
stu.s_id,
stu.s_name,
stu.s_birth,
stu.s_sex,
cs.c_name,
sc.s_score
FROM sz2002.student stu
JOIN sz2002.score sc01 
ON sc01.c_id = '01' AND stu.s_id = sc01.s_id
RIGHT JOIN sz2002.score sc02
ON sc02.c_id = '02' AND stu.s_id = sc02.s_id 
JOIN sz2002.score sc 
ON sc.s_id = stu.s_id
JOIN sz2002.course cs 
ON cs.c_id = sc.c_id
WHERE sc01.s_score < sc02.s_score OR sc01.s_score IS NULL
;

-- 查询平均成绩大于等于60分的同学的学生编号和学生姓名和平均成绩:

SELECT 
stu.s_id,
stu.s_name,
avg(sc.s_score) avg_score
FROM sz2002.student stu
JOIN sz2002.score sc 
ON sc.s_id = stu.s_id 
GROUP BY 
stu.s_id,
stu.s_name
HAVING avg_score >= 60
;


-- 查询平均成绩小于60分的同学的学生编号和学生姓名和平均成绩:

SELECT 
stu.s_id,
stu.s_name,
avg(sc.s_score) avg_score
FROM sz2002.student stu
JOIN sz2002.score sc 
ON sc.s_id = stu.s_id 
GROUP BY 
stu.s_id,
stu.s_name
HAVING avg_score < 60
UNION ALL 
-- 学生没有选修课程的情况

SELECT
stu.s_id,
stu.s_name,
NULL avg_score
FROM sz2002.student stu
WHERE NOT EXISTS 
(SELECT 1 FROM sz2002.score sc WHERE sc.s_id = stu.s_id)
; 

SELECT
stu.s_id,
stu.s_name,
NULL avg_score
FROM sz2002.student stu
LEFT JOIN sz2002.score sc 
ON sc.s_id = stu.s_id
WHERE sc.s_id IS NULL
; 


SELECT 
stu.s_id,
stu.s_name,
avg(nvl(sc.s_score,0)) avg_score
FROM sz2002.student stu
LEFT JOIN sz2002.score sc 
ON sc.s_id = stu.s_id 
GROUP BY 
stu.s_id,
stu.s_name
HAVING avg_score < 60
;


-- 查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩:

SELECT
stu.s_id,
stu.s_name,
count(*) as cnt1,
count(sc.c_id) as cnt2,
count(1) cnt3,
sum(nvl(sc.s_score,0)) sum_score
FROM sz2002.student stu 
LEFT JOIN sz2002.score sc 
ON sc.s_id = stu.s_id
GROUP BY 
stu.s_id,
stu.s_name
;

SELECT 
*,
1 as custom_column
FROM sz2002.student stu 
LEFT JOIN sz2002.score sc 
ON sc.s_id = stu.s_id
;

-- 查询学过"张三"老师授课的同学的信息:

SELECT 
stu.s_id,
stu.s_name
FROM sz2002.student stu 
JOIN sz2002.score sc
ON sc.s_id = stu.s_id
JOIN sz2002.course cs 
ON sc.c_id = cs.c_id
JOIN sz2002.teacher tea 
ON cs.t_id = tea.t_id 
WHERE tea.t_name = '张三'
GROUP BY 
stu.s_id,
stu.s_name
;

-- EXISTS

SELECT 
stu.*
FROM sz2002.student stu 
WHERE EXISTS 
(SELECT 1
FROM sz2002.score sc 
JOIN sz2002.course cs 
ON sc.c_id = cs.c_id
JOIN sz2002.teacher tea 
ON tea.t_name = '张三' AND tea.t_id = cs.t_id
WHERE sc.s_id = stu.s_id)
;



-- 查询学过"张三"老师所有课程的同学的信息:

-- 先假设学生选修了所有的张三的课程
-- 求证在成绩表中是否能找到学生的 选修成绩
-- 综合条件
-- 反向统计 -- 如果成绩是NULL,计1,否则,计0
-- 统计的结果=0 

SELECT 
stu.s_id,
stu.s_name
FROM sz2002.student stu 
JOIN sz2002.course cs 
JOIN sz2002.teacher tea 
ON cs.t_id = tea.t_id 
LEFT JOIN sz2002.score sc
ON sc.s_id = stu.s_id AND sc.c_id = cs.c_id
WHERE tea.t_name = '张三'
GROUP BY
stu.s_id,
stu.s_name
HAVING sum(case when sc.s_score IS NULL then 1 else 0 end) = 0
;

-- 条件

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值