hive第二天
静态分区
分区字段一个, 一级分区 , 分区字段有多个为多级分区
二级分区
分区字段为两个
create table tb_partition(
id int ,
name string ,
ct string
)
partitioned by (y string , m string) -- 二级分区
row format delimited fields terminated by ',' ;
练习1:
2021-02-20.log
1,click,2021-02-20
2,show,2021-02-20
3,add,2021-02-20
2021-02-21.log
4,click,2021-02-21
5,show,2021-02-21
6,add,2021-02-21
2021-03-11.log
1,click,2021-03-11
2,show,2021-03-11
3,add,2021-03-11
2020-03-12.log
1,click,2020-03-12
2,show,2020-03-12
3,add,2020-03-12
2020-04-13.log
1,click,2020-04-13
2,show,2020-04-13
3,add,2020-04-13
导入数据
load data local inpath '/data/log/2021-02-20.log' into table tb_partition partition(y='2021' , m='02');
load data local inpath '/data/log/2021-02-21.log' into table tb_partition partition(y='2021' , m='02');
load data local inpath '/data/log/2021-03-11.log' into table tb_partition partition(y='2021' , m='03');
load data local inpath '/data/log/2020-03-12.log' into table tb_partition partition(y='2020' , m='03');
load data local inpath '/data/log/2020-04-13.log' into table tb_partition partition(y='2020' , m='04');
查询2020年的数据几种法方
--- 查询2020年的数据
select
*
from
tb_partition
where substr(ct , 0 , 4) = '2020' ;
+------------------+--------------------+------------------+-----------------+-----------------+
| tb_partition.id | tb_partition.name | tb_partition.ct | tb_partition.y | tb_partition.m |
+------------------+--------------------+------------------+-----------------+-----------------+
| 1 | click | 2020-03-12 | 2020 | 03 |
| 2 | show | 2020-03-12 | 2020 | 03 |
| 3 | add | 2020-03-12 | 2020 | 03 |
| 1 | click | 2020-04-13 | 2020 | 04 |
| 2 | show | 2020-04-13 | 2020 | 04 |
| 3 | add | 2020-04-13 | 2020 | 04 |
+------------------+--------------------+------------------+-----------------+-----------------+
select
*
from
tb_partition
where year(ct) = '2020' ;
+------------------+--------------------+------------------+-----------------+-----------------+
| tb_partition.id | tb_partition.name | tb_partition.ct | tb_partition.y | tb_partition.m |
+------------------+--------------------+------------------+-----------------+-----------------+
| 1 | click | 2020-03-12 | 2020 | 03 |
| 2 | show | 2020-03-12 | 2020 | 03 |
| 3 | add | 2020-03-12 | 2020 | 03 |
| 1 | click | 2020-04-13 | 2020 | 04 |
| 2 | show | 2020-04-13 | 2020 | 04 |
| 3 | add | 2020-04-13 | 2020 | 04 |
+------------------+--------------------+------------------+-----------------+-----------------+
select * from tb_partition where ct like '2020%';
+------------------+--------------------+------------------+-----------------+-----------------+
| tb_partition.id | tb_partition.name | tb_partition.ct | tb_partition.y | tb_partition.m |
+------------------+--------------------+------------------+-----------------+-----------------+
| 1 | click | 2020-03-12 | 2020 | 03 |
| 2 | show | 2020-03-12 | 2020 | 03 |
| 3 | add | 2020-03-12 | 2020 | 03 |
| 1 | click | 2020-04-13 | 2020 | 04 |
| 2 | show | 2020-04-13 | 2020 | 04 |
| 3 | add | 2020-04-13 | 2020 | 04 |
+------------------+--------------------+------------------+-----------------+-----------------+
select * from tb_partition where y = '2020' ;
+------------------+--------------------+------------------+-----------------+-----------------+
| tb_partition.id | tb_partition.name | tb_partition.ct | tb_partition.y | tb_partition.m |
+------------------+--------------------+------------------+-----------------+-----------------+
| 1 | click | 2020-04-13 | 2020 | 02 |
| 2 | show | 2020-04-13 | 2020 | 02 |
| 3 | add | 2020-04-13 | 2020 | 02 |
| 1 | click | 2020-03-12 | 2020 | 03 |
| 2 | show | 2020-03-12 | 2020 | 03 |
| 3 | add | 2020-03-12 | 2020 | 03 |
+------------------+--------------------+------------------+-----------------+-----------------+
explain
select
*
from
tb_partition
where substr(ct , 0 , 4) = '2020' ;
+----------------------------------------------------+
| Explain |
+----------------------------------------------------+
| STAGE DEPENDENCIES: |
| Stage-0 is a root stage |
| |
| STAGE PLANS: |
| Stage: Stage-0 |
| Fetch Operator |
| limit: -1 |
| Processor Tree: |
| TableScan |
| alias: tb_partition |
| Statistics: Num rows: 1 Data size: 2700 Basic stats: PARTIAL Column stats: NONE |
| Filter Operator |
| predicate: (substr(ct, 0, 4) = '2020') (type: boolean) |
| Statistics: Num rows: 1 Data size: 2700 Basic stats: PARTIAL Column stats: NONE |
| Select Operator |
| expressions: id (type: int), name (type: string), ct (type: string), y (type: string), m (type: string) |
| outputColumnNames: _col0, _col1, _col2, _col3, _col4 |
| Statistics: Num rows: 1 Data size: 2700 Basic stats: PARTIAL Column stats: NONE |
| ListSink |
| |
+----------------------------------------------------+
- substr substring
- year
- desc function year – 查看函数的帮助文档
- show functions 创建系统中支持的函数
查询语言
select
1,
"hello" ,
current_date() ,
upper(name) ,
account_name as name
age+10
from
tb_name
where id > and or in not between and
[group by col1 , col2 聚合函数
having ...分租后的数据筛选
]
order by desc
limit ....
练习01
dept.txt
10 ACCOUNTING 1700
20 RESEARCH 1800
30 SALES 1900
40 OPERATIONS 1700
emp.txt
7369 SMITH CLERK 7902 1980-12-17 800.00 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.00 300.00 30
7521 WARD SALESMAN 7698 1981-2-22 1250.00 500.00 30
7566 JONES MANAGER 7839 1981-4-2 2975.00 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.00 1400.00 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.00 30
7782 CLARK MANAGER 7839 1981-6-9 2450.00 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.00 20
7839 KING PRESIDENT 1981-11-17 5000.00 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.00 0.00 30
7876 ADAMS CLERK 7788 1987-5-23 1100.00 20
7900 JAMES CLERK 7698 1981-12-3 950.00 30
7902 FORD ANALYST 7566 1981-12-3 3000.00 20
7934 MILLER CLERK 7782 1982-1-23 1300.00 10
create table if not exists dept(
deptno int,
dname string,
loc int
)
row format delimited fields terminated by '\t';
load data local inpath "/data/dept.txt" into table dept ;
create table if not exists emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int)
row format delimited fields terminated by '\t';
load data local inpath "/data/emp.txt" into table emp ;
-- 统计每个部门下, 每个岗位薪资最高的那个人的信息
-- 部门名 工作 姓名 工资
select
deptno ,
job ,
max(sal+comm) as max_sal -- null和任何数值算数运算都是null
from
emp
group by deptno , job ;
select
deptno ,
job ,
max(sal+if(isnull(comm),0,comm)) as max_sal
from
emp
group by deptno , job ;
+---------+------------+----------+
| deptno | job | max_sal |
+---------+------------+----------+
| 10 | CLERK | 1300.0 |
| 10 | MANAGER | 2450.0 |
| 10 | PRESIDENT | 5000.0 |
| 20 | ANALYST | 3000.0 |
| 20 | CLERK | 1100.0 |
| 20 | MANAGER | 2975.0 |
| 30 | CLERK | 950.0 |
| 30 | MANAGER | 2850.0 |
| 30 | SALESMAN | 2650.0 |
+---------+------------+----------+
-- nvl 对null值处理的函数
select
deptno ,
job ,
max(sal+nvl(comm,0)) as max_sal
from
emp
group by deptno , job ;
+---------+------------+----------+
| deptno | job | max_sal |
+---------+------------+----------+
| 10 | CLERK | 1800.0 |
| 10 | MANAGER | 2450.0 |
| 10 | PRESIDENT | 5000.0 |
| 20 | ANALYST | 3000.0 |
| 20 | CLERK | 1100.0 |
| 20 | MANAGER | 2975.0 |
| 30 | CLERK | 950.0 |
| 30 | MANAGER | 2850.0 |
| 30 | SALESMAN | 2650.0 |
+---------+------------+----------+
select emp.ename,emp.job,emp.deptno,emp.sal,emp.comm,t.max_sal
from(select
deptno ,
job ,
max(sal+nvl(comm,0)) as max_sal
from
emp
group by deptno , job )t
join emp on emp.deptno=t.deptno and
emp.job=t.job and (emp.sal+nvl(comm,0))=t.max_sal;
+------------+------------+-------------+----------+-----------+------------+
| emp.ename | emp.job | emp.deptno | emp.sal | emp.comm | t.max_sal |
+------------+------------+-------------+----------+-----------+------------+
| BENGE | CLERK | 10 | 1800.0 | NULL | 1800.0 |
| CLARK | MANAGER | 10 | 2450.0 | NULL | 2450.0 |
| KING | PRESIDENT | 10 | 5000.0 | NULL | 5000.0 |
| SCOTT | ANALYST | 20 | 3000.0 | NULL | 3000.0 |
| FORD | ANALYST | 20 | 3000.0 | NULL | 3000.0 |
| ADAMS | CLERK | 20 | 1100.0 | NULL | 1100.0 |
| JONES | MANAGER | 20 | 2975.0 | NULL | 2975.0 |
| JAMES | CLERK | 30 | 950.0 | NULL | 950.0 |
| BLAKE | MANAGER | 30 | 2850.0 | NULL | 2850.0 |
| MARTIN | SALESMAN | 30 | 1250.0 | 1400.0 | 2650.0 |
+------------+------------+-------------+----------+-----------+------------+
select
dept.dname ,
t2.job ,
t2.ename ,
t2.sal ,
t2.comm ,
t2.max_sal as total_sal
from
dept
join
(
select
emp.ename ,
emp.job ,
emp.deptno ,
emp.sal ,
emp.comm,
t.max_sal
from
(
select
deptno ,
job ,
max(sal+nvl(comm,0)) as max_sal -- null和任何数值算数运算都是null
from
emp
group by deptno , job
) as t
join
emp
on emp.deptno = t.deptno and emp.job=t.job and (emp.sal+nvl(emp.comm,0)) = t.max_sal
)t2
on t2.deptno = dept.deptno ;
select
dept.dname ,
t2.job ,
t2.ename ,
t2.sal ,
t2.comm ,
t2.max_sal as total_sal
from
dept
join
(
select
emp.ename ,
emp.job ,
emp.deptno ,
emp.sal ,
emp.comm,
t.max_sal
from
(
select
deptno ,
job ,
max(sal+nvl(comm,0)) as max_sal -- null和任何数值算数运算都是null
from
emp
group by deptno , job
) as t
join
emp
on emp.deptno = t.deptno and emp.job=t.job and (emp.sal+nvl(emp.comm,0)) = t.max_sal
)t2
on t2.deptno = dept.deptno ;
case when
case
when expretion then
when expretion then
when expretion then
when expretion then
end as
case col
when val1 then
when val2 then
when val3 then
when val4 then
else
end as
标注有奖金的人
select
* ,
if(comm is null , 0 ,1)
from
emp ;
– 给sal字段的值的范围 分类
–case when
select
ename ,
sal ,
case
when sal > 0 and sal <=1000 then '0~1000'
when sal>1000 and sal <=2000 then '1000~2000'
when sal >2000 and sal <= 3000 then '2000~3000'
else '3000~~'
end as stage
from
emp ;
+---------+---------+------------+
| ename | sal | stage |
+---------+---------+------------+
| SMITH | 800.0 | 0~1000 |
| ALLEN | 1600.0 | 1000~2000 |
| WARD | 1250.0 | 1000~2000 |
| JONES | 2975.0 | 2000~3000 |
| MARTIN | 1250.0 | 1000~2000 |
| BLAKE | 2850.0 | 2000~3000 |
| CLARK | 2450.0 | 2000~3000 |
| SCOTT | 3000.0 | 2000~3000 |
| KING | 5000.0 | 3000~~ |
| TURNER | 1500.0 | 1000~2000 |
| ADAMS | 1100.0 | 1000~2000 |
| JAMES | 950.0 | 0~1000 |
| FORD | 3000.0 | 2000~3000 |
| MILLER | 1300.0 | 1000~2000 |
| BENGE | 1800.0 | 1000~2000 |
+---------+---------+------------+
select
ename ,
deptno ,
case deptno
when 10 then '10号部门'
when 20 then '20号部门'
when 30 then '30号部门'
end as ch
from
emp ;
+---------+---------+--------+
| ename | deptno | ch |
+---------+---------+--------+
| SMITH | 20 | 20号部门 |
| ALLEN | 30 | 30号部门 |
| WARD | 30 | 30号部门 |
| JONES | 20 | 20号部门 |
| MARTIN | 30 | 30号部门 |
| BLAKE | 30 | 30号部门 |
| CLARK | 10 | 10号部门 |
| SCOTT | 20 | 20号部门 |
| KING | 10 | 10号部门 |
| TURNER | 30 | 30号部门 |
| ADAMS | 20 | 20号部门 |
| JAMES | 30 | 30号部门 |
| FORD | 20 | 20号部门 |
| MILLER | 10 | 10号部门 |
| BENGE | 10 | 10号部门 |
+---------+---------+--------+
select
ename ,
deptno ,
case
when deptno=10 then '10号部门'
when deptno=20 then '20号部门'
when deptno=30 then '30号部门'
end as ch
from
emp ;
练习02 case when
统计每个部门下各性别的总人数
悟空 A 男
娜娜 A 男
宋宋 B 男
凤姐 A 女
热巴 B 女
慧慧 B 女
create table tb_gp(
name string ,
dname string ,
gender string
)
row format delimited fields terminated by '\t' ;
load data local inpath '/data/case.txt' into table tb_gp ;
select
dname ,
sum(if(gender='男',1,0)) m_cnt ,
sum(if(gender='女',1,0)) f_cnt
from
tb_gp
group by dname
;
+--------+--------+--------+
| dname | m_cnt | f_cnt |
+--------+--------+--------+
| A | 2 | 1 |
| B | 1 | 2 |
+--------+--------+--------+
select dname,
sum(case gender when '男' then 1 else 0 end)m_cnt,
sum(case gender when '女' then 1 else 0 end)f_cnt
from tb_gp
group by dname;
+--------+--------+--------+
| dname | m_cnt | f_cnt |
+--------+--------+--------+
| A | 2 | 1 |
| B | 1 | 2 |
+--------+--------+--------+
练习03
gz.csv
uid,jb,jj,tc,bid
1,2000,3000,5000,1
2,1000,4000,1000,2
3,5000,1000,5000,1
4,4000,300,7000,3
create table gz(uid int ,jb int ,jj int ,tc int,bid int)
row format delimited fields terminated by ',' ;
load data local inpath '/root/data/gz.csv' into table gz ;
+---------+--------+--------+--------+---------+
| gz.uid | gz.jb | gz.jj | gz.tc | gz.bid |
+---------+--------+--------+--------+---------+
| 1 | 2000 | 3000 | 5000 | 1 |
| 2 | 1000 | 4000 | 1000 | 2 |
| 3 | 5000 | 1000 | 5000 | 1 |
| 4 | 4000 | 300 | 7000 | 3 |
+---------+--------+--------+--------+---------+
bm.csv
bid,name
1,财务部
2,销售部
3,后勤部
4,技术部
create table bm(bid int ,name string )row format delimited fields terminated by ',' ;
load data local inpath '/root/data/bm.csv' into table bm ;
+---------+----------+
| bm.bid | bm.name |
+---------+----------+
| 1 | 财务部 |
| 2 | 销售部 |
| 3 | 后勤部 |
| 4 | 技术部 |
+---------+----------+
yg.csv
uid,name,age,gender
1,zss,23,M
2,lss,33,F
3,wbb,28,M
4,laona,37,F
5,naige,17,M
create table yg(uid int ,name string ,age int ,gender string )
row format delimited fields terminated by ',' ;
load data local inpath '/root/data/yg.csv' into table yg ;
+---------+----------+---------+------------+
| yg.uid | yg.name | yg.age | yg.gender |
+---------+----------+---------+------------+
| 1 | zss | 23 | M |
| 2 | lss | 33 | F |
| 3 | wbb | 28 | M |
| 4 | laona | 37 | F |
| 5 | naige | 17 | M |
+---------+----------+---------+------------+
-- 求每个人的部门名称 yg bm
select yg.name ,bm.name from gz join yg join bm
on gz.uid = yg.uid and gz.bid=bm.bid;
+----------+----------+
| yg.name | bm.name |
+----------+----------+
| zss | 财务部 |
| lss | 销售部 |
| wbb | 财务部 |
| laona | 后勤部 |
+----------+----------+
select yg.name ,bm.name from gz join yg on gz.uid =yg.uid
join bm on gz.bid = bm.bid
+----------+----------+
| yg.name | bm.name |
+----------+----------+
| zss | 财务部 |
| lss | 销售部 |
| wbb | 财务部 |
| laona | 后勤部 |
+----------+----------+
多张表关联, 关联条至少是(表个数-1)
– 每不同性别的人
select * ,case gender when 'M'then'男' when 'F'then '女'end tb_gender from yg;
+---------+----------+---------+------------+------------+
| yg.uid | yg.name | yg.age | yg.gender | ch_gender |
+---------+----------+---------+------------+------------+
| 1 | zss | 23 | M | 男 |
| 2 | lss | 33 | F | 女 |
| 3 | wbb | 28 | M | 男 |
| 4 | laona | 37 | F | 女 |
| 5 | naige | 17 | M | 男 |
+---------+----------+---------+------------+------------+
每种性别的总工资
-- 每个人的总工资
select *from yg join gz on yg.uid=gz.uid;
+---------+----------+---------+------------+---------+--------+--------+--------+---------+
| yg.uid | yg.name | yg.age | yg.gender | gz.uid | gz.jb | gz.jj | gz.tc | gz.bid |
+---------+----------+---------+------------+---------+--------+--------+--------+---------+
| 1 | zss | 23 | M | 1 | 2000 | 3000 | 5000 | 1 |
| 2 | lss | 33 | F | 2 | 1000 | 4000 | 1000 | 2 |
| 3 | wbb | 28 | M | 3 | 5000 | 1000 | 5000 | 1 |
| 4 | laona | 37 | F | 4 | 4000 | 300 | 7000 | 3 |
+---------+----------+---------+------------+---------+--------+--------+--------+---------+
select yg.name ,yg.gender ,gz.jb ,gz.jj ,gz.tc from yg join gz on yg.uid = gz.uid ;
+----------+------------+--------+--------+--------+
| yg.name | yg.gender | gz.jb | gz.jj | gz.tc |
+----------+------------+--------+--------+--------+
| zss | M | 2000 | 3000 | 5000 |
| lss | F | 1000 | 4000 | 1000 |
| wbb | M | 5000 | 1000 | 5000 |
| laona | F | 4000 | 300 | 7000 |
+----------+------------+--------+--------+--------+
select yg.name ,yg.gender ,(gz.jb +gz.jj +gz.tc) as sal from yg join gz on yg.uid = gz.uid ;
+----------+------------+--------+
| yg.name | yg.gender | sal |
+----------+------------+--------+
| zss | M | 10000 |
| lss | F | 6000 |
| wbb | M | 11000 |
| laona | F | 11300 |
+----------+------------+--------+
select yg.gender,sum(gz.jb+gz.jj+gz.tc) sum_a from yg join gz on yg.uid=gz.uid group by yg.gender;
+------------+--------+
| yg.gender | _c1 |
+------------+--------+
| F | 17300 |
| M | 21000 |
+------------+--------+
**-- 统计每个年龄段的总工资,with x as (sql查询)
select*,case
when age>20 and age <=30 then '20~30'
when age>30 and age<=40 then '30~40'
when age >40 and age <=50 then '40~50'
when age >50 and age <=60 then '50~60'
else '未成年' end as age_stage from yg;
+---------+----------+---------+------------+------------+
| yg.uid | yg.name | yg.age | yg.gender | age_stage |
+---------+----------+---------+------------+------------+
| 1 | zss | 23 | M | 20~30 |
| 2 | lss | 33 | F | 30~40 |
| 3 | wbb | 28 | M | 20~30 |
| 4 | laona | 37 | F | 30~40 |
| 5 | naige | 17 | M | 未成年 |
+---------+----------+---------+------------+------------+
with tb_tmp as(select*,case
when age>20 and age <=30 then '20~30'
when age>30 and age<=40 then '30~40'
when age >40 and age <=50 then '40~50'
when age >50 and age <=60 then '50~60'
else '未成年' end as age_stage from yg)
select tb_tmp.age_stage,sum (gz.jb+gz.jj+gz.tc) as total
from tb_tmp join gz on tb_tmp.uid =gz.uid group by tb_tmp.age_stage;
+-------------------+------------+
| tb_tmp.age_stage | total_sal |
+-------------------+------------+
| 20~30 | 21000 |
| 30~40 | 17300 |
+-------------------+------------+
– 求每个人 名字 工资组成部分中占比最高的工资类型
select *,
greatest (jb,jj,tc)high,case
when greatest (jb,jj,tc)=jb then 'jb'
when greatest (jb,jj,tc)=jj then 'jj'
when greatest (jb,jj,tc)=tc then 'tc'
end as sal_typefrom yg join gz on yg.uid=gz.uid;
+---------+--------+--------+--------+---------+--------------+-----------+
| gz.uid | gz.jb | gz.jj | gz.tc | gz.bid | highest_sal | sal_type |
+---------+--------+--------+--------+---------+--------------+-----------+
| 1 | 2000 | 3000 | 5000 | 1 | 5000 | tc |
| 2 | 1000 | 4000 | 1000 | 2 | 4000 | jj |
| 3 | 5000 | 1000 | 5000 | 1 | 5000 | jb |
| 4 | 4000 | 300 | 7000 | 3 | 7000 | tc |
+---------+--------+--------+--------+---------+--------------+-----------+
select
* ,
greatest(jb, jj ,tc) highest_sal ,
case
when highest_sal = jb then 'jb'
when highest_sal = jj then 'jj'
when highest_sal = tc then 'tc'
end as sal_type
from
gz ; //错误
select
* ,
case
when highest_sal = jb then 'jb'
when highest_sal = jj then 'jj'
when highest_sal = tc then 'tc'
end as sal_type
from(select* ,greatest(jb, jj ,tc) highest_sal from gz)t ; 可以 子查询
+--------+-------+-------+-------+--------+----------------+-----------+
| t.uid | t.jb | t.jj | t.tc | t.bid | t.highest_sal | sal_type |
+--------+-------+-------+-------+--------+----------------+-----------+
| 1 | 2000 | 3000 | 5000 | 1 | 5000 | tc |
| 2 | 1000 | 4000 | 1000 | 2 | 4000 | jj |
| 3 | 5000 | 1000 | 5000 | 1 | 5000 | jb |
| 4 | 4000 | 300 | 7000 | 3 | 7000 | tc |
+--------+-------+-------+-------+--------+----------------+-----------+
with tb_gz as (select *,
greatest (jb,jj,tc)high,
case
when greatest (jb,jj,tc)=jb then 'jb'
when greatest (jb,jj,tc)=jj then 'jj'
when greatest (jb,jj,tc)=tc then 'tc'
end as sal_type from yg join gz on yg.uid=gz.uid)
select tb_gz.high,yg.name,tb_gz.sal_type from tb_gz join yg on tb_gz.uid =yg.uid;
+----------+----------------+-------------+
| yg.name | x.highest_sal | x.sal_type |
+----------+----------------+-------------+
| zss | 5000 | tc |
| lss | 4000 | jj |
| wbb | 5000 | jb |
| laona | 7000 | tc |
+----------+----------------+-------------+
关联查询
, join inner join cross join
外连接
left join
right join
full join
left semi join
union 两次查询数据的个数和数据类型一致
排序
order by 全局数据的排序
top3 工资 1亿
distribute by 分区字段 sort by 区内排序 desc
cluster by 当分区字段和排序字段一致 并且是升序的时候
设置reduce的个数
+---------------------------+
| set |
+---------------------------+
| mapreduce.job.reduces=-1 |
+---------------------------+
1 row selected (0.022 seconds)
0: jdbc:hive2://doit01:10000> set mapreduce.job.reduces =3;
No rows affected (0.01 seconds)
0: jdbc:hive2://doit01:10000> set mapreduce.job.reduces ;
+--------------------------+
| set |
+--------------------------+
| mapreduce.job.reduces=3 |
+--------------------------+
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| emp.empno | emp.ename | emp.job | emp.mgr | emp.hiredate | emp.sal | emp.comm | emp.deptno |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
| 7698 | BLAKE | MANAGER | 7839 | 1981-5-1 | 2850.0 | NULL | 30 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-17 | 5000.0 | NULL | 10 |
| 7876 | ADAMS | CLERK | 7788 | 1987-5-23 | 1100.0 | NULL | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-9-28 | 1250.0 | 1400.0 | 30 |
| 7934 | MILLER | CLERK | 7782 | 1982-1-23 | 1300.0 | NULL | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-9-8 | 1500.0 | 0.0 | 30 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-2-20 | 1600.0 | 300.0 | 30 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-4-19 | 3000.0 | NULL | 20 |
| 7369 | SMITH | CLERK | 7902 | 1980-12-17 | 800.0 | NULL | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-3 | 950.0 | NULL | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-2-22 | 1250.0 | 500.0 | 30 |
| 4433 | BENGE | CLERK | 7782 | 1998-1-23 | 1800.0 | NULL | 10 |
| 7782 | CLARK | MANAGER | 7839 | 1981-6-9 | 2450.0 | NULL | 10 |
| 7566 | JONES | MANAGER | 7839 | 1981-4-2 | 2975.0 | NULL | 20 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-3 | 3000.0 | NULL | 20 |
+------------+------------+------------+----------+---------------+----------+-----------+-------------+
行转列 列转行
concat
concat_ws 行内
collect_list
collect_set 列式 [组内收集]
孙悟空 白羊座 A
娜娜 射手座 A
宋宋 白羊座 B
猪八戒 白羊座 A
凤姐 射手座 A
select *,concat(star,',',dname)as star_emp from starremp;
+--------------+--------------+---------------+-------------+
| tb_emp.name | tb_emp.star | tb_emp.dname | star_dname |
+--------------+--------------+---------------+-------------+
| 孙悟空 | 白羊座 | A | 白羊座,A |
| 娜娜 | 射手座 | A | 射手座,A |
| 宋宋 | 白羊座 | B | 白羊座,B |
| 猪八戒 | 白羊座 | A | 白羊座,A |
| 凤姐 | 射手座 | A | 射手座,A |
+--------------+--------------+---------------+-------------+
变成
5 rows selected (0.371 seconds)
射手座,A 娜娜|凤姐
白羊座,A 孙悟空|猪八戒
白羊座,B 宋宋
select
* ,
concat(star,',',dname) as star_dname
from
tb_emp ;
+--------------+--------------+---------------+-------------+
| tb_emp.name | tb_emp.star | tb_emp.dname | star_dname |
+--------------+--------------+---------------+-------------+
| 孙悟空 | 白羊座 | A | 白羊座,A |
| 娜娜 | 射手座 | A | 射手座,A |
| 宋宋 | 白羊座 | B | 白羊座,B |
| 猪八戒 | 白羊座 | A | 白羊座,A |
| 凤姐 | 射手座 | A | 射手座,A |
+--------------+--------------+---------------+-------------+
select
star_dname ,
collect_list(name) names
from
(
select
* ,
concat(star,',',dname) as star_dname
from
tb_emp
) t
group by star_dname
;
+-------------+----------------+
| star_dname | names |
+-------------+----------------+
| 白羊座,B | ["宋宋"] |
| 射手座,A | ["娜娜","凤姐"] |
| 白羊座,A | ["孙悟空","猪八戒"] |
+-------------+----------------+
select
star_dname ,
concat_ws("|",collect_list(name)) namestrs
from
(
select
* ,
concat(star,',',dname) as star_dname
from
tb_emp
) t
group by star_dname
;
+-------------+-----------+
| star_dname | namestrs |
+-------------+-----------+
| 白羊座,B | 宋宋 |
| 射手座,A | 娜娜|凤姐 |
| 白羊座,A | 孙悟空|猪八戒 |
+-------------+-----------+
多列的数据合并到一行 行转列
数据
《疑犯追踪》 悬疑,动作,科幻,剧情
《Lie to me》 悬疑,警匪,动作,心理,剧情
《战狼2》 战争,动作,灾难
create table tb_movie(
name string ,
types string
)
row format delimited fields terminated by '\t' ;
load data local inpath '/data/tb_movie.txt' into table tb_movie ;
+----------------+-----------------+
| tb_movie.name | tb_movie.types |
+----------------+-----------------+
| 《疑犯追踪》 | 悬疑,动作,科幻,剧情 |
| 《Lie to me》 | 悬疑,警匪,动作,心理,剧情 |
| 《战狼2》 | 战争,动作,灾难 |
+----------------+-----------------+
select
explode(split(category , ","))
from
movie ;
+------+
| col |
+------+
| 悬疑 |
| 动作 |
| 科幻 |
| 剧情 |
| 悬疑 |
| 警匪 |
| 动作 |
| 心理 |
| 剧情 |
| 战争 |
| 动作 |
| 灾难 |
+------+
join
lateral view 是一种特殊的join关联 维护 explode数据关系
select
name,
col
from
movie
lateral view
explode(split(category , ",")) t ;
+--------------+------+
| name | col |
+--------------+------+
| 《疑犯追踪》 | 悬疑 |
| 《疑犯追踪》 | 动作 |
| 《疑犯追踪》 | 科幻 |
| 《疑犯追踪》 | 剧情 |
| 《Lie to me》 | 悬疑 |
| 《Lie to me》 | 警匪 |
| 《Lie to me》 | 动作 |
| 《Lie to me》 | 心理 |
| 《Lie to me》 | 剧情 |
| 《战狼2》 | 战争 |
| 《战狼2》 | 动作 |
| 《战狼2》 | 灾难 |
+--------------+------+
select
name,
category
from
tb_movie
lateral view -- 特殊的关联 维护关联条件
explode(split(types , ",")) t as category -- 虚拟表名t;
col 默认的字段名
[tb_movie join t]
+--------------+------+
| name | category |
+--------------+------+
| 《疑犯追踪》 | 悬疑 |
| 《疑犯追踪》 | 动作 |
| 《疑犯追踪》 | 科幻 |
| 《疑犯追踪》 | 剧情 |
| 《Lie to me》 | 悬疑 |
| 《Lie to me》 | 警匪 |
| 《Lie to me》 | 动作 |
| 《Lie to me》 | 心理 |
| 《Lie to me》 | 剧情 |
| 《战狼2》 | 战争 |
| 《战狼2》 | 动作 |
| 《战狼2》 | 灾难 |
+--------------+------+
窗口函数
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94
create table tb_order(
name string ,
ct string ,
money double
)
row format delimited fields terminated by ',' ;
load data local inpath '/root/data/orders.txt' into table tb_order ;
+----------------+--------------+-----------------+
| tb_order.name | tb_order.ct | tb_order.money |
+----------------+--------------+-----------------+
| jack | 2017-01-01 | 10.0 |
| tony | 2017-01-02 | 15.0 |
| jack | 2017-02-03 | 23.0 |
| tony | 2017-01-04 | 29.0 |
| jack | 2017-01-05 | 46.0 |
| jack | 2017-04-06 | 42.0 |
| tony | 2017-01-07 | 50.0 |
| jack | 2017-01-08 | 55.0 |
| mart | 2017-04-08 | 62.0 |
| mart | 2017-04-09 | 68.0 |
| neil | 2017-05-10 | 12.0 |
| mart | 2017-04-11 | 75.0 |
| neil | 2017-06-12 | 80.0 |
| mart | 2017-04-13 | 94.0 |
+----------------+--------------+-----------------+
-- 四月份有订单的总人数
select
count(1)
from
(select
name
from
tb_order
where substr(ct,0,7) = '2017-04'
group by name ) t
;
+------+
| _c0 |
+------+
| 2 |
+------+
select
name
from
tb_order
where substr(ct,0,7) = '2017-04'
group by name ;
+-------+
| name |
+-------+
| jack |
| mart |
+-------+
select
name
from
tb_order
where substr(ct,0,7) = '2017-04'
group by name ;
select
*
from
(
select
name
from
tb_order
where substr(ct,0,7) = '2017-04'
group by name
)t1
join
(
select
count(1)
from
(select
name
from
tb_order
where substr(ct,0,7) = '2017-04'
group by name ) t
) t2 ;
+----------+---------+
| t1.name | t2._c0 |
+----------+---------+
| jack | 2 |
| mart | 2 |
+----------+---------+
-- 分组 聚合 每个人的订单个数
select name , count(1) from tb_order group by name;
+-------+------+
| name | _c1 |
+-------+------+
| jack | 5 |
| mart | 4 |
| neil | 2 |
| tony | 3 |
+-------+------+
每个人的订单个数 以及订单明细
将明细拼接成一个字符串 , 收集到数组中 , 炸裂 拼接
select name , count(1) , collect_list(concat(naem , ct , moeny)) from tb_order group by name;
默认的聚合函数执行的范围是组内 没有分组就是全部
select sum(money) from tb_order ; -- sum执行的范围是全部
select * , sum(money) over() from tb_order ; -- sum执行的范围是全部
+----------------+--------------+-----------------+---------------+
| tb_order.name | tb_order.ct | tb_order.money | sum_window_0 |
+----------------+--------------+-----------------+---------------+
| mart | 2017-04-13 | 94.0 | 661.0 |
| neil | 2017-06-12 | 80.0 | 661.0 |
| mart | 2017-04-11 | 75.0 | 661.0 |
| neil | 2017-05-10 | 12.0 | 661.0 |
| mart | 2017-04-09 | 68.0 | 661.0 |
| mart | 2017-04-08 | 62.0 | 661.0 |
| jack | 2017-01-08 | 55.0 | 661.0 |
| tony | 2017-01-07 | 50.0 | 661.0 |
| jack | 2017-04-06 | 42.0 | 661.0 |
| jack | 2017-01-05 | 46.0 | 661.0 |
| tony | 2017-01-04 | 29.0 | 661.0 |
| jack | 2017-02-03 | 23.0 | 661.0 |
| tony | 2017-01-02 | 15.0 | 661.0 |
| jack | 2017-01-01 | 10.0 | 661.0 |
+----------------+--------------+-----------------+---------------+
-- 每个人的订单总金额和订单明细 分组聚合
select * , sum(money) over(partition by name) from tb_order ; -- 指定窗口的大小 name
(1)查询在2017年4月份购买过的顾客及总人数
select *,count(1) over ()from
(select distinct name from tb_order where substr(ct,0,7)
='2017-04')t;
+-------+
| name |
+-------+
| jack |
| mart |
+-------+
+---------+-----------------+
| t.name | count_window_0 |
+---------+-----------------+
| mart | 2 |
| jack | 2 |
+---------+-----------------+