I. Basic Table Operations
1. Create a database
create database db_name;
On success, Hive creates a directory under /user/hive/warehouse/
named db_name.db
2. Drop a database
drop database db_name;
3. List databases
show databases;
4. Switch to a database
use db_name;
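A minimal session sketch combining these commands (if not exists / cascade are standard HiveQL options; cascade is needed to drop a database that still contains tables):
-- create only if absent
create database if not exists db_name comment 'demo database';
show databases;
use db_name;
-- drop even if it still contains tables
drop database if exists db_name cascade;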
5. Create an internal (managed) table
use db_name;
-- columns are declared as (fieldName fieldType)
create table table_name (id int, name string, age int)
-- set the field delimiter
row format delimited
fields terminated by ',';
-- view the table schema
desc db_name.table_name;
After creation, Hive creates a table directory inside the warehouse: /user/hive/warehouse/db_name.db/table_name
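To check where Hive actually put the table and that it is managed, desc formatted (a standard Hive command) reports Location and Table Type:
desc formatted db_name.table_name;
-- look for: Location: .../user/hive/warehouse/db_name.db/table_name
-- and:      Table Type: MANAGED_TABLE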
6. Create an external table
create external table t_ex_1(id int,name string,age int)
row format delimited
fields terminated by ','
-- point at a directory on HDFS
location '/user/hive/external/t_ex_1';
Create some local test data, user.data:
1,xiaowang,28
2,xiaoli,18
3,xiaohong,23
Upload it to HDFS:
$ hdfs dfs -mkdir -p /user/hive/external/t_ex_1
$ hdfs dfs -put ./user.data /user/hive/external/t_ex_1
Now the data can be queried through the Hive table:
0: jdbc:hive2://Master:10000> select * from db_name.t_ex_1;
+-------------+---------------+--------------+--+
| t_ex_1.id | t_ex_1.name | t_ex_1.age |
+-------------+---------------+--------------+--+
| 1 | xiaowang | 28 |
| 2 | xiaoli | 18 |
| 3 | xiaohong | 23 |
+-------------+---------------+--------------+--+
3 rows selected (8 seconds)
⚠️ Note: dropping an external table does NOT delete the underlying files in HDFS.
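A quick way to verify this, reusing the paths from the example above: drop the table, then list the directory in HDFS — the data files are still there:
drop table t_ex_1;
-- back in the shell:
$ hdfs dfs -ls /user/hive/external/t_ex_1
Re-creating the external table over the same location makes the data queryable again.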
7. Importing data: the load command
Essentially this just places the data file into the table directory.
load data [local] inpath '/data/path' [overwrite] into table table_name;
local means the source file is on the local filesystem; without it, the path refers to HDFS.
Load a local file:
load data local inpath '/home/hadoop/user.data' into table table_name;
Load an HDFS file:
load data inpath '/user/hive/external/t_test1/user.data' into table table_name;
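Two details of load worth remembering (both standard Hive semantics): a local load copies the file, while an HDFS load moves it into the table directory; overwrite first clears the table's existing files. A sketch:
-- replaces whatever files the table directory currently holds
load data local inpath '/home/hadoop/user.data' overwrite into table table_name;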
8. Create a partitioned table: partitioned by (...)
create table t_test1(id int,name string,age int,create_time bigint)
-- partition by day and country
partitioned by (day string,country string)
row format delimited
fields terminated by ',';
Load data into specific partitions:
> load data [local] inpath '/data/path1' [overwrite] into table t_test1 partition(day='2019-06-04',country='China');
> load data [local] inpath '/data/path2' [overwrite] into table t_test1 partition(day='2019-06-05',country='China');
> load data [local] inpath '/data/path3' [overwrite] into table t_test1 partition(day='2019-06-04',country='England');
View the table's partitions:
show partitions t_test1;
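Each partition is stored as its own subdirectory, e.g. .../t_test1/day=2019-06-04/country=China, so filtering on partition columns prunes the scan to the matching directories. A sketch against the table above:
-- only the day=2019-06-04/country=China directory is read
select id,name,age
from t_test1
where day='2019-06-04' and country='China';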
II. Queries and Joins
0. Setup (prepare data, create tables, load data)
Prepare the data
a.txt
a,1
b,2
c,3
d,4
b.txt
b,16
c,17
d,18
e,19
Create the tables + load the data
create table t_a(name string,num int)
row format delimited
fields terminated by ',';
create table t_b (name string,age int)
row format delimited
fields terminated by ',';
load data local inpath '/home/hadoop/a.txt' into table t_a;
load data local inpath '/home/hadoop/b.txt' into table t_b;
Current table contents
0: jdbc:hive2://Master:10000> select * from t_a;
+-----------+----------+--+
| t_a.name | t_a.num |
+-----------+----------+--+
| a | 1 |
| b | 2 |
| c | 3 |
| d | 4 |
+-----------+----------+--+
4 rows selected (0.523 seconds)
0: jdbc:hive2://Master:10000> select * from t_b;
+-----------+----------+--+
| t_b.name | t_b.age |
+-----------+----------+--+
| b | 16 |
| c | 17 |
| d | 18 |
| e | 19 |
+-----------+----------+--+
4 rows selected (0.482 seconds)
1. Querying: select
select * from table_name where conditions;
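For example, against the tables above (where / order by / limit are standard clauses):
select name,num
from t_a
where num >= 2
order by num desc
limit 2;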
2. Join queries
Inner join: t_a join t_b on condition
select a.*,b.* -- the columns shown in the result
from
t_a a join t_b b on a.name=b.name; -- "t_a a" aliases table t_a as a, shortening the statement
The inner join result is the intersection of table a and table b: only rows whose join key appears in both.
0: jdbc:hive2://Master:10000> select a.*,b.*
0: jdbc:hive2://Master:10000> from
0: jdbc:hive2://Master:10000> t_a a join t_b b on a.name=b.name;
....
+---------+--------+---------+--------+--+
| a.name | a.num | b.name | b.age |
+---------+--------+---------+--------+--+
| b | 2 | b | 16 |
| c | 3 | c | 17 |
| d | 4 | d | 18 |
+---------+--------+---------+--------+--+
Left outer join: t_a left outer join t_b on condition
select a.*,b.*
from
t_a a left outer join t_b b on a.name=b.name;
A left outer join keeps every row of the left table; where the right table has no match, the right-side columns are filled with NULL.
0: jdbc:hive2://Master:10000> select a.*,b.*
0: jdbc:hive2://Master:10000> from
0: jdbc:hive2://Master:10000> t_a a left outer join t_b b on a.name=b.name;
...
+---------+--------+---------+--------+--+
| a.name | a.num | b.name | b.age |
+---------+--------+---------+--------+--+
| a | 1 | NULL | NULL |
| b | 2 | b | 16 |
| c | 3 | c | 17 |
| d | 4 | d | 18 |
+---------+--------+---------+--------+--+
Right outer join
Same as the left outer join, with the left/right roles swapped.
Full outer join: t_a full outer join t_b on condition
Rows from both sides are all kept; missing values are filled with NULL.
select a.*,b.*
from
t_a a full outer join t_b b on a.name=b.name;
----
+---------+--------+---------+--------+--+
| a.name | a.num | b.name | b.age |
+---------+--------+---------+--------+--+
| a | 1 | NULL | NULL |
| b | 2 | b | 16 |
| c | 3 | c | 17 |
| d | 4 | d | 18 |
| NULL | NULL | e | 19 |
+---------+--------+---------+--------+--+
Left semi join: t_a left semi join t_b on condition
Returns the rows of table a whose join key also exists in table b. Only left-table columns can be selected.
select a.*
from
t_a a left semi join t_b b on a.name=b.name;
----
+---------+--------+--+
| a.name  | a.num  |
+---------+--------+--+
| b       | 2      |
| c       | 3      |
| d       | 4      |
+---------+--------+--+
Q: In Hive, what is the difference between an inner join and a left semi join?
inner join and left semi join normally return the same number of rows; the counts differ only when the right table R contains duplicate rows.
When R has duplicates, JOIN ON emits one output row for every matching pair, because each pairing satisfies the ON condition. LEFT SEMI JOIN returns a left-table row as soon as the first match is found and stops probing R, so duplicates in R never produce duplicate output rows.
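A sketch that makes the difference visible, assuming a Hive version with INSERT ... VALUES support (0.14+):
-- add a duplicate of row (b,16) to t_b
insert into t_b values ('b',16);
-- inner join: row b now appears twice in the output
select a.* from t_a a join t_b b on a.name=b.name;
-- left semi join: row b still appears exactly once
select a.* from t_a a left semi join t_b b on a.name=b.name;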
III. group by and having
0. Setup
Create a partitioned table and load data:
create table t_pv(ip string,url string,time string)
partitioned by (dt string)
row format delimited
fields terminated by ',';
load data local inpath '/home/hadoop/pv.log.0804' into table t_pv partition(dt='2019-08-04');
load data local inpath '/home/hadoop/pv.log.0805' into table t_pv partition(dt='2019-08-05');
load data local inpath '/home/hadoop/pv.log.0806' into table t_pv partition(dt='2019-08-06');
View the data:
0: jdbc:hive2://Master:10000> select * from t_pv;
+----------------+------------------------------+----------------------+-------------+--+
| t_pv.ip | t_pv.url | t_pv.time | t_pv.dt |
+----------------+------------------------------+----------------------+-------------+--+
| 192.168.33.3 | http://www.xxx.cn/stu | 2019-08-04 15:30:20 | 2019-08-04 |
| 192.168.33.3 | http://www.xxx.cn/teach | 2019-08-04 15:35:20 | 2019-08-04 |
| 192.168.33.4 | http://www.xxx.cn/stu | 2019-08-04 15:30:20 | 2019-08-04 |
| 192.168.33.4 | http://www.xxx.cn/job | 2019-08-04 16:30:20 | 2019-08-04 |
| 192.168.33.5 | http://www.xxx.cn/job | 2019-08-04 15:40:20 | 2019-08-05 |
| 192.168.33.3 | http://www.xxx.cn/stu | 2019-08-05 15:30:20 | 2019-08-05 |
| 192.168.44.3 | http://www.xxx.cn/teach | 2019-08-05 15:35:20 | 2019-08-05 |
| 192.168.33.44 | http://www.xxx.cn/stu | 2019-08-05 15:30:20 | 2019-08-05 |
| 192.168.33.46 | http://www.xxx.cn/job | 2019-08-05 16:30:20 | 2019-08-05 |
| 192.168.33.55 | http://www.xxx.cn/job | 2019-08-05 15:40:20 | 2019-08-06 |
| 192.168.133.3 | http://www.xxx.cn/register | 2019-08-06 15:30:20 | 2019-08-06 |
| 192.168.111.3 | http://www.xxx.cn/register | 2019-08-06 15:35:20 | 2019-08-06 |
| 192.168.34.44 | http://www.xxx.cn/pay | 2019-08-06 15:30:20 | 2019-08-06 |
| 192.168.33.46 | http://www.xxx.cn/excersize | 2019-08-06 16:30:20 | 2019-08-06 |
| 192.168.33.55 | http://www.xxx.cn/job | 2019-08-06 15:40:20 | 2019-08-06 |
| 192.168.33.46 | http://www.xxx.cn/excersize | 2019-08-06 16:30:20 | 2019-08-06 |
| 192.168.33.25 | http://www.xxx.cn/job | 2019-08-06 15:40:20 | 2019-08-06 |
| 192.168.33.36 | http://www.xxx.cn/excersize | 2019-08-06 16:30:20 | 2019-08-06 |
| 192.168.33.55 | http://www.xxx.cn/job | 2019-08-06 15:40:20 | 2019-08-06 |
+----------------+------------------------------+----------------------+-------------+--+
View the partitions:
0: jdbc:hive2://Master:10000> show partitions t_pv;
+----------------+--+
| partition |
+----------------+--+
| dt=2019-08-04 |
| dt=2019-08-05 |
| dt=2019-08-06 |
+----------------+--+
3 rows selected (0.575 seconds)
1. Count the visits per URL: group by
select url ,count(1) as count
from t_pv
group by url;
+------------------------------+------+--+
| url | count|
+------------------------------+------+--+
| http://www.xxx.cn/excersize | 3 |
| http://www.xxx.cn/job | 7 |
| http://www.xxx.cn/pay | 1 |
| http://www.xxx.cn/register | 2 |
| http://www.xxx.cn/stu | 4 |
| http://www.xxx.cn/teach | 2 |
+------------------------------+------+--+
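Counting distinct visitors instead of raw hits just means swapping the aggregate (count(distinct ...) is standard HiveQL):
select url,count(distinct ip) as visitors
from t_pv
group by url;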
2. For each day after Aug 4, find the total number of visits to http://www.xxx.cn/job and the largest visitor IP (compared as strings)
-- filtering the grouped column in having
select dt,url,count(1),max(ip)
from t_pv
where url='http://www.xxx.cn/job'
group by dt,url having dt>'2019-08-04';
-- filtering in where (preferred: rows are dropped before grouping)
select dt,url,count(1),max(ip)
from t_pv
where url='http://www.xxx.cn/job' and dt>'2019-08-04'
group by dt,url;
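having earns its keep when the condition involves an aggregate, which where cannot express. A sketch against t_pv:
-- keep only URLs with more than 2 total visits
select url,count(1) as cnt
from t_pv
group by url
having count(1) > 2;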
IV. Basic Data Types
Numeric types
- TINYINT (1-byte signed integer, from -128 to 127)
- SMALLINT (2-byte signed integer, from -32,768 to 32,767)
- INT/INTEGER (4-byte signed integer, from -2,147,483,648 to 2,147,483,647)
- BIGINT (8-byte signed integer, from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807)
- FLOAT (4-byte single-precision floating point number)
- DOUBLE (8-byte double-precision floating point number)
Date types
- TIMESTAMP (Note: Only available starting with Hive 0.8.0)
- DATE (Note: Only available starting with Hive 0.12.0)
String types
- STRING
- VARCHAR (Note: Only available starting with Hive 0.12.0)
- CHAR (Note: Only available starting with Hive 0.13.0)
Complex types (array<Type>; map<keyType,valueType>; struct - a user-defined structure)
1. array type
Raw data, actor.dat:
玩具总动员4,汤姆·汉克斯:蒂姆·艾伦:安妮·波茨,2019-06-21
流浪地球,屈楚萧:吴京:李光洁:吴孟达,2019-02-05
千与千寻,柊瑠美:入野自由:夏木真理:菅原文太,2019-06-21
战狼2,吴京:弗兰克·格里罗:吴刚:张翰:卢靖姗,2017-08-16
-- create the table mapping the data:
create table t_movie(movie_name string,actors array<string>,first_show date)
row format delimited fields terminated by ','
collection items terminated by ':';
-- load the data
load data local inpath '/home/hadoop/actor.dat' into table t_movie;
-- show the data
select * from t_movie;
+---------------------+-----------------------------------+---------------------+--+
| t_movie.movie_name | t_movie.actors | t_movie.first_show |
+---------------------+-----------------------------------+---------------------+--+
| 玩具总动员4 | ["汤姆·汉克斯","蒂姆·艾伦","安妮·波茨"] | 2019-06-21 |
| 流浪地球 | ["屈楚萧","吴京","李光洁","吴孟达"] | 2019-02-05 |
| 千与千寻 | ["柊瑠美","入野自由","夏木真理","菅原文太"] | 2019-06-21 |
| 战狼2 | ["吴京","弗兰克·格里罗","吴刚","张翰","卢靖姗"] | 2017-08-16 |
+---------------------+-----------------------------------+---------------------+--+
Some operations:
actors[i] – access an element by index
Query each movie's lead (first-listed) actor:
select movie_name,actors[0],first_show from t_movie;
array_contains – membership test
Find movies whose cast includes '吴京':
select movie_name,actors,first_show
from t_movie where array_contains(actors,'吴京');
size – array length
Query the number of listed actors per movie:
select movie_name
,size(actors) as actor_number
,first_show
from t_movie;
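To go from one row per movie to one row per actor, Hive's lateral view explode(...) flattens the array (a sketch against t_movie; the alias names t and actor are arbitrary):
-- one output row per (movie, actor) pair
select movie_name,actor
from t_movie lateral view explode(actors) t as actor;
-- e.g. count how many of these movies each actor appears in
select actor,count(1) as cnt
from t_movie lateral view explode(actors) t as actor
group by actor;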
2. map type (key-value pairs)
Data
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26
Load the data
-- create a table mapping the data above
create table t_family(id int,name string,family_members map<string,string>,age int)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';
-- load the data
load data local inpath '/root/hivetest/fm.dat' into table t_family;
-- show the data
select * from t_family;
+--------------+----------------+----------------------------------------------------------------+---------------+--+
| t_family.id  | t_family.name  | t_family.family_members                                       | t_family.age  |
+--------------+----------------+----------------------------------------------------------------+---------------+--+
| 1 | zhangsan | {"father":"xiaoming","mother":"xiaohuang","brother":"xiaoxu"} | 28 |
| 2 | lisi | {"father":"mayun","mother":"huangyi","brother":"guanyu"} | 22 |
| 3 | wangwu | {"father":"wangjianlin","mother":"ruhua","sister":"jingtian"} | 29 |
| 4 | mayun | {"father":"mayongzhen","mother":"angelababy"} | 26 |
+--------------+----------------+----------------------------------------------------------------+---------------+--+
Get each person's father and sister – map["key"] access
select id,name,family_members["father"] as father,family_members["sister"] as sister,age
from t_family;
List the relation types each person has – map_keys()
select id,name,map_keys(family_members) as relations,age
from t_family;
Count each person's family members – size()
select id,name,size(family_members) as relations,age
from t_family;
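map_values() is the counterpart of map_keys(), returning the values as an array (a quick sketch):
-- list the names of each person's family members
select id,name,map_values(family_members) as members
from t_family;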
Find everyone who has a brother, and who that brother is:
-- Option 1: a single statement
select id,name,age,family_members['brother']
from t_family where array_contains(map_keys(family_members),'brother');
-- Option 2: a subquery
select id,name,age,family_members['brother']
from
(select id,name,age,map_keys(family_members) as relations,family_members
from t_family) tmp
where array_contains(relations,'brother');
3. struct: a user-defined structure type
Data
1,zhangsan,18:male:深圳
2,lisi,28:female:北京
3,wangwu,38:male:广州
4,laowang,26:female:上海
5,yangyang,35:male:杭州
Load the data:
-- create a table mapping the data above
drop table if exists t_user;
create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':';
-- load the data
load data local inpath '/home/hadoop/user.dat' into table t_user;
-- show the data
select * from t_user;
+------------+--------------+----------------------------------------+--+
| t_user.id | t_user.name | t_user.info |
+------------+--------------+----------------------------------------+--+
| 1 | zhangsan | {"age":18,"sex":"male","addr":"深圳"} |
| 2 | lisi | {"age":28,"sex":"female","addr":"北京"} |
| 3 | wangwu | {"age":38,"sex":"male","addr":"广州"} |
| 4 | laowang | {"age":26,"sex":"female","addr":"上海"} |
| 5 | yangyang | {"age":35,"sex":"male","addr":"杭州"} |
+------------+--------------+----------------------------------------+--+
Query each person's id, name, and address
select id,name,info.addr
from t_user;
info.addr — field access, much like in object-oriented languages
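Struct fields can appear anywhere a plain column can, including filters. A sketch against t_user:
-- everyone older than 30, with their city
select id,name,info.age,info.addr
from t_user
where info.age > 30;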
V. Common Built-in Functions
Date/time handling
from_unixtime(1496483430,'yyyy-MM-dd HH:mm:ss')
returns: '2017-06-03 17:50:30' (the argument is a Unix timestamp in seconds; the value shown assumes a UTC+8 session timezone)
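unix_timestamp() goes the other way, parsing a formatted string back into seconds (both are standard Hive UDFs):
select unix_timestamp('2017-06-03 17:50:30','yyyy-MM-dd HH:mm:ss');
-- current time as a Unix timestamp
select unix_timestamp();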
Type conversion functions
select cast("8" as int);
select cast("2019-02-03" as date);
String slicing and concatenation
substr("abcde",1,3) --> 'abc' (Hive string positions start at 1)
concat('abc','def') --> 'abcdef'
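concat_ws joins strings with a separator and also accepts an array (both standard Hive UDFs):
select concat_ws('-','abc','def');        -- 'abc-def'
select concat_ws(':',array('a','b','c')); -- 'a:b:c'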
Reference – common Hive functions: https://juejin.im/post/6844903877326667784