Apache Hive SQL-入门快速上手-学习清单

最新推荐文章于 2025-03-10 00:01:38 发布

hulnnn

最新推荐文章于 2025-03-10 00:01:38 发布

阅读量1.3k

点赞数 17

文章标签：大数据 hive sql

本文链接：https://blog.youkuaiyun.com/hulknnn/article/details/137170035

版权

本文详细介绍了Apache Hive SQL的使用，包括进入Hive流程、DDL建表、数据类型、表元数据查看等。还阐述了内部表与外部表区别、分区表和分桶表的创建与使用，以及数据库、表结构、分区的DDL操作，还有查询、加载、插入等操作。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Apache Hive SQL

进入Hive流程

启动matastore服务 --脚本
启动hiveserver2服务 --脚本
检测 --jps (两个RunJar 启动)

beeline客户端连接--

[root@node1 ~]# /export/server/hive-3.1.2/bin/beeline
beeline> ! connect jdbc:hive2://node1:10000
beeline> root
beeline> 直接回车
0: jdbc:hive2://node1:10000> show databases;
+----------------+
| database_name  |
+----------------+
| default        |
+----------------+
1 row selected (1.242 seconds)

Hive SQL --DDL--建表

create table 表名

creat table if not exists 表名 --加上if not exists 忽略异常

Hive的数据类型

Hive 支持SQL类型外,还支持java数据类型,还支持复合类型(array数组 map映射)

在建表的时候,最好表的字段类型要和文件中的类型保持一致(如果不一致,Hive 会尝试进行类型隐式转换,不保证转换成功,不成功会显示null值)

--案例
--创建数据库并切换使用
create database if not exists itheima;
use itheima;

--建表
create table t_archer(
      id int comment "ID",
      name string comment "英雄名称",--comment 说明
      hp_max int comment "最大生命",
      mp_max int comment "最大法力",
      attack_max int comment "最高物攻",
      defense_max int comment "最大物防",
      attack_range string comment "攻击范围",
      role_main string comment "主要定位",
      role_assist string comment "次要定位"
) comment "王者荣耀射手信息"
row format delimited --行格式分隔
fields terminated by "\t";--字段终止与

comment 说明

row 行;format 格式;delimited 限定

fields 字段;terminated 终止

查看表的元数据信息

desc formatted 表名

--查看表数据 查看表的元数据信息
select * from t_archer;
desc formatted t_archer;

formatted 格式化

上传文件到表对应的HDFS目录下

-- todo 加载数据 方式一 local inpath 从linux中加载, 复制效果
load data local inpath '/root/data/01_archer.txt' into table db_2.tb_archer;

-- todo 加载数据 方式二 inpath 从 hdfs中加载, 剪切效果,
load data inpath '/67_archer.txt' into table db_2.tb_archer;
-- todo 注意: cdh(理解 hadoop的发行版)中 只支持 load data inpath

-- todo 加载数据 方式三
[root@node1 data]# hdfs dfs -put 01_archer.txt /user/hive/warehouse/db_2.db/tb_archer

Hive读写HDFS上文件

--创建表 --表后面增加字段之间的分隔符或指定map类型kv之间的分隔符--上传文件

分割符

ROW FORMAT DELIMITED具体的子语法

row format delimited 表示使用LazySimpleSerDe类进行序列化解析数据

fields terminated by ',' 指定字段之间的分隔符 collection items terminated by '-' 指定集合元素之间的分隔符 map keys terminated by ':' 指定map类型kv之间的分隔符

默认分隔符 \001默认分隔符
--案例:
--第一步 建表
create table t_hot_hero_skin_price(
      id int,
      name string,
      win_rate int,
      skin_price map<string,int>
)
row format delimited
fields terminated by ',' --字段之间分隔符
collection items terminated by '-'  --集合元素之间分隔符
map keys terminated by ':'; --集合元素kv之间分隔符;
--第二步 上传数据
--方式一 linux上操作 上传文件
hadoop fs -put hot_hero_skin_price.txt /user/hive/warehouse/itheima.db/t_hot_hero_skin_price ;
--方式二 hive本地加载Linux文件
load data local inpath '/root/data/hot_hero_skin_price.txt' into table t_hot_hero_skin_price ;

内部表,外部表

表的组成:元数据+数据文件

内部表:删除的时候,既要删除元数据,又要删除数据文件,应用场景:部门内部操作的文件.

外部表:删除的时候,只删除元数据,数据文件本身不删除,应用场景:多个不同部分共享的文件

--创建内部表
create table student_inner(
    Sno int,
    Sname string,
    Sex string,
    Sage int,
    Sdept string) 
    row format delimited 
    fields terminated by ',';

--创建外部表关键字external--

--创建外部表 关键字--external--
create external table student_external(
    Sno int,Sname string,Sex string,Sage int,Sdept string) 
    row format delimited 
    fields terminated by ',';

--内部表在删除的时候元数据和数据都会被删除

--外部表在删除的时候只删除元数据而HDFS上的数据文件不会动

外部表有什么好处最大的好处是防止误操作删除表的时候把表的数据一起删除.

可以通过desc formatted table_name;去查询表的元数据信息获取表的类型

desc formatted table_name;

MANAGED_TABLE     内部表、受控表
EXTERNAL_TABLE    外部表

表数据在HDFS上储存路径

储存路径由 hive.metastore.warehouse.dir 属性指定.默认值是:/user/hive/warehouse

不管是内部表，还是外部表，在HDFS上的路径如下：
/user/hive/warehouse/itcast.db/t_array
/user/hive/warehouse/数据库名.db/表名

Hive的分区表

-创建表(partitioned by(**))-分区表的数据加载(静态分区加载/动态分区加载)-分区表使用

分区表的创建

partitioned by()

--分区表建表 
create table t_all_hero_part(
   id int,
   name string,
   hp_max int,
   mp_max int,
   attack_max int,
   defense_max int,
   attack_range string,
   role_main string,
   role_assist string
) 
partitioned by (role string)--注意哦 这里是分区字段 分区字段不能与表字段重复
row format delimited
fields terminated by "\t";

静态分区表的数据加载

--静态加载分区表数据
load data local inpath '/root/hivedata/archer.txt' into table t_all_hero_part partition(role='sheshou');
load data local inpath '/root/hivedata/assassin.txt' into table t_all_hero_part partition(role='cike');
load data local inpath '/root/hivedata/mage.txt' into table t_all_hero_part partition(role='fashi');
load data local inpath '/root/hivedata/support.txt' into table t_all_hero_part partition(role='fuzhu');
load data local inpath '/root/hivedata/tank.txt' into table t_all_hero_part partition(role='tanke');
load data local inpath '/root/hivedata/warrior.txt' into table t_all_hero_part partition(role='zhanshi');

--查询一下验证是否加载成功
select * from t_all_hero_part;

动态分区加载

设置允许动态分区、设置动态分区模式

--动态分区
set hive.exec.dynamic.partition=true; --注意hive3已经默认开启了
set hive.exec.dynamic.partition.mode=nonstrict;

--模式分为strict严格模式  nonstrict非严格模式
严格模式要求 分区字段中至少有一个分区是静态分区。

动态分区加载数据

格式:insert into table 加载的表名 partition(分区名) select *,原表.字段名 from 原表名

插入的数据来自于后面的查询语句返回的结果。

查询返回的内容，其字段类型、顺序、个数要和待插入的表保持一致。

--创建一张新的分区表 t_all_hero_part_dynamic
create table t_all_hero_part_dynamic(
    id int,
    name string,
    hp_max int,
    mp_max int,
    attack_max int,
    defense_max int,
    attack_range string,
    role_main string,
    role_assist string
) partitioned by (role string)
row format delimited
fields terminated by "\t";

--执行动态分区插入  --注意 分区值并没有手动写死指定
insert into table --insert into table
t_all_hero_part_dynamic --向那张表加载 
partition(role) --partition(分区名字)
select tmp.*,--select *
tmp.role_main  --被加载表名.对应字段
from t_all_hero tmp;--from 被加载的表名

--查询验证结果
select * from t_all_hero_part_dynamic;

分区表的使用

--非分区表 全表扫描过滤查询
select count(*) from t_all_hero where role_main="archer" and hp_max >6000;

--分区表 先基于分区过滤 再查询
select count(*) from t_all_hero_part where role="sheshou" and hp_max >6000;

分区表注意事项

分区表的字段不能是表中已有的字段
以文件夹管理不同的文件
创建分区表之后，使用分区字段查询可以减少全表扫描，提高查询的效率

多重分区表

多重分区表之间是一种递进关系,可以理解为前一个分区的基础上继续分区 ,常见的多分区就是2个分区.

--创建表
create table 
t_user_double_p(id int,name string,country string) 
partitioned by --固定格式
(guojia string,sheng string) --分区1 类型,分区2 类型
row format delimited 
fields terminated by  ',';

--加载数据到多分区表中
load data local inpath --本地加载固定格式
'/root/hivedata/china_sh.txt'  --文件地址
into table t_user_double_p --
partition(guojia="zhongguo",sheng="shanghai");--partition(分区1='',分区2='')

load data local inpath '/root/hivedata/china_sz.txt'  into table t_user_double_p partition(guojia="zhongguo",sheng="shenzhen");

load data local inpath '/root/hivedata/usa_dezhou.txt'  into table t_user_double_p partition(guojia="meiguo",sheng="dezhou");

--查询来自于中国深圳的用户有哪些？
select * from t_user_double_p where guojia="zhongguo"and sheng="shenzhen";

Hive分桶表

从语法层面解析分桶含义

CLUSTERED BY xxx INTO N BUCKETS
--根据xxx字段把数据分成N桶
--根据表中的字段把数据文件成为N个部分

t_user(id int,name string);
--1、根据谁分？
 CLUSTERED BY xxx ；  xxx必须是表中的字段
--2、分成几桶？
 N BUCKETS   ；N的值就是分桶的个数
--3、分桶的规则？
clustered by id into 3 bucket

hashfunc(分桶字段)  %  N bucket  余数相同的来到同一个桶中
1、如果分桶的字段是数字类型的字段，hashfunc(分桶字段)=分桶字段本身
2、如果分桶的字段是字符串或者其他字段，hashfunc(分桶字段) = 分桶字段.hashcode

分桶表创建

--创建表
--根据state州分为5桶 每个桶内根据cases确诊病例数倒序排序
CREATE TABLE itheima.t_usa_covid19_bucket_sort(
     count_date string,
     county string,
     state string,
     fips int,
     cases int,
     deaths int)
CLUSTERED BY(state)--根据(state)进行分桶
sorted by (cases desc) --根据(cases 倒序)
INTO 5 BUCKETS;--into 分 5 桶

分桶表的数据加载

--step1:开启分桶的功能 从Hive2.0开始不再需要设置
set hive.enforce.bucketing=true;

--step2:把源数据加载到普通hive表中
CREATE TABLE itheima.t_usa_covid19(
       count_date string,
       county string,
       state string,
       fips int,
       cases int,
       deaths int)
row format delimited fields terminated by ",";

--将源数据上传到HDFS，t_usa_covid19表对应的路径下
hadoop fs -put us-covid19-counties.dat /user/hive/warehouse/itheima.db/t_usa_covid19

--step3:使用insert+select语法将数据加载到分桶表中
insert into t_usa_covid19_bucket select * from t_usa_covid19;

select * from t_usa_covid19_bucket;

分桶表的使用

--基于分桶字段state查询来自于New York州的数据
--不再需要进行全表扫描过滤
--根据分桶的规则hash_function(New York) mod 5计算出分桶编号
--查询指定分桶里面的数据 就可以找出结果  此时是分桶扫描而不是全表扫描
select *
from t_usa_covid19_bucket where state="New York";

分桶的总结

分桶表也是一种优化表，可以减少join查询时笛卡尔积的数量、提高抽样查询的效率。
分桶表的字段必须是表中已有的字段；
分桶表需要使用间接的方式才能把数据加载进入：insert+select
在join的时候，针对join的字段进行分桶，可以提高join的效率减少笛卡尔积数量。

Hive SQL--DDL其他操作

Database 数据库 DDL操作

建库

--建库
create database if not exists db_7
comment '电商库' --commont注释名字
with dbproperties('createdBy'='zhangsan')--设置键值对属性值;

看数据库属性

-- 查看库的描述信息
desc database db_7;

-- 查看库的详细描述信息
desc database extended db_7;
-- 查看库的详细描述信息 desc database extended db_7;

更改数据库属性

--更改数据库属性
alter database db_7 set dbproperties ('createBy'='list_1')

更改数据库所有者

alter database db_7 set owner user user_1;

更改数据库位置

alter database db_7 set location  'hdfs://node1:8020/bj_67';

表结构的操作

删除表

drop table if exists db_7.tb_student;

查看表的详细信息

-- todo 2 查看建表语句
show create table db_4.t_user;

更改表的注释

-- todo 3 更改表的注释
alter table db_4.t_user 
set tblproperties('comment'='this is qq info table');
show create table db_4.t_user;

更改表的存储位置

alter table db_4.t_user 
set location '/test/data/t_user_2';

更改列的名称/类型

CREATE TABLE t1_change (a int, b int, c int);
alter table t1_change change b b1 string;

Partition 分区 DDL操作

比较重要的是增加分区，删除分区操作

增加分区--创建分区表--load加载

--创建数据库
create database if not exists db_8;
use db_8;
--创建 含有分区的表
create table db_8.tb_student_partition
(
    id   int,
    name string,
    sex  string,
    age  int,
    dept string
)
partitioned by (dt string)
row format delimited fields terminated by ',';
--linux 本地加载数据 partition 数据
load data local inpath '/root/data/students.txt' into table db_8.tb_student_partition partition (dt='2023-01-01');

--在linux系统添加分区 向分区内 上传表
hdfs dfs -mkdir -p  /user/hive/warehouse/db_8.db/tb_student_partition/dt=2023-01-02

hdfs dfs -put /root/data/04_students.txt /user/hive/warehouse/db_8.db/tb_student_partition/dt=2023-01-02

增加分区表

--在现有的分区表,增加新分区
alter table db_8.tb_student_partition 
add partition (dt='2023-01-03');

查看表内的分区

--查看表内的分区
show partitions db_8.tb_student_partition;

修改表内分区

--修改分区
alter table db_8.tb_student_partition partition (dt='2023-01-03') rename to partition (dt='2023-06-01');

删除表内分区

--删除分区
alter table db_8.tb_student_partition drop partition (dt='2023-06-01');

查询

--1、显示所有数据库 SCHEMAS和DATABASES的用法 功能一样
show databases;
show schemas;

--2、显示当前数据库所有表/视图/物化视图/分区/索引
show tables;
SHOW TABLES [IN database_name]; --指定某个数据库

--3、显示当前数据库下所有视图
--视图相当于没有数据临时表  虚拟表
Show Views;
SHOW VIEWS 'test_*'; -- show all views that start with "test_"
SHOW VIEWS FROM test1; -- show views from database test1
SHOW VIEWS [IN/FROM database_name];

--4、显示当前数据库下所有物化视图
SHOW MATERIALIZED VIEWS [IN/FROM database_name];

--5、显示表分区信息，分区按字母顺序列出，不是分区表执行该语句会报错
show partitions table_name;
show partitions itheima.student_partition;

show partitions db_8.tb_student_partition;

--6、显示表/分区的扩展信息
SHOW TABLE EXTENDED [IN|FROM database_name] LIKE table_name;
show table extended like student;
describe formatted itheima.student;

desc formatted db_8.tb_student_partition;

--7、显示表的属性信息
SHOW TBLPROPERTIES table_name;
show tblproperties student;

--8、显示表、视图的创建语句
SHOW CREATE TABLE ([db_name.]table_name|view_name);
show create table student;

show create table db_8.tb_student_partition;

--9、显示表中的所有列，包括分区列。
SHOW COLUMNS (FROM|IN) table_name [(FROM|IN) db_name];
show columns  in student;

show columns in db_8.tb_student_partition;
desc db_8.tb_student_partition;

--10、显示当前支持的所有自定义和内置的函数
show functions;

--11、Describe desc
--查看表信息
desc extended table_name;
--查看表信息（格式化美观）
desc formatted table_name;
--查看数据库相关信息
describe database database_name;

load 重点

create table db_9.tb_student_linux(
    id int,
    name string,
    sex string,
    age int,
    dept string
)
row format delimited fields terminated by ','
;
-- 目标1: 使用load 从 linux的文件 加载数据
load data local inpath '/root/data/students.txt' into table db_9.tb_student_linux;
select * from tb_student_linux;

create table db_9.tb_student_hdfs(
    id int,
    name string,
    sex string,
    age int,
    dept string
)
row format delimited fields terminated by ',';
-- 目标2: 使用load 从 hdfs的文件 加载数据
hdfs dfs -put /root/data/04_students.txt /bj_67

load data inpath '/bj_67/students.txt' into table db_9.tb_student_hdfs;

select * from tb_student_hdfs;

create table db_9.tb_student_linux_partition(
    id int,
    name string,
    sex string,
    age int,
    dept string
)
partitioned by (dt string)
row format delimited fields terminated by ',';
-- 目标3: 使用load 向分区表 加载数据
load data local inpath '/root/data/04_students.txt' into table tb_student_linux_partition partition(dt='2023-01-01');
select * from tb_student_linux_partition;

load data local inpath '/root/data/04_students.txt' into table tb_student_linux_partition partition(dt='2023-01-01');
select * from tb_student_linux_partition;

load data local inpath '/root/data/04_students.txt' overwrite into table tb_student_linux_partition partition(dt='2023-01-01');
select * from tb_student_linux_partition;

覆盖效果 overwrite

-- 目标4: 覆写效果
-- todo 问题: 覆写 就旧的都删掉, 将新的放进去 对不对?
load data local inpath '/root/data/04_students.txt' into table tb_student_linux_partition partition(dt='2023-01-01');
load data local inpath '/root/data/04_students.txt' into table tb_student_linux_partition partition(dt='2023-01-02');
load data local inpath '/root/data/04_students.txt' into table tb_student_linux_partition partition(dt='2023-01-03');

load data local inpath '/root/data/04_students.txt' overwrite into table tb_student_linux_partition partition(dt='2023-01-01');

insert_select

-- 准备工作
create database if not exists db_10;
use db_10;

create table db_10.tb_student(
    id int,
    name string,
    sex string,
    age int,
    dept string
)
row format delimited fields terminated by ',';

load data local inpath '/root/data/04_students.txt' into table db_10.tb_student;

create table db_10.tb_student_2(
    id int,
    name string,
    sex string,
    age int,
    dept string
)
row format delimited fields terminated by ',';

drop table if exists db_10.tb_student_partition;

create table db_10.tb_student_partition(
    id int,
    name string,
    sex string,
    age int,
    dept string
)
partitioned by (dept_info string)
row format delimited fields terminated by ',';

select * from tb_student;

-- 目标1 使用 insert + select 向普通表插入数据
insert into db_10.tb_student_2
select * from tb_student
where dept in ('IS', 'CS');

select * from tb_student_2;

-- 目标2 使用 insert + select 向分区表插入数据
insert into db_10.tb_student_partition partition(dept_info)
select
    id,
    name,
    sex,
    age,
    dept,
    dept as dept_info
from tb_student
where dept in ('IS', 'CS');

-- 注意: 分区列 应该放到 普通列的后面 且 跟建表的列的顺序保持一致

select * from tb_student_partition;

-- 目标3 使用 insert overwrite + select 向普通表插入数据
insert overwrite table db_10.tb_student_2
select * from tb_student
where age>18;

-- 目标4 使用 insert overwrite + select 向分区表插入数据
set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table db_10.tb_student_partition partition(dept_info)
select
    id,
    name,
    sex,
    age,
    dept,
    dept as dept_info
from tb_student
where dept in ('IS', 'MA') and age>20;

多重插入

-- todo 目标: 多重插入
-- todo 1 创建表1
create table db_10.tb_student_3(
    id int,
    name string
);
-- todo 2 创建表2
create table db_10.tb_student_4(
    id int,
    dept string
);

-- todo 3 分解式 插入数据
insert into db_10.tb_student_3
select
    id,
    name
from db_10.tb_student;

insert into db_10.tb_student_4
select
    id,
    dept
from db_10.tb_student;

-- todo 问题: 多次插入 都需要 扫描同一个表?
-- todo 解决: 多次插入 只需要 扫描一次

-- todo 清空表的内容
truncate table db_10.tb_student_3;
truncate table db_10.tb_student_4;

-- todo 4 合并式 插入数据
from db_10.tb_student
insert into db_10.tb_student_3
select
    id,
    name
insert into db_10.tb_student_4
select
    id,
    dept;

导出

-- todo 目标: 导出
select * from db_10.tb_student where sex='女';
-- todo 1 将内容导出到 hdfs中 且 不指定分隔符
insert overwrite directory '/export_data/s1'
select * from db_10.tb_student where sex='女';

-- todo 2 将内容导出到 hdfs中 且 指定分隔符
insert overwrite directory '/export_data/s2'
    row format delimited fields terminated by '*'
select * from db_10.tb_student where sex='男';
-- todo 3 将内容导出到 本地linux中 且 指定分隔符
insert overwrite local directory '/export_data/s3'
    row format delimited fields terminated by '*'
select * from db_10.tb_student where age>18;

-- todo 4 将内容导出到 本地linux中 且 指定分隔符 且 指定默认的存储方式
insert overwrite local directory '/export_data/s4'
    row format delimited fields terminated by '*'
    stored as orc
select * from db_10.tb_student where age>18;

分页

分页的语法: limit x,y    x表示 从哪个下标开始, 从0开始,  y表示 长度

分组

group by

分桶查询

-- 目标: 分桶查询
-- todo 方式一: cluster by xxx 含义: 根据指定列分 且 按照指定列 正序
-- 默认情况下, 将结果放到一个文件中
insert overwrite local directory '/root/export_data/s1'
select * from db_10.tb_student cluster by id;

-- 默认情况下, 将结果放到二个文件中
set mapreduce.job.reduces;
set mapreduce.job.reduces = 2; -- todo 如果想分成多部分, 这个是必须得

insert overwrite local directory '/root/export_data/s2'
select * from db_10.tb_student cluster by id;
-- todo 需求: 根据指定列分 且 按照指定列 正序
-- todo 方式二 cluster by id 等价于 distribute by id sort by id asc
insert overwrite local directory '/root/export_data/s3'
select * from db_10.tb_student
distribute by id sort by id asc;

-- todo 需求: 根据指定列分 且 按照指定列 降序
insert overwrite local directory '/root/export_data/s4'
select * from db_10.tb_student
distribute by id sort by id desc;

-- todo 需求: 根据id分 且 按照年龄 降序
insert overwrite local directory '/root/export_data/s5'
select * from db_10.tb_student
distribute by id sort by age desc;

union 上下表合并

-- todo 目标: union的用法
-- todo 需求1: 1 查询大于18岁的人 2 查询大于20岁的人 3 将1和2整合到一起
select * from db_10.tb_student where age>18; -- 15人
select * from db_10.tb_student where age>20; -- 5人

-- union all 不会去重
select * from db_10.tb_student where age>18
union all
select * from db_10.tb_student where age>20;

-- union distinct 去重
select * from db_10.tb_student where age>18
union distinct
select * from db_10.tb_student where age>20;

-- union 去重 和 不去重? 去重 union 等价于 union distinct
select * from db_10.tb_student where age>18
union
select * from db_10.tb_student where age>20;

-- union 注意事项: 上下的结果 列的数量和类型 必须一致

cte的用法

-- todo 目标: cte用法 (代替 子查询)
-- todo 需求: 查询 部门为 IS 且 年龄 > 18 的 男生和女生各多少人
-- todo 方式一: 采用子查询
select * from db_10.tb_student where dept='IS';

select * from (select * from db_10.tb_student where dept='IS') t1
where age>18;

select
    sex,
    count(1) as cnt
from (select * from (select * from db_10.tb_student
                     where dept='IS') t1
      where age>18) t2
group by sex;
-- todo 需求: 查询 部门为 IS 且 年龄 > 18 的 男生和女生各多少人
-- todo 方式二: 采用 CTE
with t1 as (
    select *
    from db_10.tb_student
    where dept='IS'
),
t2 as (
    select *
    from t1
    where age>18
)
select
    sex,
    count(1) as cnt
from t2
group by sex;