Hive SQL 实用指南-CSDN博客

本文链接：https://blog.csdn.net/cobracanary/article/details/122694588

之前发过如何使用idea连接hive，连接上hive之后肯定是要去使用hive执行一些操作了，这里整理了一些HQL操作。值得注意的是，在我执行hive的时候，有时候会报错[08S01][2] Error while processing statement: FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask。我看了网上的一些说法，说是mr把资源耗尽，具体啥bug我也不想深究，毕竟也不是运维，但是我亲身实践过重启一次hive就好了。

Hive 是一个数据仓库工具，能将结构化的文件映射成表，并提供 SQL 的方式供开发人员运行 OLAP 任务；用户写的 SQL 在底层会转换为 MapReduce 任务来得到结果。

-- ============================== DDL syntax ==============================

-- Create the database unless it already exists.
create database if not exists myhive;

-- Create a database at an explicit location on HDFS.
create database if not exists myhive2 location '/myhive2';

-- Describe the database.
desc database myhive2;

-- List all databases.
show databases;

-- Describe the database in detail.
desc database extended myhive2;

-- Select a database. Run `use` first so that the statements that follow
-- operate on the tables of this database.
use myhive;

-- Create a simple two-column table.
create table if not exists stu (id int, name string);

-- Insert one row.
insert into stu values (1, 'zhangsan');

-- Insert several rows in a single statement.
insert into stu values (1, 'zhangsan'), (2, 'lisi');

-- Read back every row of the table.
select * from stu;

-- Create a table with tab-terminated fields, stored as a text file
-- under /user/stu2 on HDFS.
create table if not exists stu2 (
    id   int,
    name string
)
row format delimited
    fields terminated by '\t'
stored as textfile
location '/user/stu2';

-- Describe the table (formatted output).
desc formatted stu2;

-- Show the CREATE TABLE statement of the table.
show create table stu2;

-- Create an external table with tab-terminated fields.
create external table if not exists ext_stu (
    s_id   string,
    s_name string
)
row format delimited
    fields terminated by '\t';

-- Load a file from the local filesystem into the table. Per the original
-- note, this amounts to uploading the local file into the HDFS directory
-- that backs the table.
load data local inpath '/home/wxwmd/1.txt' into table ext_stu;

-- Load a file that is already on HDFS into the table.
load data inpath '/tmp/1.txt' into table ext_stu;

-- Drop the table (no error if it is already gone).
drop table if exists stu2;

-- Create a table partitioned by month (values are YYYYMM strings).
create table if not exists score (
    s_id    string,
    s_score int
)
partitioned by (month string);

-- Change the SerDe properties of the table so that fields are tab-delimited.
alter table score set serdeproperties ('field.delim' = '\t', 'serialization.format' = '\t');

-- Load data into the 202201 partition.
load data local inpath '/home/wxwmd/score.txt' into table score partition (month = '202201');

-- Load the same file once more, this time into the 202202 partition.
load data local inpath '/home/wxwmd/score.txt' into table score partition (month = '202202');

-- List the partitions of the table.
show partitions score;

-- Add new partitions.
alter table score add partition (month = '202203') partition (month = '202204');

-- Drop partitions.
alter table score drop partition (month = '202203');
alter table score drop partition (month = '202204');

-- Query the partitioned table by filtering on the partition column.
select * from score where month = '202201';

-- Switch the Hive execution engine to mr.
set hive.execution.engine=mr;

-- Insert a row directly into a partition of the table.
insert into table score partition (month = '202201') values ('wll', 100);

-- Specify the directory that the table maps to at creation time.
-- Every file directly inside that directory is loaded into the table:
-- with location /A, the file /A/1.txt is loaded.
-- The original author observed that sub-directories are not parsed
-- recursively: with location /A, the file /A/B/1.txt is not loaded.
create external table if not exists score2 (
    s_id    string,
    s_score int
)
row format delimited
    fields terminated by '\t'
location '/score/202201';

-- Turn on bucketing enforcement in Hive.
set hive.enforce.bucketing=true;

-- Set the number of reduce tasks. Per the original note, bucketing splits the
-- job into several reduce tasks and each reduce task writes one output file.
set mapreduce.job.reduces=3;

-- Create a table bucketed on s_id into 3 buckets.
create table if not exists bucket_stu (
    s_id   int,
    s_name string
)
clustered by (s_id) into 3 buckets;

-- Populate the bucketed table and name the bucket column in the trailing
-- cluster by clause.
-- NOTE(review): ext_stu.s_id is declared string while bucket_stu.s_id is int,
-- so this insert relies on an implicit conversion -- confirm the data is numeric.
insert overwrite table bucket_stu
select s_id, s_name
from ext_stu
cluster by s_id;

-- Export the data of a Hive table to a directory on HDFS.
export table ext_stu to '/export/ext_stu';

-- Write a query result to a local directory in delimited format.
insert overwrite local directory '/home/wxwmd/export/ext_stu'
row format delimited
    fields terminated by '\t'
    collection items terminated by '#'
select * from ext_stu;

-- Write a query result to a directory on HDFS (same statement without `local`).
insert overwrite directory '/export/score'
row format delimited
    fields terminated by '\t'
    collection items terminated by '#'
select * from score;

-- ================================= DQL =================================

-- The simplest where clause.
-- NOTE(review): ext_stu.s_id is declared string but is compared with an
-- integer literal here, so the match relies on implicit conversion.
select * from ext_stu where s_id = 1;

-- group by
select s_id, avg(s_score) from score group by s_id;

-- group by combined with having to filter the groups.
select s_id, sum(s_score) as SumScore
from score
group by s_id
having SumScore > 120;

-- Join types:
--   join (inner join): only rows that satisfy the condition are returned
--   left join:  all rows of the left table are kept, and right-side columns
--               are NULL where there is no match
--   right join: all rows of the right table are kept, and left-side columns
--               are NULL where there is no match
--   full join:  full (outer) join, all rows are returned and unmatched
--               columns are NULL
-- NOTE(review): the condition matches score.s_id against ext_stu.s_name, not
-- ext_stu.s_id -- confirm this is the intended join key.
select *
from score
inner join ext_stu es
    on score.s_id = es.s_name;

-- order by: asc sorts ascending, desc sorts descending.
select s_id, sum(s_score) as SumScore
from score
group by s_id
order by SumScore desc;

-- order by sorts globally.
-- sort by sorts locally, that is, within each reduce task.

-- Set the number of reduce tasks.
set mapreduce.job.reduces=3;
-- Show the configured number of reduce tasks.
set mapreduce.job.reduces;
-- Local sort with sort by.
select s_id, s_score
from score
sort by s_score desc;

-- distribute by partitions the data, for example splitting scores by class
-- number before running avg, sort and similar work on each part.
select s_id, s_score
from score
distribute by s_id
sort by s_score;

-- When distribute by and sort by use the same column, cluster by can be used
-- instead. cluster by provides what distribute by does and also sorts like
-- sort by, but only in ascending order: ASC or DESC cannot be specified.
-- That is,
--     select * from score cluster by s_id
-- is equivalent to
--     select * from score distribute by s_id sort by s_id

/*-------------------------------------------HIVE函数-----------------------------------------------------*/
/*
现在不学了，反正学了也得忘
开摆
*/