hive-study

最新推荐文章于 2024-04-11 17:10:37 发布

原创最新推荐文章于 2024-04-11 17:10:37 发布 · 177 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#hive #hadoop #大数据

文章介绍了如何在Hive中创建表，包括普通表和分区表，以及使用`LOAD DATA`、`INSERT INTO`命令加载数据。还涉及到了数据导出、导入以及使用`SHOW TABLES`、`SHOW FUNCTIONS`等查询元数据的功能。此外，讨论了Hive的动态分区和分桶表的概念，强调了列式存储格式如ORC和Parquet对于性能的提升。

-- List the tables in the current database.
show tables;

-- Create a table whose columns use Hive's complex types (array, map, struct).
create table teacher (
    name     string,
    friends  array<string>,
    students map<string,int>,
    address  struct<city:string,street:string,postal_code:int>
)
location '/user/hive/warehouse/teacher';

-- Read an array element by position and a map value by key.
select
    friends[0],
    students['xiaohaihai']
from teacher;

-- Read a struct member with dot notation.
select
    address.city
from teacher;

-- Loading data
-- Syntax: load data [local] inpath '<path>' [overwrite] into table <table> [partition (<spec>)];
--   local      load from the local file system; without it the path is read from HDFS
--   overwrite  replace the data already in the table; without it the data is appended
--   partition  load into the given partition; must be specified when the target is a partitioned table
-- The square brackets above mark optional keywords and are not part of the statement.
-- This example appends; insert `overwrite` before `into` to replace the existing data instead.
load data local inpath '/opt/module/datas/student.txt' into table student;


-- insert into table student ...       appends rows
-- insert overwrite table student ...  replaces the existing rows

-- export writes both the data and the metadata of a table to an HDFS path.
export table teacher to 'target_path';

-- import loads what export produced back into Hive.
-- Together, export and import can be used to migrate data between two Hive deployments.
-- Syntax: import [external] table <tablename> from '<source_path>' [location '<target_path>'];
--   location  storage path for the imported table
-- The square brackets mark optional clauses and are not part of the statement.
import table tablename from 'source_path';

-- List the built-in functions.
-- Syntax: show functions [like "<pattern>"];  (the like clause is optional)
show functions;
show functions like "*string*";

-- Show how the built-in function upper is used.
-- Syntax: desc function [extended] <function_name>;  (extended is optional)
desc function upper;
desc function extended upper;

-- substring(str, pos[, len])
-- replace         string replacement
-- regexp_replace  regular-expression replacement



---- Partitioned tables
-- A partitioned table spreads one large table across several directories according to
-- business needs; each directory is one partition of the table.
-- Queries choose the partitions to read through expressions in the where clause.
create table dept_partition (
    dept_no int,
    dname   string,
    loc     string
)
partitioned by (day string)
row format delimited
    fields terminated by '\t';

-- Load a local file into one partition.
load data local inpath '/opt/module/hive/datas/dept_20220401.log'
    into table dept_partition partition(day='20220401');

-- Insert query results into one partition.
-- `insert into` appends to the partition; write `insert overwrite table ...` instead to
-- replace the partition's existing data.
insert into table dept_partition partition(day='20220402')
select
    dept_no,
    dname,
    loc
from dept_partition
where day = '20220401';

-- List the partitions of a table.
show partitions dept_partition;

-- Add partitions. Several partitions can be added in one statement; for `add` the
-- partition specs are separated by whitespace, not commas.
-- `if not exists` keeps the second statement from failing on day='20230615', which the
-- first statement has already created.
alter table dept_partition add if not exists partition (day='20230615');
alter table dept_partition add if not exists partition (day='20230615') partition (day='20230616');

-- Drop partitions. Several partitions can be dropped in one statement; for `drop` the
-- partition specs are separated by commas.
-- `if exists` lets the second statement run even though day='20230615' was already
-- removed by the first one.
alter table dept_partition drop if exists partition (day='20230615');
alter table dept_partition drop if exists partition (day='20230615'), partition (day='20230616');

-- Repair partitions (msck = metastore check). Every variant only changes metadata so that
-- it matches the directories on HDFS.
-- Syntax: msck repair table <table_name> [add/drop/sync partitions];
--   add   add metadata for partitions whose HDFS path exists but that have no metadata
--   drop  remove partition metadata
--   sync  run both add and drop
-- Omitting the option is equivalent to `add partitions`.
msck repair table table_name;

-- Two-level partitioning (three, four or more levels are declared the same way).
-- NOTE(review): this reuses the table name dept_partition from the single-level example
-- earlier in this file; drop or rename one of them before running both statements.
create table dept_partition (
    dept_no int,
    dname   string,
    loc     string
)
partitioned by (
    day  string,
    hour string
)
row format delimited
    fields terminated by '\t';
-- Everything else works the same as for a single-level partitioned table.

--动态分区  是指向分区表insert数据时，写入哪个分区不由用户指定，而是由每行数据的最后一个字段值动态决定
--使用动态分区可以使用一个insert语句将数据写入多个分区


--- Bucketed tables
-- Partitioning is about the storage path of the data; bucketing is about the data files.
-- How it works: a hash value is computed on the chosen column for every row and taken
-- modulo the number of buckets; rows with the same result are written to the same file,
-- and that file is called a bucket.
create table stu_buck (
    id   int,
    name string
)
clustered by (id) into 4 buckets
row format delimited
    fields terminated by '\t';

-- Bucketed and sorted table: bucketed on id and additionally declared as sorted by id.
-- NOTE(review): this reuses the table name stu_buck from the plain bucketed example
-- earlier in this file; drop or rename one of them before running both statements.
create table stu_buck (
    id   int,
    name string
)
clustered by (id)
sorted by (id)
into 4 buckets
row format delimited
    fields terminated by '\t';

-- Compression: Hive runs on top of Hadoop, so its compression formats are the same as Hadoop's.

-- File formats available in Hive include text file, ORC, Parquet and sequence file.

-- Text file: the default format. Each line of the file is one row of the Hive table
-- (row-oriented storage).
-- Template: create table <table_name> (<column> <type>, ...) stored as textfile;
-- The single string column below is only a placeholder schema for the example.
create table textfile_table (
    col string
)
stored as textfile;

-- ORC: a column-oriented file format that improves the performance of reading, writing
-- and processing data.
-- Template: create table <table_name> (<column> <type>, ...) stored as orc
--           tblproperties (property_name = property_value, ...);
-- The example uses its own table name so it does not collide with textfile_table, and sets
-- the ORC compression codec as a sample table property.
create table orc_table (
    col string
)
stored as orc
tblproperties ('orc.compress' = 'ZLIB');
--parquet 列式存储文件格式