Hive、Impala、Spark DDL DML SQL

最新推荐文章于 2024-02-01 21:25:30 发布

沧海寄馀生

最新推荐文章于 2024-02-01 21:25:30 发布

阅读量347

点赞数

文章标签： hive spark sql hadoop 数据仓库

本文链接：https://blog.youkuaiyun.com/sx157559322/article/details/132288818

版权

分享一下Hive、Impala、Spark 常见的 DDL、DML语句。

测试版本

Hive 2.1.1+cdh6.2.1

Impala 3.2.0+cdh6.2.1

spark-3.3.1 kyuubi-1.6.1

以下SQL均可编辑成sql文件，使用beeline ... -f xxx.sql执行

Hive DDL DML

--HIVE SQL DDL DMLsql
--##DDL数据库相关操作
--1、create
--如果不存在则创建test数据库,这里SCHEMA关键字同DATABASE一致,下面例子会交错使用,但效果都是一样的
create database if not exists test;
create schema if not exists test2;
--创建数据库的同时,还可以指定其他配置
create database  IF NOT EXISTS test3
COMMENT "测试数据库描述"
LOCATION "/user/hive/test3"
WITH DBPROPERTIES ("name"="testvalues");


create database if not exists test4;


--注:MANAGEDLOCATION关键字在hive4.0.0版本中才可以使用


--2、drop
--如果存在则删除test2删除数据库
drop database if exists test4;
--RESTRICT和CASCADE,
--restrict 是默认行为 即 drop database if exists test2;等同于 drop database if exists test2 restrict;
drop database if exists test4 restrict;
--当库下含有表即不为空的情况下仍然要删除库那么需要使用CASCADE关键字
drop database if exists test2 cascade;


--3、alter
--设置数据库属性
alter database test set dbproperties ("name"="test values");
--同时你也可以修改库的OWNER、LOCATION等语法如下
--ALTER SCHEMA test3 SET OWNER user test_user; 
ALTER DATABASE test3 SET LOCATION "hdfs://nameservice1/user/hive/test33";




--4、use切换库
use test;
--切换到默认库
USE DEFAULT;
--查看当前所在库
SELECT current_database();


--##DDL表相关操作
--1、create
--对于增加了external关键字称为外部表,没有则为内部表。
--创建test.t1为内部表
drop table if exists test.t1;
CREATE  TABLE IF NOT EXISTS test.t1
(id int,
f1 string,
f2 string
);
--创建test.t2外部表,并且指定了描述、分隔符、存储格式、hdfs location关键字
drop table if exists test.t2;
create external table if not exists test.t2
(
id int,
f1 string,
f2 string
)
COMMENT "test table"
row format delimited fields terminated by ',' STORED AS PARQUET
LOCATION '/user/hive/warehouse/test.db/t2';


drop table if exists test.t3;
--创建内部分区表
create table test.t3 (
  id   int,
  f1   string,
  f2   string
)
partitioned by (day string);


--再建一个分区表方便后续测试
drop table if exists test.t33;
create table test.t33 (
  id   int,
  f1   string,
  f2   string
)
partitioned by (day string);


--创建临时表
CREATE TEMPORARY TABLE IF NOT EXISTS test.t6
(id int,
f1 string,
f2 string
);
--注:这里创建临时表的默认路径是在/tmp/hive/hive/59ab087d-5ee2-4136-b528-7225658b2d1b/_tmp_space.db/下,虽然也可以指定external关键字,但是也会随着表的消失而被删除


--创建视图
create view test.v1 as select id,f1,f2 from test.t1;


--Create Table As Select语句
drop table if exists test.t4;
create table if not exists test.t4 as select id,f1,f2 from test.t1;


--like使用,只复制表结构
drop table if exists test.t5;
create table if not exists test.t5 like test.t1;


--建表指定字段其他格式类型
--第一个是指定字段分隔符 第二个是指定集合元素之前的分隔符  第三个指定map元素 kv之间的分隔符  
drop table if exists test.t7;
create table if not exists test.t7(
    id int comment "id",
    f1 string comment "name",
    f2 map<string,string> comment "专长",
    f3 array<string> comment "other"
) row format delimited fields terminated by ","
collection items terminated by ":" 
map keys terminated by "-" ;


--插入对应数据
insert into table test.t7 select 1,"tom",map("爱好","唱歌"),array('数学','音乐');


--创建单倾斜列表
drop table if exists test.t_skewed;
create table if not exists test.t_skewed
(
id int,f1 string,f2 string
)
skewed by (id) on (1,5,6) 
stored as directories;


insert into test.t_skewed values (1,"a","b");
--注:单列倾斜表会对id 为1 5 6 单独再创建一个目录例如/user/hive/warehouse/test.db/t_skewed/id=1,而id为其他值则会放在HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME目录下面


--建表设置表的存储格式,默认为TEXTFILE
drop table if exists test.t_sequencefile;
create  table if not exists test.t_sequencefile
(id int,
f1 string,
f2 string
) stored as sequencefile;


drop table if exists test.t_orc;
create  table if not exists test.t_orc
(id int,
f1 string,
f2 string
) stored as orc;


drop table if exists test.t_parquet;
create  table if not exists test.t_parquet
(id int,
f1 string,
f2 string
) stored as parquet;


drop table if exists test.t_rcfile;
create  table if not exists test.t_rcfile
(id int,
f1 string,
f2 string
) stored as rcfile;


drop table if exists test.t_jsonfile;
create  table if not exists test.t_jsonfile
(id int,
f1 string,
f2 string
) stored as jsonfile;


insert into test.t_sequencefile values (1,"a","b");
insert into test.t_orc values (1,"a","b");
insert into test.t_parquet values (1,"a","b");
insert into test.t_rcfile values (1,"a","b");
insert into test.t_jsonfile values (1,"a","b");


--创建自定义函数,由于需要提供jar这里只提供样例
----create function hex_toString as 'hiveT.hexadecimal_toString' using jar 'hdfs://nameservice1:8020/tmp/wqg/hiveD.jar';


--2、drop
--删除表,如果是外部表不会删除对应的hdfs路径,内部表则会删除对应hdfs数据
drop table  if exists test.t2;


--purge参数:如果设置了回收站测试那么增加此参数删除表时,hdfs数据是不会进入回收站的
drop table  if exists test.t4 purge;


--删除视图
drop view if exists test.v1;


--truncate 清空表数据,保留表格式(分区表保留分区信息)
truncate table test.t3;


--只清空分区语法
--TRUNCATE [TABLE] table_name [PARTITION partition_spec];


--3、alter
--修改表名
drop table if exists test.t5_new;
alter table test.t5 rename to test.t5_new;
--设置表属性alter
alter table test.t1 set tblproperties ("table_name"="test values");
--移除表属性
alter table test.t1 unset tblproperties ("table_name");
--修改表不倾斜
alter table test.t5_new not skewed;
--对分区表添加分区
alter table test.t3 add if not exists partition (day="20230807");
--添加分区时也可以指定location
alter table test.t3 add if not exists partition (day='20230808') location '/tmp/test.db/t3/20230808';
insert into test.t3 partition(day='20230807') values (1,'a','b');
insert into test.t3 partition(day='20230808') values (2,'a','b');
--将test.t3表的20230808分区以及数据移动到test.t33下
alter table test.t33 exchange partition (day='20230808') with table test.t3;
--注:如果test.t3的部分分区表增加了location 那么使用上面sql移动后,只能移动其元数据分区到t33表下,数据并不会移动到其下面


--设置分区格式,这里必须切换到test库下,否则下面一句命令会报错,可能是bug
use test;
alter table test.t3 partition (day="20230807") set fileformat parquet;


--hive archive,执行前需要先启用此功能
set hive.archive.enabled=true;
--归档操作
ALTER TABLE test.t3 ARCHIVE PARTITION (day='20230807');
--解档操作
ALTER TABLE test.t3 UNARCHIVE PARTITION (day='20230807');




--查看对应分分区表属于
--describe extended test.t3 partition (day='20230807');


--修改非分区表或分区表中分区的transient_lastDdlTime
alter table test.t1 touch;
alter table test.t3 touch partition (day='20230807');


--删除分区
--alter table test.t3 drop if  exists partition (day="20230807");
alter table test.t3 drop if  exists partition (day="20230808") purge;




alter table test.t3 drop partition (day < 20230810);


--msck修复表分区
msck repair table test.t3 sync partitions ;
--还有drop add等关键字,sync包含了drop和add  语法:MSCK [REPAIR] TABLE table_name [ADD/DROP/SYNC PARTITIONS];


--修改t1表中id字段名称为id_new 类型为string
ALTER TABLE test.t1 CHANGE id id_new string;
--修改t1表字段id_new为id_n 类型为int 并且id_n字段在f1列后面
alter table test.t1 change id_new id_n int after f1;
--修改t1表 test.t1字段id_n名称为id_n1 并且放在列第一位
alter table test.t1 change id_n id_n1 int first;
--修改t1表字段id_n1名称位id 并添加描述语句
ALTER TABLE test.t1 CHANGE id_n1 id INT COMMENT 'this is column a1';


--添加列
alter table test.t1 add  columns (f3 string);


--替换列,即COLUMNS()里面的列会将原有列全部替换掉
ALTER TABLE test.t1 REPLACE COLUMNS (id int,f1 string ,f2 string);
--将t3表20230807分区的id列类型修改为string (id名称由于指定于之前一样,所以未改变)
ALTER TABLE test.t3 PARTITION (day='20230807') CHANGE COLUMN id id string;


--设置表为内部表或者为外部表
ALTER TABLE test.t1 SET TBLPROPERTIES('EXTERNAL'='TRUE');
ALTER TABLE test.t1 SET TBLPROPERTIES('EXTERNAL'='FALSE');


--##DML相关操作
--创建一个表用于测试
drop table if exists test.t10;
create table if not exists test.t10(id int,f1 string,f2 string) row format delimited fields terminated by  ",";
--1、load
--load本地数据至hive表中
--load data local inpath "file:///tmp/t10.txt" into table test.t10;
--load hdfs上的数据至hive表中
--load data inpath "/tmp/t10.txt" into table test.t10;


--2、insert
--对表插入一条数据
insert into test.t10 values (1,"f1","f2");
--插入多条值
insert into test.t1 values (1,"f1","f2"),(2,"ff1","ff2"),(3,"fff1","fff2");
--insert overwrite语句
insert overwrite table test.t1 select id,f1,f2 from test.t1;
--插入分区表
insert into table test.t3 partition (day="20230807") select id,f1,f2 from test.t10;
insert overwrite table test.t3 partition (day="20230807") select id,f1,f2 from test.t10;

Impala DDL DML

--IMPALA SQL DDL DML sql,很多sql impala和hive都是可以公用的,只有部分差异。所以下面会有很多hive里面的sql
--##DDL数据库相关操作
--1、create
--如果不存在则创建test数据库,这里SCHEMA关键字同DATABASE一致,下面例子会交错使用,但效果都是一样的
create database if not exists test;
create schema if not exists test2;
--创建数据库的同时,还可以指定其他配置
create database  IF NOT EXISTS test3
COMMENT "测试数据库描述"
LOCATION "/user/hive/test3";


create database if not exists test4;


--2、drop
--如果存在则删除test2删除数据库
drop database if exists test4;
--RESTRICT和CASCADE,
--restrict 是默认行为 即 drop database if exists test2;等同于 drop database if exists test2 restrict;
drop database if exists test4 restrict;
--当库下含有表即不为空的情况下仍然要删除库那么需要使用CASCADE关键字
drop database if exists test2 cascade;


--3、use切换库
use test;
--切换到默认库
USE DEFAULT;
--查看当前所在库
SELECT current_database();


--##DDL表相关操作
--1、create
--对于增加了external关键字称为外部表,没有则为内部表。
--创建test.t1为内部表
drop table if exists test.t1;
CREATE  TABLE IF NOT EXISTS test.t1
(id int,
f1 string,
f2 string
);
--创建test.t2外部表,并且指定了描述、分隔符、存储格式、hdfs location关键字
drop table if exists test.t2;
create external table if not exists test.t2
(
id int,
f1 string,
f2 string
)
COMMENT "test table"
row format delimited fields terminated by ',' STORED AS PARQUET
LOCATION '/user/hive/warehouse/test.db/t2';


drop table if exists test.t3;
--创建内部分区表
create table if not exists test.t3 (
  id   int,
  f1   string,
  f2   string
)
partitioned by (day string);


--创建内部非分区表用于view语句测试
drop table if exists test.t6;
create table if not exists test.t6 (
  id   int,
  f1   string,
  f2   string
);


--再建一个分区表方便后续测试 alter between and 
drop table if exists test.t33;
create table if not exists test.t33 (
  id   int,
  f1   string,
  f2   string
)
partitioned by (year int);


--创建视图
drop view if exists test.v1;
create view if not exists test.v1 as select id,f1,f2 from test.t1;


drop view if exists test.v6;
create view if not exists test.v6 as select id,f1,f2 from test.t6;


--Create Table As Select语句
drop table if exists test.t4;
create table if not exists test.t4 as select id,f1,f2 from test.t1;


--like使用,只复制表结构
drop table if exists test.t5;
create table if not exists test.t5 like test.t1;


--建表指定字段其他格式类型
--第一个是指定字段分隔符 第二个是指定集合元素之前的分隔符  第三个指定map元素 kv之间的分隔符  
drop table if exists test.t77;
create table if not exists test.t77(
    id int comment "id",
    f1 string comment "name",
    f2 map<string,string> comment "专长"
) row format delimited fields terminated by "\t";


--插入对应数据,impala对复杂数组类型字段暂无方式可供insert插入数据
--刷新表的元数据
refresh test.t7;


--impala查询array字段类型数据
select m.item from test.t7,test.t7.f3 m;
--impala查询map字段类型数据
select m.key,m.value from test.t7,test.t7.f2 m;


--建表设置表的存储格式,默认为TEXTFILE


drop table if exists test.t_parquet;
create  table if not exists test.t_parquet
(id int,
f1 string,
f2 string
) stored as parquet;


insert into test.t_parquet values (1,"a","b");


--创建自定义函数,由于需要提供jar这里只提供样例
----create function hex_toString as 'hiveT.hexadecimal_toString' using jar 'hdfs://nameservice1:8020/tmp/wqg/hiveD.jar';


--create kudu映射表,hive不支持,
drop table if exists test.t11;
CREATE TABLE if not exists test.t11
(
id BIGINT,
name STRING,
PRIMARY KEY(id)
)
PARTITION BY HASH PARTITIONS 2
STORED AS KUDU
TBLPROPERTIES ('key1'='value1');


--如果imapla服务没有增加kudu映射,那么就需要指定kudu master,格式如下
--CREATE TABLE test.t11_1
--(
--id BIGINT,
--name STRING,
--PRIMARY KEY(id)
--)
--PARTITION BY HASH PARTITIONS 16 
--STORED AS KUDU
--TBLPROPERTIES (
--'kudu.master_addresses' = 'cm111:7051,cm112:7051,cm113:7051'
--);


--创建外部表、impala映射kudu表格式
--CREATE EXTERNAL TABLE test.t12
--STORED AS KUDU
--TBLPROPERTIES (
--'kudu.master_addresses' = 'cm111:7051,cm112:7051,cm113:7051',
--'kudu.table_name' = 'kudu_test.my_first_table'
--);


--2、drop
--删除表,如果是外部表不会删除对应的hdfs路径,内部表则会删除对应hdfs数据
drop table  if exists test.t2;


--purge参数:如果设置了回收站测试那么增加此参数删除表时,hdfs数据是不会进入回收站的
drop table  if exists test.t4 purge;


--删除视图
drop view if exists test.v1;


--truncate 清空表数据,保留表格式(分区表保留分区信息)
truncate table test.t3;


--只清空分区语法
--TRUNCATE [TABLE] table_name [PARTITION partition_spec];


--3、alter
--新增分区用户测试
alter table test.t33 add if not exists partition (year=2015);
alter table test.t33 add if not exists partition (year=2016);
alter table test.t33 add if not exists partition (year=2017);
alter table test.t33 add if not exists partition (year=2018);
alter table test.t33 add if not exists partition (year=2019);




--使用between and形式删除范围分区,hive不支持此种写法
alter table test.t33 drop partition (year between 2016 and 2018);


--修改表名
drop table if exists test.t5_new;
alter table test.t5 rename to test.t5_new;


--设置表属性
alter table test.t10 set tblproperties ("impala.enable.stats.extrapolation"="true");


--改变表分区的分隔符 
alter table test.t1 set serdeproperties ('serialization.format' = ',' , 'field.delim' = ',');


--对分区表添加分区
alter table test.t3 add if not exists partition (day="20230807");
--添加分区时也可以指定location
alter table test.t3 add if not exists partition (day='20230808') location '/tmp/test.db/t3/20230808';


insert into test.t3 partition(day='20230807') values (1,'a','b');
insert into test.t3 partition(day='20230808') values (2,'a','b');


--设置分区格式
ALTER TABLE test.t3 PARTITION (day='20230807') SET FILEFORMAT parquet;


--删除分区
alter table test.t3 drop if  exists partition (day="20230807");
alter table test.t3 drop if  exists partition (day="20230808") purge;


--修改t1表中id字段名称为id_new 类型为string
ALTER TABLE test.t1 CHANGE id id_new string;
--修改t1表字段id_n1名称位id 并添加描述语句
ALTER TABLE test.t1 CHANGE id_new id INT COMMENT 'this is column a1';
ALTER TABLE test.t1 ALTER COLUMN f1 SET COMMENT 'comment_text';


--添加列,此处hive不支持使用IF NOT EXISTS
alter table test.t1 add if not exists columns (f3 string);
--删除列,hive中不支持此sql语法
alter table test.t1 drop  column f3;


--替换列,即COLUMNS()里面的列会将原有列全部替换掉
ALTER TABLE test.t1 REPLACE COLUMNS (id int,f1 string ,f2 string);


--扩展,查看t1表在hdfs上的文件
show files in test.t1;


--设置表为内部表或者为外部表
ALTER TABLE test.t1 SET TBLPROPERTIES('EXTERNAL'='TRUE');
ALTER TABLE test.t1 SET TBLPROPERTIES('EXTERNAL'='FALSE');


--##DML相关操作
--创建一个表用于测试
create table if not exists test.t10(id int,f1 string,f2 string) row format delimited fields terminated by  ",";


--4、insert
--对表插入一条数据
insert into test.t10 values (1,"f1","f2");
--插入多条值
insert into test.t1 values (1,"f1","f2"),(2,"ff1","ff2"),(3,"fff1","fff2");
--insert overwrite语句
insert overwrite table test.t1 select id,f1,f2 from test.t1;
--插入分区表,
insert into table test.t3 partition (day="20230807") select id,f1,f2 from test.t10;
insert overwrite table test.t3 partition (day="20230807") select id,f1,f2 from test.t10;




--5、COMPUTE 收集有关表中数据的容量和分布以及所有相关列和分区的信息
--收集表信息
compute stats test.t10;
--收集表中列信息
compute stats test.t10 (id,f1) ;
-- 收集表信息 10表示对10%的表数据进行采样。
COMPUTE STATS test.t10 TABLESAMPLE SYSTEM(10);
--收集表信息,对5%的表数据进行采样 重复性种子为42
COMPUTE STATS test.t10 TABLESAMPLE SYSTEM(5) REPEATABLE(42);


--统计t33表 day分区小于2019的增量信息
COMPUTE INCREMENTAL STATS test.t33 partition (year < 2019);
COMPUTE INCREMENTAL STATS test.t33  partition (year in (2016, 2019));
COMPUTE INCREMENTAL STATS test.t33 partition (year between 2016 and 2019);
COMPUTE INCREMENTAL STATS test.t33 partition (year in (2016,2017) or year < 2015);
COMPUTE INCREMENTAL STATS test.t33 partition (year != 2016);


--6、show
--查看表状态信息
show table stats test.t33;
--查看列状态信息
show column stats test.t33;


--7、delete,仅适用与kudu引擎的impala表
delete from test.t11 where id=1;


--8、update 仅适用与kudu引擎的impala表
update test.t11 set name="aa" where id=1;




--9、UPSERT is only supported for Kudu tables
UPSERT INTO test.t11 (id, name) VALUES
    (1, 'John'),
    (2, 'Jane'),
    (3, 'Michael');






--参考链接https://impala.apache.org/impala-docs.html

Spark DDL DML

--Spark SQL DDL DMLsql
--版本spark-3.3.1 kyuubi-1.6.1 
--##DDL相关操作
--1、create database/schema
--如果不存在test数据库则创建
create database if not exists test;
--创建test2数据库,并指定一些属性如comment 描述 id=001, name='john' ,指定test2数据库的location
create database if not exists test2 comment 'this is customer database' location '/user/hive/test2' with dbproperties (id=001, name='john');


create schema if not exists test_spark3;
create schema if not exists test_spark4;
--查看数据库
describe database extended test2;
--2、create table
--创建t1表
drop table if exists test.t1;
create table if not exists test.t1 (id int, f1 string, f2 string);
--创建t2表并指定存储格式为orc
drop table if exists test.t2;
create table if not exists test.t2 (id int, f1 string, f2 string) stored as orc;
--指定表属性和注释
drop table if exists test.t3;
CREATE TABLE test.t3 (id int, f1 string, f2 string)
    COMMENT 'this is a comment'
    STORED AS ORC
    TBLPROPERTIES ('foo'='bar');
--指定表属性和注释,功能同上,但是位置调换了,hive则不可以
drop table if exists test.t4;
CREATE TABLE test.t4 (id int, f1 string, f2 string)
    TBLPROPERTIES ('foo'='bar')
    COMMENT 'this is a comment'
    STORED AS ORC;
--创建分区表
drop table if exists test.t5;
create table test.t5 (id int, f1 string, f2 string) partitioned by (day string) STORED AS ORC;
--创建分区表,调换了 partitioned by 与 STORED as 位置 ,hive则不支持此种写法
drop table if exists test.t6;
create table test.t6 (id int, f1 string, f2 string) stored as orc partitioned by (day string) ;
--创建表并指定分隔符
drop table if exists test.t7;
create table if not exists test.t7 (id int, f1 string, f2 string) row format delimited fields terminated by ',' stored as textfile;


--创建复杂的字段类型    
drop table if exists test.t8; 
create external table if not exists test.t8(
        id int,
        f1 array<string>,
        f2 map<string, int>,
        f3 struct<street: string, city: string>
    )
    row format delimited fields terminated by ',' escaped by '\\'
    collection items terminated by '_'
    map keys terminated by ':'
    lines terminated by '\n'
    null defined as 'foonull'
    stored as textfile
    location '/tmp/t8/';


--创建分桶表,同样适用hive
drop table if exists test.t9; 
create table if not exists test.t9 (id int, f1 string,f2 string)
    clustered by (id)
    into 4 buckets;


--创建分区分桶表
drop table if exists test.t10; 
create table if not exists test.t10 (id int, f1 string,f2 string)
    partitioned by (year string)
    clustered by (id)
    sorted by (id asc)
    into 3 buckets;


--create table .. like 方式
drop table if exists test.t11; 
create table if not exists test.t11 like test.t1;


--使用csv格式数据源,除了csv还有csv, txt, orc, jdbc, parquet等
drop table if exists test.t12;
create table if not exists test.t12 like test.t1 using csv;


--使用like方式还可以指定location
drop table if exists test.t13;
create table if not exists test.t13 like test.t1 location  '/tmp/test/t13';


--使用like方式还可以设置一些分隔符存储格式等
drop table if exists test.t14;
create table if not exists test.t14 like test.t1
    row format delimited fields terminated by ','
    stored as textfile
    tblproperties ('key'='xxxx');


--3、create view
--创建视图,如果存在则替换
drop view if exists test.v1;
create or replace view test.v1
    (id comment 'unique identification number', f1,f2) 
    comment 'view comment'
    as select id, f1,f2 from test.t1;


--创建全局的临时视图 ,临时视图不能指定库名称
drop view  if exists global_temp.v2;
create global temporary view  v2 as select id,f1,f2 from test.t1;


--4、create function,因需提供jar这里只提供示例
--CREATE FUNCTION f1 AS 'SimpleUdf'  USING JAR '/tmp/SimpleUdf.jar';
--创建临时函数
--CREATE TEMPORARY FUNCTION simple_temp_udf AS 'SimpleUdf'  USING JAR '/tmp/SimpleUdf.jar';
--SHOW USER FUNCTIONS;
--CREATE OR REPLACE FUNCTION simple_udf AS 'SimpleUdfR' USING JAR '/tmp/SimpleUdfR.jar';


--5、alter database
create database if not exists test3;
--给数据设置属性
alter database test3 set dbproperties ('edited-by' = 'john', 'edit-date' = '01/01/2001');


--查看数据库的信息
describe database extended test3;


--6、alter table
--修改表名称
drop table if exists test.t4_new;
alter table test.t4 rename to test.t4_new;


--添加分区
alter table test.t5 add if not exists partition (day="20230809");
alter table test.t5 add if not exists partition (day="20220809");
--修改分区名称
alter table test.t5 partition (day="20230809") rename to partition (day="20230810");
--删除分区
alter table test.t5 drop if exists partition (day="20230810");


--给表添加新的列
alter table test.t3 add columns (f4 string, f5 timestamp);


-- 不支持的操作,待研究 drop column is only supported with v2 tables
--alter table test.t3 rename column f5 to f5_new;
--alter table test.t4 drop columns (f3);
--alter table test.t3 replace columns ( id int comment 'new comment' , name string);


--对列f4添加描述信息
alter table test.t3 alter column f4 comment "new comment";


--修改表分区location
alter table test.t5 partition (day="20220809") set location '/tmp/test/t5/day=20220809';


--设置表属性
alter table test.t3 set tblproperties ('winner' = 'loser');
--删除表属性
alter table test.t3 unset tblproperties ('winner');


--恢复分区,相当于hive的msck repair table xxx sync partitions;
alter table test.t5 recover partitions;


--7、alter view
--修改视图名称
drop view if exists test.v1_new;
alter view test.v1 rename to test.v1_new;


--设置视图属性
alter view test.v1_new set tblproperties ('created.by.user' = "john", 'created.date' = '01-01-2001' );


--删除视图属性
alter view test.v1_new  unset tblproperties ('created.by.user', 'created.date');
--改变视图定义
alter view test.v1_new as select * from test.t1;


--查询视图信息
desc table extended test.v1_new;


--8、drop database/schema


-- cascade作用 库不为空时也可以直接删除
drop database if exists test_spark3 cascade;
drop schema if exists test_spark4 cascade;


--9、drop function
--因此sql文件没有实际创建,只提供样例
--查看有哪些函数
show user functions;
--删除函数
--drop function test_avg;
--drop temporary function if exists test_avg;


--10、drop table
drop table if exists test.t12;
drop table if exists test.t13 purge;


--11、drop view
DROP VIEW IF EXISTS test.v1_new;


--12、truncate
--清空表分区
TRUNCATE TABLE test.t5 partition(day="20220809");
--清空表
TRUNCATE TABLE test.t2;


--13、user
--切换数据库
use test;


--##DML相关操作
--1、insert table
--插入一条数据
insert into test.t1 values (1, '123 park ave', 'san jose');
--插入多条数据
insert into test.t2 values (1, '123 park ave', 'san jose'),(2, '456 park ave', 'san jose');
--insert into ... select 方式
insert into test.t1 select id,f1,f2 from test.t2 where id=1;
--将t2整表数据插入t1,hive不支持此种方式
insert into test.t1 table test.t2;
--向分区表插入数据
insert into test.t5 partition (day = '20230811') values (1,'amy smith', '123 park ave, san jose');
--指定列信息插入数据,位置可以错乱
insert into test.t1 (f1, id, f2) values  ('hangzhou, china', 11,'kent yao');
----指定列信息向分区表插入数据,位置可以错乱
insert into test.t5 partition (day = '20230812') (f1, id, f2) values ('hangzhou, china', 15,'kent yao');


--insert overwrite方式插入多条值
insert overwrite test.t1 values (100, 'hangzhou, china', 'san jose'),(101, 'nanjin, china', 'san jose');
--insert overwrite...select
insert overwrite test.t5 partition (day = '20230813') select id, f1,f2 from test.t1 where id = 1;
--insert overwrite 方式 将t2整表数据插入t1
insert overwrite test.t2 table test.t1;


--2、insert overwrite directory
--将数据以parquet存储格式写到/tmp/destination 目录下
insert overwrite directory '/tmp/destination'
    using parquet
    options (id 1,f1 2,f2 'test')
    select * from test.t1;


--功能同上,第二种方式
INSERT OVERWRITE DIRECTORY
    USING parquet
    OPTIONS ('path' '/tmp/destination2', id 1,f1 2,f2 'test')
    SELECT * FROM test.t1;


--3、LOAD DATA
--由于需要提供额外的文件数据,这里只提供样例
--从本地load
--LOAD DATA LOCAL INPATH '/tmp/sparktbdata.txt' OVERWRITE INTO TABLE test.t14;
--从hdfs load
--LOAD DATA  INPATH '/tmp/sparktbdata.txt' OVERWRITE INTO TABLE test.t14;






--参考链接:https://spark.apache.org/docs/latest/sql-ref-syntax.html#ddl-statements

本人旨在分享工作中一些用到的知识技能

如有感兴趣欢迎关注Wbigdata微信公众号