Notes on Hive SQL operations

 

Partitions:

1. Drop the table, create an external table with partitions, and load data by partition

dfs -rmr /tmp/ext/tab_name1; --remove the directory and its files from HDFS
dfs -mkdir /tmp/ext/tab_name1; --recreate the directory; create the directory for the table's location before creating the table
drop table if exists tab_name1;
create external table if not exists tab_name1(id int ,name string) partitioned by (aa string) row format delimited fields terminated by ',' location '/tmp/ext/tab_name1';
load data local inpath '/home/conkeyn/jar/a.txt' into table tab_name1 partition(aa='1');
load data local inpath '/home/conkeyn/jar/b.txt' into table tab_name1 partition(aa='2');
dfs -ls /tmp/ext/tab_name1;
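
As an alternative to LOAD DATA, a directory that already contains data files can be registered as a new partition; a minimal sketch, assuming the data was already copied to the (hypothetical) path below:

--hypothetical: register existing HDFS data as partition aa='3'
alter table tab_name1 add partition (aa='3') location '/tmp/ext/tab_name1/aa=3';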

2. Query by partition

select * from tab_name1 where aa='1';
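
To verify which partitions exist, list them with SHOW PARTITIONS:

show partitions tab_name1;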

Bucketing (clustered):

1. Create a bucketed table

 

--create a plain staging table
drop table if exists tab_cdr;
create table if not exists tab_cdr(oaddr string,oareacode string,daddr string,dareacode string,ts string,type string) row format delimited fields terminated by ',';
load data local inpath '/home/conkeyn/jar/output/cdr01.txt' into table tab_cdr;
select count(*) from tab_cdr;
--create the bucketed table
drop table if exists tab_cdr_buc;
create table if not exists tab_cdr_buc(oaddr string,oareacode string,daddr string,dareacode string,ts string,type string) clustered by (oaddr) into 10 buckets;
--force the insert to honor the bucket count (one reducer per bucket)
set hive.enforce.bucketing=true;
--populate the bucketed table
insert into table tab_cdr_buc select * from tab_cdr;
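
One benefit of bucketing is efficient sampling: TABLESAMPLE can read a single bucket instead of scanning the whole table. A minimal sketch against the table above:

--read only bucket 1 of the 10 buckets hashed on oaddr
select * from tab_cdr_buc tablesample(bucket 1 out of 10 on oaddr);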

File formats

Prerequisite: the hadoop-lzo compression library must be installed.

Create a plain table first; we will later select its rows and insert them into the compressed tables.

drop table if exists tab_cdr;
create table if not exists tab_cdr(oaddr string,oareacode string,daddr string,dareacode string,ts string,type string) row format delimited fields terminated by ',';
load data local inpath '/home/conkeyn/jar/output/ab.txt' into table tab_cdr;
dfs -ls /user/hive/warehouse/tab_cdr;
select count(*) from tab_cdr;

 

1. SequenceFile

 

--sequencefile
set hive.exec.compress.output=true;
set mapred.output.compress=true;
set mapred.output.compression.type=BLOCK;
set mapred.output.compression.codec=com.hadoop.compression.lzo.LzoCodec;
drop table if exists tab_cdr_seq;
create table if not exists tab_cdr_seq(oaddr string,oareacode string,daddr string,dareacode string,ts string,type string) row format delimited fields terminated by ',' stored as sequencefile;
insert overwrite  table tab_cdr_seq select * from tab_cdr;
dfs -ls /user/hive/warehouse/tab_cdr_seq;
select count(*) from tab_cdr_seq;

 

2. Text file
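
This section is empty in the original notes; by analogy with the SequenceFile example above, a compressed textfile variant might look like this (tab_cdr_txt is a hypothetical name):

--textfile with LZO-compressed output (sketch)
set hive.exec.compress.output=true;
set mapred.output.compress=true;
set mapred.output.compression.codec=com.hadoop.compression.lzo.LzoCodec;
drop table if exists tab_cdr_txt;
create table if not exists tab_cdr_txt(oaddr string,oareacode string,daddr string,dareacode string,ts string,type string) row format delimited fields terminated by ',' stored as textfile;
insert overwrite table tab_cdr_txt select * from tab_cdr;
select count(*) from tab_cdr_txt;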

3. RCFile

--rcfile
set hive.exec.compress.output=true;
set mapred.output.compress=true;
set mapred.output.compression.type=BLOCK;
set mapred.output.compression.codec=com.hadoop.compression.lzo.LzoCodec;
drop table if exists tab_cdr_rc;
create table if not exists tab_cdr_rc(oaddr string,oareacode string,daddr string,dareacode string,ts string,type string) row format delimited fields terminated by ',' stored as rcfile;
insert overwrite  table tab_cdr_rc select * from tab_cdr;
dfs -ls /user/hive/warehouse/tab_cdr_rc;
select count(*) from tab_cdr_rc;

4. Input file

5. Gzip (gz)

--use a gzip-compressed data file
drop table if exists tab_name;
create table  if not exists tab_name(id int ,name string) row format delimited fields terminated by ',';
load data local inpath '/home/conkeyn/jar/a.txt.gz' into table tab_name;
dfs -ls /user/hive/warehouse/tab_name;
--at query time Hive decompresses the file transparently
select * from tab_name;
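
For reference, the compressed input can be produced beforehand by gzipping a plain comma-delimited file; the Hive CLI can run shell commands with the ! prefix (a sketch, assuming a.txt exists locally):

--run gzip from the Hive CLI; produces /home/conkeyn/jar/a.txt.gz
!gzip /home/conkeyn/jar/a.txt;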

6. Complex column types

(1) Array type

Sample data:

  

rob,bob,steven  1,2,3
amy,andy        11,22
jac     11,22,33,44,55

SQL:

--field array type
drop table if exists tab_arr;
create table if not exists tab_arr(a array<string>,b array<int>) 
row format delimited fields terminated by '\t'
collection items terminated by ',';
load data local inpath '/home/conkeyn/jar/arr.txt' into table tab_arr;
select a[2] from tab_arr;
select * from tab_arr where array_contains(a,'rob');
select * from tab_arr where array_contains(b,22);
insert into table tab_arr select array(oaddr,oareacode),array(0) from tab_cdr;
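
To flatten an array into one row per element, use LATERAL VIEW with explode(); a minimal sketch against tab_arr:

--one output row per element of array column a
select a, item from tab_arr lateral view explode(a) t as item;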

(2) Map type

Sample data:

rob     age:10,tel:87654321
amy     age:17,tel:09876543,addr:shanghai
bob     age:18,tel:98765432,addr:beijing

SQL:

drop table if exists tab_map;
create table if not exists tab_map(name string,info map<string,string>) 
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':';
load data local inpath '/home/conkeyn/jar/map.txt' into table tab_map;
select info['age'] from tab_map;
insert into table tab_map select oaddr,map('oareacode',oareacode,'daddr',daddr) from tab_cdr;
select * from tab_map limit 10;
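
A map column can likewise be exploded into key/value rows; a sketch against tab_map:

--one output row per map entry
select name, k, v from tab_map lateral view explode(info) t as k, v;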

(3) Struct type

Sample data:

rob     10,87654321
amy     17,09876543,shanghai
bob     18,98765432,beijing

SQL:

drop table if exists tab_struct;
create table if not exists tab_struct(name string,info struct<age:int,tel:string,addr:string>) 
row format delimited fields terminated by '\t'
collection items terminated by ',';
load data local inpath '/home/conkeyn/jar/struct.txt' into table tab_struct;
select * from tab_struct limit 10;
select info.age from tab_struct;
insert into table tab_struct select oaddr,named_struct('age',0,'tel',daddr,'addr',dareacode) from tab_cdr;
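
Struct fields can also appear in predicates; for example, filtering on info.age (a sketch):

select name, info.tel from tab_struct where info.age >= 17;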

7. Queries

(1) Joins

 

--table joins
drop table if exists tab_areaname;
create table if not exists tab_areaname(code string,name string) 
row format delimited fields terminated by '\t';
load data local inpath '/home/conkeyn/jar/areaname.txt' overwrite into table tab_areaname;
select * from tab_areaname;

drop table if exists tab_res1;
create table if not exists tab_res1(oaddr string ,cityname string);
insert into table tab_res1 select a.oaddr,b.name from tab_cdr a inner join tab_areaname b on a.oareacode = b.code;
--map join
--a map join only pays off when the table named in mapjoin(table_name) is small enough to be loaded into memory
insert into table tab_res1 select /*+ mapjoin(a) */ b.oaddr,a.name from  tab_cdr b inner join tab_areaname a on a.code = b.oareacode;

--three-table join
drop table if exists tab_user;
create table tab_user(addr string);
insert overwrite table tab_user select distinct(oaddr) from tab_cdr limit 100;
insert into table tab_res1 select /*+ mapjoin(a) */ b.oaddr,a.name from  tab_cdr b inner join tab_areaname a on a.code = b.oareacode inner join tab_user c on c.addr = b.oaddr;
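
Hive of this vintage does not accept IN/EXISTS subqueries in WHERE; the idiomatic substitute is LEFT SEMI JOIN, which keeps the rows of the left table that have a match on the right. A sketch using the tables above:

--rows of tab_cdr whose oaddr appears in tab_user
select b.oaddr from tab_cdr b left semi join tab_user c on (b.oaddr = c.addr);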

 

Read rows from one table and insert them into several targets in a single pass (the example below writes to different partitions of the same table).

drop table if exists tab_indb_tmp; 
create table tab_indb_tmp(oaddr string,oareacode string,daddr string,dareacode string ,ts string, type string) row format delimited fields terminated by ',';
drop table if exists tab_indb; 
create table tab_indb(oaddr string,oareacode string,daddr string,dareacode string ,ts string, type string) partitioned by (date string) stored as rcfile;
--multi-insert: one scan of the source, several partition targets
from tab_indb_tmp insert into table tab_indb partition(date="1st") select * where substr(ts,7,2)<="10"
insert into table tab_indb partition(date="2nd") select * where substr(ts,7,2) between "11" and "20"
insert into table tab_indb partition(date="3rd") select * where substr(ts,7,2)>"20";
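
The same split can be expressed with dynamic partitioning, letting Hive derive the partition value from the last select column; a sketch, assuming dynamic partitioning is enabled:

--dynamic-partition alternative to the multi-insert above
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table tab_indb partition(date)
select oaddr,oareacode,daddr,dareacode,ts,type,
       case when substr(ts,7,2)<="10" then "1st"
            when substr(ts,7,2) between "11" and "20" then "2nd"
            else "3rd" end as date
from tab_indb_tmp;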

 


8. Advanced queries and complex operations

Hive SQL also offers more advanced query features for analyzing large data sets efficiently. The examples below cover window functions, conditional aggregation, data transformation, common table expressions, and partition/bucket tuning.

(1) Window functions

Window functions are powerful for ranking, running totals, and sliding-window statistics. For example, ROW_NUMBER() numbers each user's transactions by time:

select user_id, transaction_date, amount,
       row_number() over (partition by user_id order by transaction_date) as rn
from transactions;

Combining SUM() with OVER() yields a running total, e.g. a user's cumulative spend:

select user_id, transaction_date, amount,
       sum(amount) over (partition by user_id order by transaction_date) as cumulative_amount
from transactions;

(2) Conditional aggregation

Conditional aggregation computes grouped statistics under different conditions in a single pass and is commonly used to build reports. For example, with CASE WHEN:

select department,
       sum(case when gender = 'male' then salary else 0 end) as male_total_salary,
       sum(case when gender = 'female' then salary else 0 end) as female_total_salary
from employees
group by department;

(3) Data transformation

Hive provides transformation functions such as split() and explode() for nested or complex values. Because a UDTF like explode() cannot be mixed with ordinary select columns, splitting a comma-separated string into one row per value uses LATERAL VIEW:

select id, interest
from user_interests lateral view explode(split(interests, ',')) t as interest;

concat() and regexp_replace() handle string concatenation and substitution:

select id,
       concat(first_name, ' ', last_name) as full_name,
       regexp_replace(email, '@example.com', '@newdomain.com') as new_email
from users;

(4) Common table expressions

Hive supports non-recursive common table expressions (the WITH clause) starting with version 0.13. Recursive CTEs (WITH RECURSIVE) are not supported by Hive; the standard-SQL example below, which finds all leaf nodes of a tree, runs only on engines that do support them (e.g. PostgreSQL). In Hive itself, tree traversal is usually rewritten as a fixed number of iterative self-joins.

with recursive node_tree as (
  select id, parent_id from nodes where parent_id is null
  union all
  select n.id, n.parent_id from nodes n inner join node_tree nt on n.parent_id = nt.id
)
select id from node_tree
where id not in (select parent_id from nodes where parent_id is not null);

(5) Partitioning and bucketing

To improve query performance, Hive supports partitioning and bucketing. For example, log data partitioned by date:

create table logs (log_id int, message string) partitioned by (log_date string);

Specify the partition when inserting:

insert into table logs partition (log_date='2023-10-01')
select log_id, message from temp_logs;

Bucketing refines the data layout further and speeds up joins and aggregations:

create table users (user_id int, name string) clustered by (user_id) into 4 buckets;

(6) Skewed join optimization

A skewed join between a large and a small table can be optimized by salting: append a random suffix to the join key of the large table, and expand the small table once per possible suffix so that every salted key finds its match:

select t1.key, t2.value
from (select key, concat(key, '_', cast(rand() * 10 as int)) as salted_key
      from large_table) t1
join (select concat(key, '_', s.salt) as salted_key, value
      from small_table
      lateral view explode(array(0,1,2,3,4,5,6,7,8,9)) s as salt) t2
on t1.salted_key = t2.salted_key;