Hive - function & complex type & partition
Function
json_tuple
json_tuple is similar to get_json_object, but it can extract several values in a single call, which is more efficient.
A new json_tuple() UDTF is introduced in Hive 0.7. It takes a set of names (keys) and a JSON string, and returns a tuple of values using one function. This is much more efficient than calling GET_JSON_OBJECT to retrieve more than one key from a single JSON string. In any case where a single JSON string would be parsed more than once, your query will be more efficient if you parse it once, which is what JSON_TUPLE is for. As JSON_TUPLE is a UDTF, you will need to use the LATERAL VIEW syntax in order to achieve the same goal.
For example,
select a.timestamp, get_json_object(a.appevents, '$.eventid'), get_json_object(a.appevents, '$.eventname') from log a;
should be changed to:
select a.timestamp, b.* from log a lateral view json_tuple(a.appevents, 'eventid', 'eventname') b as f1, f2;
Prepare the data: movie_rate.log
{"rate":"7.5","cover_x":2250,"title":"向往的生活 第三季","url":"https://movie.douban.com/subject/30441625/","playable":true,"cover":"https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2553367338.webp","id":"30441625","cover_y":4000,"is_new":false}
{"rate":"5.6","cover_x":1080,"title":"中国新说唱 第二季","url":"https://movie.douban.com/subject/30357985/","playable":true,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2556109024.webp","id":"30357985","cover_y":1920,"is_new":false}
{"rate":"8.2","cover_x":800,"title":"吹落的树叶","url":"https://movie.douban.com/subject/30438479/","playable":false,"cover":"https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2550315269.webp","id":"30438479","cover_y":1131,"is_new":false}
{"rate":"3.1","cover_x":1080,"title":"流淌的美好时光","url":"https://movie.douban.com/subject/27031380/","playable":true,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2560670445.webp","id":"27031380","cover_y":1656,"is_new":true}
{"rate":"8.6","cover_x":750,"title":"请输入搜索词:WWW","url":"https://movie.douban.com/subject/30403333/","playable":false,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2556419121.webp","id":"30403333","cover_y":1062,"is_new":false}
{"rate":"8.8","cover_x":694,"title":"名姝 第三季","url":"https://movie.douban.com/subject/30333874/","playable":false,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2562273006.webp","id":"30333874","cover_y":1000,"is_new":false}
{"rate":"8.3","cover_x":945,"title":"白色强人","url":"https://movie.douban.com/subject/27195042/","playable":false,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2559049945.webp","id":"27195042","cover_y":1350,"is_new":false}
{"rate":"8.5","cover_x":803,"title":"春夜","url":"https://movie.douban.com/subject/30428225/","playable":false,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2554598542.webp","id":"30428225","cover_y":1200,"is_new":false}
{"rate":"8.7","cover_x":1688,"title":"圆桌派 第四季","url":"https://movie.douban.com/subject/30269908/","playable":true,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2558848644.webp","id":"30269908","cover_y":3000,"is_new":false}
{"rate":"8.2","cover_x":1013,"title":"亢奋 第一季","url":"https://movie.douban.com/subject/30181462/","playable":false,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2556252082.webp","id":"30181462","cover_y":1500,"is_new":false}
{"rate":"7.2","cover_x":1080,"title":"心动的信号 第二季","url":"https://movie.douban.com/subject/30458142/","playable":true,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2562668422.webp","id":"30458142","cover_y":1920,"is_new":false}
# Create the raw data table
CREATE TABLE movie_raw (data string);
# Load the data
LOAD DATA LOCAL INPATH '/hive/movie_rate.log' OVERWRITE INTO TABLE movie_raw;
# Extract the JSON fields from the raw table
CREATE TABLE movie_rate AS SELECT json_tuple(data,'id','title','rate','url') AS (id,title,rate,url) FROM movie_raw;
# Check the extracted data
SELECT * FROM movie_rate;
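When json_tuple has to be combined with other select expressions, LATERAL VIEW is required, as the quoted docs note. A minimal sketch against movie_raw, equivalent to the CTAS above:
# Same extraction via LATERAL VIEW; this form lets you select other columns of movie_raw alongside the parsed fields
SELECT b.id, b.title, b.rate, b.url
FROM movie_raw a
LATERAL VIEW json_tuple(a.data,'id','title','rate','url') b AS id, title, rate, url;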
parse_url_tuple
parse_url_tuple is similar to parse_url, but it can extract the individual parts of a given URL precisely and return them as a tuple; all output columns are of type string.
The parse_url_tuple() UDTF is similar to parse_url(), but can extract multiple parts of a given URL, returning the data in a tuple. Values for a particular key in QUERY can be extracted by appending a colon and the key to the partToExtract argument, for example, parse_url_tuple('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY:k1', 'QUERY:k2') returns a tuple with values of 'v1','v2'. This is more efficient than calling parse_url() multiple times. All the input parameters and output column types are string.
SELECT b.* FROM src LATERAL VIEW parse_url_tuple(fullurl, 'HOST', 'PATH', 'QUERY', 'QUERY:id') b as host, path, query, query_id LIMIT 1;
SELECT parse_url_tuple(url,'HOST','PATH','QUERY') AS (base_url,relative_path,parameter) FROM movie_rate;
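To pull individual query-string parameters, append QUERY:&lt;key&gt; as described in the quote above. A sketch that reuses the documentation's example URL as a constant, since the movie URLs carry no query string:
# QUERY:<key> extraction; the literal URL is the docs' example, evaluated per row, so LIMIT 1 keeps the output small
SELECT b.* FROM movie_rate
LATERAL VIEW parse_url_tuple('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1','HOST','QUERY:k1','QUERY:k2') b AS host, k1, k2
LIMIT 1;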
LanguageManual WindowingAndAnalytics (window functions)
Official docs: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+WindowingAndAnalytics
Prepare the data: emp.log
7369,SMITH,CLERK,7902,1980-12-17,800,,20
7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300,30
7521,WARD,SALESMAN,7698,1981-02-22,1250,500,30
7566,JONES,MANAGER,7839,1981-04-02,2975,,20
7654,MARTIN,SALESMAN,7698,1981-09-28,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-05-01,2850,,30
7782,CLARK,MANAGER,7839,1981-06-09,2450,,10
7788,SCOTT,ANALYST,7566,1987-04-19,3000,,20
7839,KING,PRESIDENT,,1981-11-17,5000,,10
7844,TURNER,SALESMAN,7698,1981-09-08,1500,0,30
7876,ADAMS,CLERK,7788,1987-05-23,1100,,20
7900,JAMES,CLERK,7698,1981-12-03,950,,30
7902,FORD,ANALYST,7566,1981-12-03,3000,,20
7934,MILLER,CLERK,7782,1982-01-23,1300,,10
CREATE TABLE IF NOT EXISTS hive_emp(EMPNO int,ENAME string,JOB string,MGR int,HIREDATE date,SAL int,COMM int,DEPTNO int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/hive/emp.log' OVERWRITE INTO TABLE hive_emp;
SELECT * FROM hive_emp;
Task: return the top 2 salaries in each department
SELECT deptno,empno,ename,sal FROM
(
SELECT deptno,empno,ename,sal,ROW_NUMBER() OVER(PARTITION BY deptno ORDER BY sal DESC) rn FROM hive_emp
) tmp WHERE tmp.rn<=2;
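Note that ROW_NUMBER assigns distinct ranks even to tied salaries: in department 20 SCOTT and FORD both earn 3000, so they take ranks 1 and 2 and JONES (2975) is excluded. If the goal is instead the top 2 distinct salary levels, DENSE_RANK is a drop-in alternative; a minimal sketch:
# DENSE_RANK lets ties share a rank, so deptno 20 returns SCOTT, FORD and JONES
SELECT deptno,empno,ename,sal FROM
(
SELECT deptno,empno,ename,sal,DENSE_RANK() OVER(PARTITION BY deptno ORDER BY sal DESC) rn FROM hive_emp
) tmp WHERE tmp.rn<=2;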
beeline
beeline is the connection method recommended by Hive and is used together with HiveServer2 (HS2); the old CLI may eventually be phased out.
beeline --help
# ---output
Usage: java org.apache.hive.cli.beeline.BeeLine
-u <database url> the JDBC URL to connect to
-n <username> the username to connect as
-p <password> the password to connect as
-d <driver class> the driver class to use
-i <init file> script file for initialization
-e <query> query that should be executed
-f <exec file> script file that should be executed
-w (or) --password-file <password file> the password file to read password from
--hiveconf property=value Use value for given property
--hivevar name=value hive variable name and value
This is Hive specific settings in which variables
can be set at session level and referenced in Hive
commands or queries.
--color=[true/false] control whether color is used for display
--showHeader=[true/false] show column names in query results
--headerInterval=ROWS; the interval between which headers are displayed
--fastConnect=[true/false] skip building table/column list for tab-completion
--autoCommit=[true/false] enable/disable automatic transaction commit
--verbose=[true/false] show verbose error messages and debug info
--showWarnings=[true/false] display connection warnings
--showNestedErrs=[true/false] display nested errors
--numberFormat=[pattern] format numbers using DecimalFormat pattern
--force=[true/false] continue running script even after errors
--maxWidth=MAXWIDTH the maximum width of the terminal
--maxColumnWidth=MAXCOLWIDTH the maximum width to use when displaying columns
--silent=[true/false] be more silent
--autosave=[true/false] automatically save preferences
--outputformat=[table/vertical/csv2/tsv2/dsv/csv/tsv] format mode for result display
Note that csv, and tsv are deprecated - use csv2, tsv2 instead
--truncateTable=[true/false] truncate table column when it exceeds length
--delimiterForDSV=DELIMITER specify the delimiter for delimiter-separated values output format (default: |)
--isolation=LEVEL set the transaction isolation level
--nullemptystring=[true/false] set to true to get historic behavior of printing null as empty string
--help display this message
# Start HiveServer2
hiveserver2
# Start beeline
beeline
# Connect to the local database (the default one); you will be prompted for a username and password
!connect jdbc:hive2://hadoop001:10000
# Or, in a single command
beeline -u jdbc:hive2://hadoop001:10000/hive_test -n hadoop
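The -e and --outputformat options listed above combine into one-shot scripted queries; a sketch, assuming the hive_emp table created earlier lives in the hive_test database:
# Run a single query non-interactively and print CSV
beeline -u jdbc:hive2://hadoop001:10000/hive_test -n hadoop -e "SELECT deptno,ename,sal FROM hive_emp" --outputformat=csv2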
Complex Types
- arrays: ARRAY<data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)
- maps: MAP<primitive_type, data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)
- structs: STRUCT<col_name : data_type [COMMENT col_comment], ...>
- union: UNIONTYPE<data_type, data_type, ...> (Note: Only available starting with Hive 0.7.0.)
array
Prepare the data: array.log
zhangsan football,swimming,game
lisi basketball,ride bike,reading,game
# Create the table
CREATE TABLE t_array(name string,hobby array<string>)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',';
# Load the data
LOAD DATA LOCAL INPATH '/hive/array.log' OVERWRITE INTO TABLE t_array;
SELECT * FROM t_array;
# Access an element by index; indices start at 0
SELECT hobby[0],hobby[1] FROM t_array;
# Find rows whose array contains a given value
SELECT * FROM t_array WHERE array_contains(hobby,'basketball');
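An array can also be flattened to one row per element with the explode UDTF plus LATERAL VIEW; a minimal sketch (single_hobby is just an illustrative alias):
# Flatten the array: one output row per (name, hobby) pair
SELECT name, single_hobby FROM t_array LATERAL VIEW explode(hobby) tmp AS single_hobby;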
map
Prepare the data: map.log
zhangsan father:lisi&mother:wangwu&sister:zhaoliu 20
lisi father:zhangsan&mother:xiaoba 56
CREATE TABLE t_map(name string,family map<string,string>,age int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '&'
MAP KEYS TERMINATED BY ':';
LOAD DATA LOCAL INPATH '/hive/map.log' OVERWRITE INTO TABLE t_map;
SELECT * FROM t_map;
# Get the array of keys in the map
SELECT name,map_keys(family) FROM t_map;
# Get the array of values in the map
SELECT name,map_values(family) FROM t_map;
# Find rows whose map contains a given key
SELECT * FROM t_map WHERE array_contains(map_keys(family),'sister');
# Get the value for a given key in the map
SELECT name,family['father'],family['mother'],family['sister'] FROM t_map;
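A map flattens the same way; explode over a map yields two columns. A minimal sketch (relation and member are illustrative aliases):
# Flatten the map: one output row per (name, relation, member)
SELECT name, relation, member FROM t_map LATERAL VIEW explode(family) tmp AS relation, member;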
struct
Prepare the data: struct.log
1 apple&6&100&50
2 origin&4&50&35
CREATE TABLE t_struct(id int,data struct<name:string,price:int,store:int,sale:int>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '&';
LOAD DATA LOCAL INPATH '/hive/struct.log' OVERWRITE INTO TABLE t_struct;
SELECT * FROM t_struct;
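Struct fields are read with dot notation; a minimal sketch against t_struct (the derived column remaining is illustrative):
# Access struct fields with dot notation; remaining = stock minus sales
SELECT id, data.name, data.price, data.store - data.sale AS remaining FROM t_struct;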
Partition
Once a single table holds too much data, queries become slow, which is why partitions are introduced. A partitioned table is still one table, but queries can target a specific partition, which improves query efficiency.
# Specify partitions at table-creation time with PARTITIONED BY
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name -- (Note: TEMPORARY available in Hive 0.14.0 and later)
[(col_name data_type [COMMENT col_comment], ... [constraint_specification])]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[SKEWED BY (col_name, col_name, ...) -- (Note: Available in Hive 0.10.0 and later)]
ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
[STORED AS DIRECTORIES]
[
[ROW FORMAT row_format]
[STORED AS file_format]
| STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)] -- (Note: Available in Hive 0.6.0 and later)
]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)] -- (Note: Available in Hive 0.6.0 and later)
[AS select_statement]; -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)
# Add partitions to a table
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
# Drop partitions
ALTER TABLE table_name DROP [IF EXISTS] PARTITION partition_spec[, PARTITION partition_spec, ...]
[IGNORE PROTECTION] [PURGE]; -- (Note: PURGE available in Hive 1.2.0 and later, IGNORE PROTECTION not available 2.0.0 and later)
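Concrete forms of the two statements, using the emp_partition_static table created below (deptno=40 is an illustrative value):
# Add then drop a partition by hand
ALTER TABLE emp_partition_static ADD IF NOT EXISTS PARTITION (deptno=40);
ALTER TABLE emp_partition_static DROP IF EXISTS PARTITION (deptno=40);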
# Partitions created directly on HDFS are invisible to the Hive metastore, so run a metastore check (MSCK) to refresh the metadata
MSCK [REPAIR] TABLE table_name [ADD/DROP/SYNC PARTITIONS];
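For example, after creating a partition directory by hand (the path assumes the default warehouse location shown later in this section):
# Create a partition directory directly on HDFS, then sync the metastore
hadoop fs -mkdir -p /user/hive/warehouse/emp_partition_static/deptno=40
MSCK REPAIR TABLE emp_partition_static;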
# Partition columns must not repeat the table's columns, otherwise the error: Column repeated in partitioning columns
# Create a dynamically partitioned table
# Dynamic partitioning requires hive.exec.dynamic.partition.mode=nonstrict:
# SET hive.exec.dynamic.partition.mode=nonstrict;
# Otherwise the insert fails with: Dynamic partition strict mode requires at least one static partition column
# With dynamic partitioning no partition value is named: the PARTITION clause lists only the column, and the SELECT must supply it as its last column
CREATE TABLE IF NOT EXISTS hive_emp_partition
(EMPNO int,ENAME string,JOB string,MGR int,HIREDATE date,SAL int,COMM int)
PARTITIONED BY(DEPTNO int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
# Dynamic partition insert
INSERT OVERWRITE TABLE hive_emp_partition partition(DEPTNO)
SELECT EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO FROM hive_emp;
# Create a statically partitioned table
CREATE TABLE emp_partition_static(
empno int,ename string,job string,mgr int,hiredate date,sal int,comm int
)
PARTITIONED BY (deptno int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
# A static partition insert must name the target partition
# Partition path: /user/hive/warehouse/emp_partition_static/deptno=30
INSERT INTO TABLE emp_partition_static PARTITION(deptno=30)
SELECT empno,ename,job,mgr,hiredate,sal,comm FROM hive_emp WHERE deptno=30;
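To verify the partitions after the inserts, a quick check:
# List partitions, then query one directly
SHOW PARTITIONS emp_partition_static;
SELECT * FROM emp_partition_static WHERE deptno=30;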