Hive - function & complex type & partition
Function
json_tuple
json_tuple is similar to get_json_object, but it can extract several values in a single call, which is more efficient.
A new json_tuple() UDTF is introduced in Hive 0.7. It takes a set of names (keys) and a JSON string, and returns a tuple of values using one function. This is much more efficient than calling GET_JSON_OBJECT to retrieve more than one key from a single JSON string. In any case where a single JSON string would be parsed more than once, your query will be more efficient if you parse it once, which is what JSON_TUPLE is for. As JSON_TUPLE is a UDTF, you will need to use the LATERAL VIEW syntax in order to achieve the same goal.
For example,
select a.timestamp, get_json_object(a.appevents, '$.eventid'), get_json_object(a.appevents, '$.eventname') from log a;
should be changed to:
select a.timestamp, b.* from log a lateral view json_tuple(a.appevents, 'eventid', 'eventname') b as f1, f2;
Prepare the data: movie_rate.log
{"rate":"7.5","cover_x":2250,"title":"向往的生活 第三季","url":"https://movie.douban.com/subject/30441625/","playable":true,"cover":"https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2553367338.webp","id":"30441625","cover_y":4000,"is_new":false}
{"rate":"5.6","cover_x":1080,"title":"中国新说唱 第二季","url":"https://movie.douban.com/subject/30357985/","playable":true,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2556109024.webp","id":"30357985","cover_y":1920,"is_new":false}
{"rate":"8.2","cover_x":800,"title":"吹落的树叶","url":"https://movie.douban.com/subject/30438479/","playable":false,"cover":"https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2550315269.webp","id":"30438479","cover_y":1131,"is_new":false}
{"rate":"3.1","cover_x":1080,"title":"流淌的美好时光","url":"https://movie.douban.com/subject/27031380/","playable":true,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2560670445.webp","id":"27031380","cover_y":1656,"is_new":true}
{"rate":"8.6","cover_x":750,"title":"请输入搜索词:WWW","url":"https://movie.douban.com/subject/30403333/","playable":false,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2556419121.webp","id":"30403333","cover_y":1062,"is_new":false}
{"rate":"8.8","cover_x":694,"title":"名姝 第三季","url":"https://movie.douban.com/subject/30333874/","playable":false,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2562273006.webp","id":"30333874","cover_y":1000,"is_new":false}
{"rate":"8.3","cover_x":945,"title":"白色强人","url":"https://movie.douban.com/subject/27195042/","playable":false,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2559049945.webp","id":"27195042","cover_y":1350,"is_new":false}
{"rate":"8.5","cover_x":803,"title":"春夜","url":"https://movie.douban.com/subject/30428225/","playable":false,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2554598542.webp","id":"30428225","cover_y":1200,"is_new":false}
{"rate":"8.7","cover_x":1688,"title":"圆桌派 第四季","url":"https://movie.douban.com/subject/30269908/","playable":true,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2558848644.webp","id":"30269908","cover_y":3000,"is_new":false}
{"rate":"8.2","cover_x":1013,"title":"亢奋 第一季","url":"https://movie.douban.com/subject/30181462/","playable":false,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2556252082.webp","id":"30181462","cover_y":1500,"is_new":false}
{"rate":"7.2","cover_x":1080,"title":"心动的信号 第二季","url":"https://movie.douban.com/subject/30458142/","playable":true,"cover":"https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2562668422.webp","id":"30458142","cover_y":1920,"is_new":false}
# Create the raw data table
CREATE TABLE movie_raw (data string);
# Load the data
LOAD DATA LOCAL INPATH '/hive/movie_rate.log' OVERWRITE INTO TABLE movie_raw;
# Extract the JSON fields from the raw table
CREATE TABLE movie_rate AS SELECT json_tuple(data,'id','title','rate','url') AS (id,title,rate,url) FROM movie_raw;
# Check the extracted data
SELECT * FROM movie_rate;
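When json_tuple has to be combined with other select expressions, LATERAL VIEW is required, as the quoted docs note. A minimal sketch against movie_raw, equivalent to the CTAS above:
# Same extraction via LATERAL VIEW; this form lets you select other columns of movie_raw alongside the parsed fields
SELECT b.id, b.title, b.rate, b.url
FROM movie_raw a
LATERAL VIEW json_tuple(a.data,'id','title','rate','url') b AS id, title, rate, url;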
parse_url_tuple
parse_url_tuple is similar to parse_url, but it can extract the individual parts of a given URL precisely and return them as a tuple; all output columns are of type string.
The parse_url_tuple() UDTF is similar to parse_url(), but can extract multiple parts of a given URL, returning the data in a tuple. Values for a particular key in QUERY can be extracted by appending a colon and the key to the partToExtract argument, for example, parse_url_tuple('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY:k1', 'QUERY:k2') returns a tuple with values of 'v1','v2'. This is more efficient than calling parse_url() multiple times. All the input parameters and output column types are string.
SELECT b.* FROM src LATERAL VIEW parse_url_tuple(fullurl, 'HOST', 'PATH', 'QUERY', 'QUERY:id') b as host, path, query, query_id LIMIT 1;
SELECT parse_url_tuple(url,'HOST','PATH','QUERY') AS (base_url,relative_path,parameter) FROM movie_rate;
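To pull individual query-string parameters, append QUERY:&lt;key&gt; as described in the quote above. A sketch that reuses the documentation's example URL as a constant, since the movie URLs carry no query string:
# QUERY:<key> extraction; the literal URL is the docs' example, evaluated per row, so LIMIT 1 keeps the output small
SELECT b.* FROM movie_rate
LATERAL VIEW parse_url_tuple('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1','HOST','QUERY:k1','QUERY:k2') b AS host, k1, k2
LIMIT 1;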
LanguageManual WindowingAndAnalytics (window functions)
Official docs: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+WindowingAndAnalytics
Prepare the data: emp.log
7369,SMITH,CLERK,7902,1980-12-17,800,,20
7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300,30
7521,WARD,SALESMAN,7698,1981-02-22,1250,500,30
7566,JONES,MANAGER,7839,1981-04-02,2975,,20
7654,MARTIN,SALESMAN,7698,1981-09-28,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-05-01,2850,,30
7782,CLARK,MANAGER,7839,1981-06-09,2450,,10
7788,SCOTT,ANALYST,7566,1987-04-19,3000,,20
7839,KING,PRESIDENT,,1981-11-17,5000,,10
7844,TURNER,SALESMAN,7698,1981-09-08,1500,0,30
7876,ADAMS,CLERK,7788,1987-05-23,1100,,20
7900,JAMES,CLERK,7698,1981-12-03,950,,30
7902,FORD,ANALYST,7566,1981-12-03,3000,,20
7934,MILLER,CLERK,7782,1982-01-23,1300,,10
CREATE TABLE IF NOT EXISTS hive_emp(EMPNO int,ENAME string,JOB string,MGR int,HIREDATE date,SAL int,COMM int,DEPTNO int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/hive/emp.log' OVERWRITE INTO TABLE hive_emp;
SELECT * FROM hive_emp;
Task: return the top 2 salaries in each department
SELECT deptno,empno,ename,sal FROM
(
SELECT deptno,empno,ename,sal,ROW_NUMBER() OVER(PARTITION BY deptno ORDER BY sal DESC) rn FROM hive_emp
) tmp WHERE tmp.rn<=2;
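Note that ROW_NUMBER assigns distinct ranks even to tied salaries: in department 20 SCOTT and FORD both earn 3000, so they take ranks 1 and 2 and JONES (2975) is excluded. If the goal is instead the top 2 distinct salary levels, DENSE_RANK is a drop-in alternative; a minimal sketch:
# DENSE_RANK lets ties share a rank, so deptno 20 returns SCOTT, FORD and JONES
SELECT deptno,empno,ename,sal FROM
(
SELECT deptno,empno,ename,sal,DENSE_RANK() OVER(PARTITION BY deptno ORDER BY sal DESC) rn FROM hive_emp
) tmp WHERE tmp.rn<=2;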
beeline
beeline is the connection method recommended by Hive and is used together with HiveServer2 (HS2); the old CLI may eventually be phased out.
beeline --help
# ---output
Usage: java org.apache.hive.cli.beeline.BeeLine
-u <database url> the JDBC URL to connect to
-n <username> the username to connect as
-p <password> the password to connect as
-d <driver class> the driver class to use
-i <init file> script file for initialization
-e <query> query that should be executed
-f <exec file> script file that should be executed
-w (or) --password-file <password file> the password file to read password from
--hiveconf property=value Use value for given property
--hivevar name=value hive variable name and value
This is Hive specific settings in which variables
can be set at session level and referenced in Hive
commands or queries.
--color=[true/false] control whether color is used for display
--showHeader=[true/false] show column names in query results
--headerInterval=ROWS; the interval between which headers are displayed
--fastConnect=[true/false] skip building table/column list for tab-completion
--autoCommit=[true/false] enable/disable automatic transaction commit
--verbose=[true/false] show verbose error messages and debug info
--showWarnings=[true/false] display connection warnings
--showNestedErrs=[true/false] display nested errors
--numberFormat=[pattern] format numbers using DecimalFormat pattern
--force=[true/false] continue running script even after errors
--maxWidth=MAXWIDTH the maximum width of the terminal
--maxColumnWidth=MAXCOLWIDTH the maximum width to use when displaying columns
--silent=[true/false] be more silent
--autosave=[true/false] automatically save preferences
--outputformat=[table/vertical/csv2/tsv2/dsv/csv/tsv] format mode for result display
Note that csv, and tsv are deprecated - use csv2, tsv2 instead
--truncateTable=[true/false] truncate table column when it exceeds length
--delimiterForDSV=DELIMITER specify the delimiter for delimiter-separated values output format (default: |)
--isolation=LEVEL set the transaction isolation level
--nullemptystring=[true/false] set to true to get historic behavior of printing null as empty string
--help display this message
# Start HiveServer2
hiveserver2
# Start beeline
beeline
# Connect to the local database (the default one); you will be prompted for a username and password
!connect jdbc:hive2://hadoop001:10000
# Or, in a single command
beeline -u jdbc:hive2://hadoop001:10000/hive_test -n hadoop
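The -e and --outputformat options listed above combine into one-shot scripted queries; a sketch, assuming the hive_emp table created earlier lives in the hive_test database:
# Run a single query non-interactively and print CSV
beeline -u jdbc:hive2://hadoop001:10000/hive_test -n hadoop -e "SELECT deptno,ename,sal FROM hive_emp" --outputformat=csv2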
Complex Types
- arrays: ARRAY<data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)
- maps: MAP<primitive_type, data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)
- structs: STRUCT<col_name : data_type [COMMENT col_comment], ...>
- union: UNIONTYPE<data_type, data_type, ...> (Note: Only available starting with Hive 0.7.0.)
array
Prepare the data: array.log
zhangsan football,swimming,game
lisi basketball,ride bike,reading,game
# Create the table
CREATE TABLE t_array(name string,hobby array<string>)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',';
# Load the data
LOAD DATA LOCAL INPATH '/hive/array.log' OVERWRITE INTO TABLE t_array;
SELECT * FROM t_array;
# Access an element by index; indices start at 0
SELECT hobby[0],hobby[1] FROM t_array;
# Find rows whose array contains a given value
SELECT * FROM t_array WHERE array_contains(hobby,'basketball');
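An array can also be flattened to one row per element with the explode UDTF plus LATERAL VIEW; a minimal sketch (single_hobby is just an illustrative alias):
# Flatten the array: one output row per (name, hobby) pair
SELECT name, single_hobby FROM t_array LATERAL VIEW explode(hobby) tmp AS single_hobby;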
map
Prepare the data: map.log
zhangsan father:lisi&mother:wangwu&sister:zhaoliu 20
lisi father:zhangsan&mother:xiaoba 56
CREATE TABLE t_map(name string,family map<string,string>,age int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '&'
MAP KEYS TERMINATED BY ':';
LOAD DATA LOCAL INPATH '/hive/map.log' OVERWRITE INTO TABLE t_map;
SELECT * FROM t_map;
# Get the array of keys in the map
SELECT name,map_keys(family) FROM t_map;
# Get the array of values in the map
SELECT name,map_values(family) FROM t_map;
# Find rows whose map contains a given key
SELECT * FROM t_map WHERE array_contains(map_keys(family),'sister');
# Get the value for a given key in the map
SELECT name,family['father'],family['mother'],family['sister'] FROM t_map;
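A map flattens the same way; explode over a map yields two columns. A minimal sketch (relation and member are illustrative aliases):
# Flatten the map: one output row per (name, relation, member)
SELECT name, relation, member FROM t_map LATERAL VIEW explode(family) tmp AS relation, member;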
struct
Prepare the data: struct.log
1 apple&6&100&50
2 origin&4&50&35
CREATE TABLE t_struct(id int,data struct<name:string,price:int,store:int,sale:int>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '&';
LOAD DATA LOCAL INPATH '/hive/struct.log' OVERWRITE INTO TABLE t_struct;
SELECT * FROM t_struct;
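Struct fields are read with dot notation; a minimal sketch against t_struct (the derived column remaining is illustrative):
# Access struct fields with dot notation; remaining = stock minus sales
SELECT id, data.name, data.price, data.store - data.sale AS remaining FROM t_struct;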
Partition
Once a single table holds too much data, queries become slow, which is why partitions are introduced. A partitioned table is still one table, but queries can target a specific partition, which improves query efficiency.
# Specify partitions at table-creation time with PARTITIONED BY
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name -- (Note: TEMPORARY available in Hive 0.14.0 and later)
[(col_name data_type [COMMENT col_comment], ... [constraint_specification])]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[SKEWED BY (col_name, col_name, ...) -- (Note: Available in Hive 0.10.0 and later)]
ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
[STORED AS DIRECTORIES]
[
[ROW FORMAT row_format]
[STORED AS file_format]
| STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)] -- (Note: Available in Hive 0.6.0 and later)
]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)] -- (Note: Available in Hive 0.6.0 and later)
[AS select_statement]; -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)
# Add partitions to a table
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
# Drop partitions
ALTER TABLE table_name DROP [IF EXISTS] PARTITION partition_spec[, PARTITION partition_spec, ...]
[IGNORE PROTECTION] [PURGE]; -- (Note: PURGE available in Hive 1.2.0 and later, IGNORE PROTECTION not available 2.0.0 and later)
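Concrete forms of the two statements, using the emp_partition_static table created below (deptno=40 is an illustrative value):
# Add then drop a partition by hand
ALTER TABLE emp_partition_static ADD IF NOT EXISTS PARTITION (deptno=40);
ALTER TABLE emp_partition_static DROP IF EXISTS PARTITION (deptno=40);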
# Partitions created directly on HDFS are invisible to the Hive metastore, so run a metastore check (MSCK) to refresh the metadata
MSCK [REPAIR] TABLE table_name [ADD/DROP/SYNC PARTITIONS];
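For example, after creating a partition directory by hand (the path assumes the default warehouse location shown later in this section):
# Create a partition directory directly on HDFS, then sync the metastore
hadoop fs -mkdir -p /user/hive/warehouse/emp_partition_static/deptno=40
MSCK REPAIR TABLE emp_partition_static;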
# Partition columns must not repeat the table's columns, otherwise the error: Column repeated in partitioning columns
# Create a dynamically partitioned table
# Dynamic partitioning requires hive.exec.dynamic.partition.mode=nonstrict:
# SET hive.exec.dynamic.partition.mode=nonstrict;
# Otherwise the insert fails with: Dynamic partition strict mode requires at least one static partition column
# With dynamic partitioning no partition value is named: the PARTITION clause lists only the column, and the SELECT must supply it as its last column
CREATE TABLE IF NOT EXISTS hive_emp_partition
(EMPNO int,ENAME string,JOB string,MGR int,HIREDATE date,SAL int,COMM int)
PARTITIONED BY(DEPTNO int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
# Dynamic partition insert
INSERT OVERWRITE TABLE hive_emp_partition partition(DEPTNO)
SELECT EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO FROM hive_emp;
# Create a statically partitioned table
CREATE TABLE emp_partition_static(
empno int,ename string,job string,mgr int,hiredate date,sal int,comm int
)
PARTITIONED BY (deptno int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
# A static partition insert must name the target partition
# Partition path: /user/hive/warehouse/emp_partition_static/deptno=30
INSERT INTO TABLE emp_partition_static PARTITION(deptno=30)
SELECT empno,ename,job,mgr,hiredate,sal,comm FROM hive_emp WHERE deptno=30;
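To verify the partitions after the inserts, a quick check:
# List partitions, then query one directly
SHOW PARTITIONS emp_partition_static;
SELECT * FROM emp_partition_static WHERE deptno=30;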