-Hive-3-基础

最新推荐文章于 2025-10-28 21:21:09 发布

原创最新推荐文章于 2025-10-28 21:21:09 发布 · 273 阅读

0 ·

CC 4.0 BY-SA版权

若泽大数据=hive 专栏收录该内容

5 篇文章

订阅专栏

1准备数据

rating_json
在这里插入图片描述

2 导入

hive

#创建原始数据表
create table rating_json(json string);
#导入数据
 load data local inpath '/home/hadoop/data/rating.json' overwite into table rating_json;
 #查看
 select * from rating_json limit 10;

3使用json_tuple函数解析json

  select json_tuple(json,'movie','rate','time','userid') as (movie_id,rate,time,user_id) from rating_json limit10；

4大宽表

将时间戳处理成带有年月日的新大宽表，后续的统计分析都是基于这个rating_width表进行

 create table rating_width as
    select
    movie_id,rate,time,user_id,
    year(from_unixtime(cast(time as bigint))) as year,
    month(from_unixtime(cast(time as bigint))) as month,
    day(from_unixtime(cast(time as bigint))) as day,
    hour(from_unixtime(cast(time as bigint))) as hour,
    minute(from_unixtime(cast(time as bigint))) as minute,
    from_unixtime(cast(time as bigint)) as ts
    from
    (
    select 
    json_tuple(json,'movie','rate','time','userid') as (movie_id,rate,time,user_id) 
    from rating_json
    ) tmp
    ;
    select * from rating_width limit 10;

5 top

窗口分析函数
需求：统计每种性别中年龄最大的两条数据
先根据性别分组，然后根据年龄做降序，取前2条

select id,age,name,gender,r
from
(
select id,age,name,gender,
ROW_NUMBER() over(PARTITION BY gender order by age desc) as r
from hive_rownumber
) t where t.r<=2;

6 beeline

beeline是hive推荐的连接方式,旧的连接可能会被淘汰,配合hiveserver2(HS2)一起使用

[hadoop@hadoop001 bin]$ ./beeline --help
which: no hbase in (/home/hadoop/app/hive/bin:/home/hadoop/app/hadoop/bin:/home/hadoop/app/hadoop/sbin:/usr/home/hadoop/bin:/bin:/usr/java/jdk1.8.0_45/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/home/hadoop/bin)
Usage: java org.apache.hive.cli.beeline.BeeLine 
   -u <database url>               the JDBC URL to connect to           ##连接jdbc url
   -r                              reconnect to last saved connect url (in conjunction with !save)     
   -n <username>                   the username to connect as       ##用户名
   -p <password>                   the password to connect as        ##密码
   -d <driver class>               the driver class to use                 ##驱动
   -i <init file>                  script file for initialization
   -e <query>                      query that should be executed
   -f <exec file>                  script file that should be executed
   -w (or) --password-file <password file>  the password file to read password from
   --hiveconf property=value       Use value for given property
   --hivevar name=value            hive variable name and value
                                   This is Hive specific settings in which variables
                                   can be set at session level and referenced in Hive
                                   commands or queries.
   --property-file=<property-file> the file to read connection properties (url, driver, user, password) from
   --color=[true/false]            control whether color is used for display
   --showHeader=[true/false]       show column names in query results
   --headerInterval=ROWS;          the interval between which heades are displayed
   --fastConnect=[true/false]      skip building table/column list for tab-completion
   --autoCommit=[true/false]       enable/disable automatic transaction commit
   --verbose=[true/false]          show verbose error messages and debug info
   --showWarnings=[true/false]     display connection warnings
   --showNestedErrs=[true/false]   display nested errors
   --numberFormat=[pattern]        format numbers using DecimalFormat pattern
   --force=[true/false]            continue running script even after errors
   --maxWidth=MAXWIDTH             the maximum width of the terminal
   --maxColumnWidth=MAXCOLWIDTH    the maximum width to use when displaying columns
   --silent=[true/false]           be more silent
   --autosave=[true/false]         automatically save preferences
   --outputformat=[table/vertical/csv2/tsv2/dsv/csv/tsv]  format mode for result display
   --incrementalBufferRows=NUMROWS the number of rows to buffer when printing rows on stdout,
                                   defaults to 1000; only applicable if --incremental=true
                                   and --outputformat=table
   --truncateTable=[true/false]    truncate table column when it exceeds length
   --delimiterForDSV=DELIMITER     specify the delimiter for delimiter-separated values output format (default: |)
   --isolation=LEVEL               set the transaction isolation level
   --nullemptystring=[true/false]  set to true to get historic behavior of printing null as empty string
   --maxHistoryRows=MAXHISTORYROWS The maximum number of rows to store beeline history.
   --convertBinaryArrayToString=[true/false]    display binary column data as string or as byte array 
   --help                          display this message
 
   Example:
    1. Connect using simple authentication to HiveServer2 on localhost:10000
    $ beeline -u jdbc:hive2://localhost:10000 username password

    2. Connect using simple authentication to HiveServer2 on hs.local:10000 using -n for username and -p for password
    $ beeline -n username -p password -u jdbc:hive2://hs2.local:10012

    3. Connect using Kerberos authentication with hive/localhost@mydomain.com as HiveServer2 principal
    $ beeline -u "jdbc:hive2://hs2.local:10013/default;principal=hive/localhost@mydomain.com

    4. Connect using SSL connection to HiveServer2 on localhost at 10000
    $ beeline jdbc:hive2://localhost:10000/default;ssl=true;sslTrustStore=/usr/local/truststore;trustStorePassword=mytruststorepassword

    5. Connect using LDAP authentication
    $ beeline -u jdbc:hive2://hs2.local:10013/default <ldap-username> <ldap-password>


#启动Hiveserver2
bin/hiveserver2

#启动beeline
bin/beeline 

#连接本地数据库
    [hadoop@hadoop001 bin]$ ./beeline -u jdbc:hive2://hadoop001:10000/d7_hive -n hadoop

修改hs2的默认端口:
添加到hive-site.xml中

<property>
  <name>hive.server2.thrift.port</name>
  <value>10000</value>
</property>

复杂数据类型

arrays: ARRAY<data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)

maps: MAP<primitive_type, data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)

structs: STRUCT<col_name : data_type [COMMENT col_comment], ...>

union: UNIONTYPE<data_type, data_type, ...> (Note: Only available starting with Hive 0.7.0.)

array

#创建表
create table hive_array(name string, work_locations array<string>)
row format delimited fields terminated by '\t'
COLLECTION ITEMS TERMINATED BY ',';
#导入数据
load data local inpath '/home/hadoop/data/hive_array.txt' overwaite into table hive_array
#查询
select  * from hive_array

如何查询
 work_locations[index] : index from zero
 
select name,work_locations[0] from hive_array;

select name,size(work_locations)from hive_array;

select * from hive_array where array_contains(work_locations,"beijing")

map

create table hive_map(id int,name string, members map<string,string>, age int)
row format delimited fields terminated by ','
COLLECTION ITEMS TERMINATED BY '#'
MAP KEYS TERMINATED BY ':';

load data local inpath '/home/hadoop/data/hive_map.txt' overwaite into table hive_map

select  * from hive_map


select id,name,members["father'] from hive_map

select id,name,members["father'] as father,members['sister']as sister from hive_map

 获取map中的key数组
select id,map_keys(members) from hive-map;
获取map中的value数组
select id,map_values(members) from hive-map;

select id,map_values(members),size (members) from hive-map;

father:xiaoming#mother:xiaohuang#brother:xiaoxu members['key']

Struct

create table hive_struct(ip string, userinfo struct<name:string, age:int>)
row format delimited fields terminated by '#'
COLLECTION ITEMS TERMINATED BY ':';

 load data local inpath '/home/hadoop/data/hive_struct.txt' overwaite into table hive_struct
    
select  * from hive_struct

userinfo.xxx

Partition 分区表

Hive：分区表  partition
bigdata：天、小时；hive里面是一张表，只是不同的分区在不同的目录里面

一级分区

create table order_partiton(
order_no string,
order_time string
)
PARTITIONED BY (event_month string)
row format delimited fields terminated by '\t';

load data local inpath '/home/hadoop/data/order_created.txt' into table order_partiton PARTITION (event_month='2014-05');	
select * from order_partiton;
	
在HDFS上的数据存储目录：tablename/partition_column=partition_value
order_partiton/event_month=2014-05	
	
真正的表的字段是不包含分区字段的，分区字段只是HDFS上的文件夹的名称
select * from order_partiton where event_month='2014-05'

在生产上，一般是数据经过清洗后存放在HDFS目录上，然后将目录的数据加载到分区表中
load data inpath '' into table order_partiton PARTITION (event_month='2014-06');

多级分区

create table order_mulit_partiton(
order_no string,
order_time string
)
PARTITIONED BY (event_month string, step string)
row format delimited fields terminated by '\t';

load data local inpath '/home/hadoop/data/order_created.txt' into table order_mulit_partiton PARTITION (event_month='2014-05', step='1');	

 select * from order_mulit_partiton

需求

将相同部门的人写到一个分区里面去

静态分区

CREATE TABLE emp_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double
)
PARTITIONED BY (deptno int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

insert into table emp_partition partition(deptno=30)
select empno,ename,job,mgr,hiredate,sal,comm from ruozedata_emp where deptno=10;

select * from emp_partition where daptno=10;

动态分区

create table emp_dynamic_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double
)
PARTITIONED BY (deptno int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

设置非严格模式;set hive.exec.dynamic.partition.mode=nonstrict

insert overwrite table emp_dynamic_partition partition(deptno)
select empno,ename,job,mgr,hiredate,sal,comm,deptno from ruozedata_emp;


select * from emp_dynamic_partition  where deptno=XXX；

如何高性能的刷新分区数据

可以通过命令  MSCK [REPAIR] TABLE table_name ; 刷新分区
语法：ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];