Starting HiveServer2
HiveServer2 (HS2) is Hive's server-side process. Its default port is 10000; to change it, set the parameter hive.server2.thrift.port.
Start the server: hiveserver2 &
Start the Beeline client (it is recommended to cd into $HIVE_HOME/bin first):
./beeline -u jdbc:hive2://hadoop01:10000/fei_hive -n hadoop
(fei_hive is the database, hadoop is the Linux user)
JOIN
- Inner Join
select e.empno, e.ename, e.deptno, d.dname from emp e join dept d on e.deptno = d.deptno;
+----------+----------+-----------+-------------+--+
| e.empno  | e.ename  | e.deptno  | d.dname     |
+----------+----------+-----------+-------------+--+
| 7369     | SMITH    | 20        | RESEARCH    |
| 7499     | ALLEN    | 30        | SALES       |
| 7521     | WARD     | 30        | SALES       |
| 7566     | JONES    | 20        | RESEARCH    |
| 7654     | MARTIN   | 30        | SALES       |
| 7698     | BLAKE    | 30        | SALES       |
| 7782     | CLARK    | 10        | ACCOUNTING  |
| 7788     | SCOTT    | 20        | RESEARCH    |
| 7839     | KING     | 10        | ACCOUNTING  |
| 7844     | TURNER   | 30        | SALES       |
| 7876     | ADAMS    | 20        | RESEARCH    |
| 7900     | JAMES    | 30        | SALES       |
| 7902     | FORD     | 20        | RESEARCH    |
| 7934     | MILLER   | 10        | ACCOUNTING  |
+----------+----------+-----------+-------------+--+
14 rows selected (14.97 seconds)
- Left Join: returns every row from the left table; rows without a match in the right table show NULL for the right table's columns
select e.empno, e.ename, e.deptno, d.dname from emp e left join dept d on e.deptno = d.deptno;
+----------+----------+-----------+-------------+--+
| e.empno  | e.ename  | e.deptno  | d.dname     |
+----------+----------+-----------+-------------+--+
| 7369     | SMITH    | 20        | RESEARCH    |
| 7499     | ALLEN    | 30        | SALES       |
| 7521     | WARD     | 30        | SALES       |
| 7566     | JONES    | 20        | RESEARCH    |
| 7654     | MARTIN   | 30        | SALES       |
| 7698     | BLAKE    | 30        | SALES       |
| 7782     | CLARK    | 10        | ACCOUNTING  |
| 7788     | SCOTT    | 20        | RESEARCH    |
| 7839     | KING     | 10        | ACCOUNTING  |
| 7844     | TURNER   | 30        | SALES       |
| 7876     | ADAMS    | 20        | RESEARCH    |
| 7900     | JAMES    | 30        | SALES       |
| 7902     | FORD     | 20        | RESEARCH    |
| 7934     | MILLER   | 10        | ACCOUNTING  |
| 8888     | HIVE     | NULL      | NULL        |
+----------+----------+-----------+-------------+--+
15 rows selected (12.381 seconds)
- Right Join: returns every row from the right table; rows without a match in the left table show NULL for the left table's columns
Note: if the ON join condition is omitted, the query produces a Cartesian product, so always include the join condition in real work.
select e.empno, e.ename, e.deptno, d.dname from emp e right join dept d on e.deptno = d.deptno;
+----------+----------+-----------+-------------+--+
| e.empno  | e.ename  | e.deptno  | d.dname     |
+----------+----------+-----------+-------------+--+
| 7782     | CLARK    | 10        | ACCOUNTING  |
| 7839     | KING     | 10        | ACCOUNTING  |
| 7934     | MILLER   | 10        | ACCOUNTING  |
| 7369     | SMITH    | 20        | RESEARCH    |
| 7566     | JONES    | 20        | RESEARCH    |
| 7788     | SCOTT    | 20        | RESEARCH    |
| 7876     | ADAMS    | 20        | RESEARCH    |
| 7902     | FORD     | 20        | RESEARCH    |
| 7499     | ALLEN    | 30        | SALES       |
| 7521     | WARD     | 30        | SALES       |
| 7654     | MARTIN   | 30        | SALES       |
| 7698     | BLAKE    | 30        | SALES       |
| 7844     | TURNER   | 30        | SALES       |
| 7900     | JAMES    | 30        | SALES       |
| NULL     | NULL     | NULL      | OPERATIONS  |
+----------+----------+-----------+-------------+--+
15 rows selected (12.967 seconds)
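For reference, a hedged sketch (not in the original post) of the Cartesian product the note above warns about, assuming the same emp (15 rows) and dept (4 rows) tables:

```sql
-- Hypothetical example: dropping the ON condition degenerates the join into a cross
-- product, returning 15 * 4 = 60 rows here. Strict mode (set hive.mapred.mode=strict;)
-- is supposed to reject such queries, another reason to always write the join condition.
select e.empno, e.ename, d.dname
from emp e join dept d;
```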
Hive数据类型中的复杂类型
- Array: array<data_type>; note that all elements must have the same type
create table hive_array(name string, work_locations array<string>)
row format delimited fields terminated by '\t'
COLLECTION ITEMS TERMINATED BY ',';

LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_array.txt' into table hive_array;

+------------------+----------------------------------------------+--+
| hive_array.name  | hive_array.work_locations                    |
+------------------+----------------------------------------------+--+
| jim              | ["beijing","shanghai","tianjin","hangzhou"]  |
| fei              | ["changchu","chengdu","wuhan","beijing"]     |
+------------------+----------------------------------------------+--+
2 rows selected (0.042 seconds)

select a.name,a.work_locations[1],a.work_locations[3] from hive_array a;
+---------+-----------+-----------+--+
| a.name  | _c1       | _c2       |
+---------+-----------+-----------+--+
| jim     | shanghai  | hangzhou  |
| fei     | chengdu   | beijing   |
+---------+-----------+-----------+--+
2 rows selected (0.059 seconds)

array_contains(column, 'value'):
select a.name,array_contains(a.work_locations,'hangzhou') from hive_array a;
+---------+--------+--+
| a.name  | _c1    |
+---------+--------+--+
| jim     | true   |
| fei     | false  |
+---------+--------+--+
2 rows selected (0.079 seconds)
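The file /home/hadoop/data/hive_array.txt loaded above is not shown in the post. Judging from the table definition and the query output, it presumably looks like this (a tab between the two columns, commas between the array items):

```
jim	beijing,shanghai,tianjin,hangzhou
fei	changchu,chengdu,wuhan,beijing
```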
- Map: MAP<primitive_type, data_type>, i.e. key/value pairs
create table hive_map(
  id int,
  name string,
  members map<string,string>,
  age int
) row format delimited fields terminated by ','
COLLECTION ITEMS TERMINATED BY '#'
MAP KEYS TERMINATED BY ':';

LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_map.txt' into table hive_map;

+--------------+----------------+----------------------------------------------------------------+---------------+--+
| hive_map.id  | hive_map.name  | hive_map.members                                               | hive_map.age  |
+--------------+----------------+----------------------------------------------------------------+---------------+--+
| 1            | zhangsan       | {"father":"xiaoming","mother":"xiaohuang","brother":"xiaoxu"}  | 28            |
| 2            | lisi           | {"father":"mayun","mother":"huangyi","brother":"guanyu"}       | 22            |
| 3            | wangwu         | {"father":"wangjianlin","mother":"ruhua","sister":"jingtian"}  | 29            |
| 4            | mayun          | {"father":"mayongzhen","mother":"angelababy"}                  | 26            |
+--------------+----------------+----------------------------------------------------------------+---------------+--+
4 rows selected (0.049 seconds)

select a.name,a.members["father"],a.members["brother"] from hive_map a;
+-----------+--------------+---------+--+
| a.name    | _c1          | _c2     |
+-----------+--------------+---------+--+
| zhangsan  | xiaoming     | xiaoxu  |
| lisi      | mayun        | guanyu  |
| wangwu    | wangjianlin  | NULL    |
| mayun     | mayongzhen   | NULL    |
+-----------+--------------+---------+--+
4 rows selected (0.04 seconds)

Who has a brother, and who is that brother?
select a.name,a.members["brother"] as brother from hive_map a where a.members["brother"] is not null;
+-----------+----------+--+
| a.name    | brother  |
+-----------+----------+--+
| zhangsan  | xiaoxu   |
| lisi      | guanyu   |
+-----------+----------+--+
2 rows selected (0.043 seconds)
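The input file /home/hadoop/data/hive_map.txt is likewise not shown. Based on the delimiters (fields ',', collection items '#', map keys ':') and the output above, it presumably looks like this:

```
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26
```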
- Struct: STRUCT<col_name : data_type [COMMENT col_comment], ...>; the fields inside a struct may have different types
create table hive_struct(
  ip string,
  info struct<name:string,age:int>
) row format delimited fields terminated by '#'
COLLECTION ITEMS TERMINATED BY ':';

LOAD DATA LOCAL INPATH '/home/hadoop/data/hive_struct.txt' into table hive_struct;

+-----------------+-------------------------------+--+
| hive_struct.ip  | hive_struct.info              |
+-----------------+-------------------------------+--+
| 192.168.1.1     | {"name":"zhangsan","age":40}  |
| 192.168.1.2     | {"name":"lisi","age":50}      |
| 192.168.1.3     | {"name":"wangwu","age":60}    |
| 192.168.1.4     | {"name":"zhaoliu","age":70}   |
+-----------------+-------------------------------+--+
4 rows selected (0.037 seconds)

select a.ip,a.info.name from hive_struct a where a.info.age >= 60;
+--------------+----------+--+
| a.ip         | name     |
+--------------+----------+--+
| 192.168.1.3  | wangwu   |
| 192.168.1.4  | zhaoliu  |
+--------------+----------+--+
2 rows selected (0.056 seconds)
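Again the input file /home/hadoop/data/hive_struct.txt is not shown; given the '#' field delimiter and ':' collection delimiter, it presumably looks like this:

```
192.168.1.1#zhangsan:40
192.168.1.2#lisi:50
192.168.1.3#wangwu:60
192.168.1.4#zhaoliu:70
```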
Click Logs: Finding the Ads Each User Clicked
- Commands
Ad table:
create table ad_list(
  ad_id string,
  url string,
  catalogs string
) row format delimited fields terminated by '\t';
LOAD DATA LOCAL INPATH '/home/hadoop/data/ad_list.txt' into table ad_list;

+----------------+------------------------+--------------------------------------+--+
| ad_list.ad_id  | ad_list.url            | ad_list.catalogs                     |
+----------------+------------------------+--------------------------------------+--+
| ad_101         | http://www.google.com  | catalog8|catalog1                    |
| ad_102         | http://www.sohu.com    | catalog6|catalog3                    |
| ad_103         | http://www.baidu.com   | catalog7                             |
| ad_104         | http://www.qq.com      | catalog5|catalog1|catalog4|catalog9  |
| ad_105         | http://sina.com        | NULL                                 |
+----------------+------------------------+--------------------------------------+--+

create table ad_list_2(
  ad_id string,
  url string,
  catalogs array<string>
) row format delimited fields terminated by '\t'
COLLECTION ITEMS TERMINATED BY '|';
LOAD DATA LOCAL INPATH '/home/hadoop/data/ad_list.txt' into table ad_list_2;

+------------------+------------------------+------------------------------------------------+--+
| ad_list_2.ad_id  | ad_list_2.url          | ad_list_2.catalogs                             |
+------------------+------------------------+------------------------------------------------+--+
| ad_101           | http://www.google.com  | ["catalog8","catalog1"]                        |
| ad_102           | http://www.sohu.com    | ["catalog6","catalog3"]                        |
| ad_103           | http://www.baidu.com   | ["catalog7"]                                   |
| ad_104           | http://www.qq.com      | ["catalog5","catalog1","catalog4","catalog9"]  |
| ad_105           | http://sina.com        | NULL                                           |
+------------------+------------------------+------------------------------------------------+--+

Click log table:
create table click_log(
  cookie_id string,
  ad_id string,
  time string
) row format delimited fields terminated by '\t';
LOAD DATA LOCAL INPATH '/home/hadoop/data/click_log.txt' into table click_log;

+----------------------+------------------+-----------------------------+--+
| click_log.cookie_id  | click_log.ad_id  | click_log.time              |
+----------------------+------------------+-----------------------------+--+
| 11                   | ad_101           | 2014-05-01 06:01:12.334+01  |
| 22                   | ad_102           | 2014-05-01 07:28:12.342+01  |
| 33                   | ad_103           | 2014-05-01 07:50:12.33+01   |
| 11                   | ad_104           | 2014-05-01 09:27:12.33+01   |
| 22                   | ad_103           | 2014-05-01 09:03:12.324+01  |
| 33                   | ad_102           | 2014-05-02 19:10:12.343+01  |
| 11                   | ad_101           | 2014-05-02 09:07:12.344+01  |
| 35                   | ad_105           | 2014-05-03 11:07:12.339+01  |
| 22                   | ad_104           | 2014-05-03 12:59:12.743+01  |
| 77                   | ad_103           | 2014-05-03 18:04:12.355+01  |
| 99                   | ad_102           | 2014-05-04 00:36:39.713+01  |
| 33                   | ad_101           | 2014-05-04 19:10:12.343+01  |
| 11                   | ad_101           | 2014-05-05 09:07:12.344+01  |
| 35                   | ad_102           | 2014-05-05 11:07:12.339+01  |
| 22                   | ad_103           | 2014-05-05 12:59:12.743+01  |
| 77                   | ad_104           | 2014-05-05 18:04:12.355+01  |
| 99                   | ad_105           | 2014-05-05 20:36:39.713+01  |
+----------------------+------------------+-----------------------------+--+

All ad_ids each cookie clicked:
select cookie_id, collect_list(ad_id) from click_log group by cookie_id;
+------------+----------------------------------------+--+
| cookie_id  | _c1                                    |
+------------+----------------------------------------+--+
| 11         | ["ad_101","ad_104","ad_101","ad_101"]  |
| 22         | ["ad_102","ad_103","ad_104","ad_103"]  |
| 33         | ["ad_103","ad_102","ad_101"]           |
| 35         | ["ad_105","ad_102"]                    |
| 77         | ["ad_103","ad_104"]                    |
| 99         | ["ad_102","ad_105"]                    |
+------------+----------------------------------------+--+

Deduplicated:
select cookie_id, collect_set(ad_id) from click_log group by cookie_id;
+------------+-------------------------------+--+
| cookie_id  | _c1                           |
+------------+-------------------------------+--+
| 11         | ["ad_101","ad_104"]           |
| 22         | ["ad_102","ad_103","ad_104"]  |
| 33         | ["ad_103","ad_102","ad_101"]  |
| 35         | ["ad_105","ad_102"]           |
| 77         | ["ad_103","ad_104"]           |
| 99         | ["ad_102","ad_105"]           |
+------------+-------------------------------+--+

Number of times each cookie clicked each ad:
select cookie_id,ad_id,count(1) amount from click_log group by cookie_id,ad_id;
+------------+---------+---------+--+
| cookie_id  | ad_id   | amount  |
+------------+---------+---------+--+
| 11         | ad_101  | 3       |
| 11         | ad_104  | 1       |
| 22         | ad_102  | 1       |
| 22         | ad_103  | 2       |
| 22         | ad_104  | 1       |
| 33         | ad_101  | 1       |
| 33         | ad_102  | 1       |
| 33         | ad_103  | 1       |
| 35         | ad_102  | 1       |
| 35         | ad_105  | 1       |
| 77         | ad_103  | 1       |
| 77         | ad_104  | 1       |
| 99         | ad_102  | 1       |
| 99         | ad_105  | 1       |
+------------+---------+---------+--+

Click counts per cookie and ad, together with the ad's catalogs:
select a.*,b.catalogs
from (select cookie_id,ad_id,count(1) amount from click_log group by cookie_id,ad_id) a
left join ad_list b on a.ad_id = b.ad_id;
+--------------+----------+-----------+--------------------------------------+--+
| a.cookie_id  | a.ad_id  | a.amount  | b.catalogs                           |
+--------------+----------+-----------+--------------------------------------+--+
| 11           | ad_101   | 3         | catalog8|catalog1                    |
| 11           | ad_104   | 1         | catalog5|catalog1|catalog4|catalog9  |
| 22           | ad_102   | 1         | catalog6|catalog3                    |
| 22           | ad_103   | 2         | catalog7                             |
| 22           | ad_104   | 1         | catalog5|catalog1|catalog4|catalog9  |
| 33           | ad_101   | 1         | catalog8|catalog1                    |
| 33           | ad_102   | 1         | catalog6|catalog3                    |
| 33           | ad_103   | 1         | catalog7                             |
| 35           | ad_102   | 1         | catalog6|catalog3                    |
| 35           | ad_105   | 1         | NULL                                 |
| 77           | ad_103   | 1         | catalog7                             |
| 77           | ad_104   | 1         | catalog5|catalog1|catalog4|catalog9  |
| 99           | ad_102   | 1         | catalog6|catalog3                    |
| 99           | ad_105   | 1         | NULL                                 |
+--------------+----------+-----------+--------------------------------------+--+
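The array-typed ad_list_2 table makes it easy to break the catalogs out one per row. As a hedged follow-up sketch (not in the original post), clicks per catalog could be counted like this:

```sql
-- Hypothetical query: total clicks per catalog, using the catalogs array in ad_list_2.
-- Note: explode() on a NULL array produces no rows, so clicks on ad_105 (NULL catalogs)
-- are dropped here; lateral view outer explode(...) would keep them.
select t.catalog, count(1) as clicks
from click_log c
join (
  select ad_id, catalog
  from ad_list_2 lateral view explode(catalogs) x as catalog
) t on c.ad_id = t.ad_id
group by t.catalog;
```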
Row/Column Transposition
- Rows to columns
create table feidata_concat_ws(
  name string,
  dept string,
  grade string
) row format delimited fields terminated by ',';

concat_ws.txt:
fei,dept01,A
jim,dept02,A
gouzi,dept01,B
jianren,dept01,A
renzha,dept02,A

LOAD DATA LOCAL INPATH '/home/hadoop/data/concat_ws.txt' into table feidata_concat_ws;

select t1.dept_grade as dept_grade, concat_ws("|", collect_set(t1.name)) as names
from (
  select name, concat(dept, ",", grade) as dept_grade from feidata_concat_ws
) t1
group by t1.dept_grade;

dept_grade   names
dept01,A     fei|jianren
dept01,B     gouzi
dept02,A     jim|renzha
- Columns to rows
create table feidata_stu(
  id int,
  name string,
  subjects array<string>
) row format delimited fields terminated by ','
collection items terminated by ':';

row_col.txt:
1,zhangsan,化学:物理:数学:语文
2,lisi,化学:数学:生物:生理:卫生
3,wangwu,化学:语文:英语:体育:生物

LOAD DATA LOCAL INPATH '/home/hadoop/data/row_col.txt' into table feidata_stu;

select id,name,subject from feidata_stu lateral view explode(subjects) tmp as subject;

id  name      subject
1   zhangsan  化学
1   zhangsan  物理
1   zhangsan  数学
1   zhangsan  语文
2   lisi      化学
2   lisi      数学
2   lisi      生物
2   lisi      生理
2   lisi      卫生
3   wangwu    化学
3   wangwu    语文
3   wangwu    英语
3   wangwu    体育
3   wangwu    生物
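As a hedged follow-up sketch (not in the original), the exploded rows can be aggregated directly, for example to count how many students take each subject:

```sql
-- Hypothetical aggregation over the lateral view: number of students per subject.
select subject, count(1) as num_students
from feidata_stu lateral view explode(subjects) tmp as subject
group by subject;
```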
Functions
- List all functions: show functions;
- View a function's usage: desc function <function_name> (e.g. desc function map_keys)
array_contains : returns a boolean; typically used in the WHERE clause
sort_array : returns the array sorted
size : returns a number (the element count)
map_keys : returns an array (the map's keys)
map_values : returns an array (the map's values)

For example, parsing the rating_json table with json_tuple and breaking the timestamp into date parts:

select userid, movie, rate, time,
       from_unixtime(cast(time as bigint)) as ts,
       year(from_unixtime(cast(time as bigint))) as year,
       month(from_unixtime(cast(time as bigint))) as month,
       day(from_unixtime(cast(time as bigint))) as day,
       hour(from_unixtime(cast(time as bigint))) as hour,
       minute(from_unixtime(cast(time as bigint))) as minute,
       second(from_unixtime(cast(time as bigint))) as second
from (
  select json_tuple(json, 'movie','rate','time','userid') as (movie, rate, time, userid)
  from rating_json
) t limit 10;
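A hedged sketch (not from the original post) showing the collection functions listed above, run against the hive_array and hive_map tables created earlier in this article:

```sql
-- Array functions on hive_array
select name,
       size(work_locations)                      as num_locations,    -- element count
       sort_array(work_locations)                as sorted_locations, -- array sorted ascending
       array_contains(work_locations, 'beijing') as visited_beijing   -- boolean
from hive_array;

-- Map functions on hive_map
select name,
       map_keys(members)   as relations,  -- array of the map's keys, e.g. ["father","mother",...]
       map_values(members) as relatives   -- array of the map's values
from hive_map;
```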
Sorting
- order by performs a global sort over all input, so it uses a single reducer; with large inputs this can take a long time.
- sort by is not a global sort; it sorts the data before it enters each reducer. If mapred.reduce.tasks > 1, sort by only guarantees that each reducer's output is sorted (i.e. each output file is sorted), not that the result is globally sorted. If mapred.reduce.tasks = 1, sort by gives the same result as order by. sort by is rarely used on its own; it is usually combined with distribute by.
- distribute by (column) routes rows to different reducers based on the specified column; the distribution algorithm is hash partitioning, and it is usually combined with sort by.
1) distribute by hashes rows into different output files; sort by sorts the rows within each file.
2) You must set mapreduce.job.reduces: the number of reducers you set is the number of hash-partitioned files, because each row's destination file is determined by the hash of the value modulo the number of reducers. If you do not set the reducer count, distribute by + sort by still ends up producing the same result as order by.
- cluster by (column): in addition to distribute by's partitioning, it also sorts by that column (like sort by); so when the partition column and the sort column are the same, cluster by = distribute by + sort by.
Examples
- order by, a global sort: select * from emp where empno > 7800 order by empno limit 3; (returns the first three employees, by empno, whose empno is greater than 7800). order by sorts all the data globally; a global sort uses only one reducer, so it is inefficient and slow.
- sort by: produces one sorted output file per reducer
1) Set mapreduce.job.reduces, i.e. the number of reduce tasks; if it is 1, the result is the same as order by.
set mapreduce.job.reduces;     (pressing Enter shows the current value of mapred.reduce.tasks)
set mapreduce.job.reduces=3;   (a temporary, session-level setting; once Hive is closed and restarted it is gone)
2) sort by sorts the data within each reducer:
insert overwrite local directory '/home/hadoop/data/emp_sort'
row format delimited fields terminated by '\t'
select empno,ename,deptno from emp sort by empno;
The result is 3 files; rows are distributed across them at random, and the rows inside each file are sorted by empno.
-rw-r--r-- 1 hadoop hadoop 98 Dec 27 11:40 000000_0
-rw-r--r-- 1 hadoop hadoop 84 Dec 27 11:40 000001_0
-rw-r--r-- 1 hadoop hadoop 27 Dec 27 11:40 000002_0
[hadoop@hadoop01 emp_sort]$ cat 000000_0
7654	MARTIN	30
7698	BLAKE	30
7782	CLARK	10
7788	SCOTT	20
7839	KING	10
7844	TURNER	30
8888	HIVE	\N
- distribute by: similar to the partitioner in MapReduce (rows are hash-partitioned into different files by the specified value); it is generally combined with sort by.
insert overwrite local directory '/home/hadoop/data/emp_distribute'
row format delimited fields terminated by '\t'
select empno,ename,deptno from emp distribute by deptno sort by empno;
-rw-r--r-- 1 hadoop hadoop 98 Dec 27 12:02 000000_0
-rw-r--r-- 1 hadoop hadoop 42 Dec 27 12:02 000001_0
-rw-r--r-- 1 hadoop hadoop 69 Dec 27 12:02 000002_0
[hadoop@hadoop01 emp_distribute]$ cat 000000_0
7499	ALLEN	30
7521	WARD	30
7654	MARTIN	30
7698	BLAKE	30
7844	TURNER	30
7900	JAMES	30
8888	HIVE	\N
The output here is three files, one per department, and within each file the rows are sorted by empno.
- cluster by: in addition to distribute by's partitioning, it also sorts by the same column; when the distribute column and the sort column are identical, that is cluster by, i.e. the partition column and the sort column are the same.
insert overwrite local directory '/home/hadoop/data/emp_cluster'
row format delimited fields terminated by '\t'
select empno,ename,deptno from emp cluster by empno;
is equivalent to
insert overwrite local directory '/home/hadoop/data/emp_cluster'
row format delimited fields terminated by '\t'
select empno,ename,deptno from emp distribute by empno sort by empno;
Note: cluster by can always be rewritten as distribute by + sort by, but the reverse is not always true.
WordCount in Hive
- WordCount
create table feidata_wc(
  sentence string
);

feidata_wc.txt:
jim,jim,fei
fei,jack
sean

LOAD DATA LOCAL INPATH '/home/hadoop/data/feidata_wc.txt' into table feidata_wc;

select word, count(1) cnt
from (
  select explode(split(sentence,',')) word from feidata_wc
) t
group by word;

word   cnt
fei    2
jack   1
jim    2
sean   1
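A small hedged variation (not in the original): the same query combined with the sorting discussed earlier, so the most frequent words come first.

```sql
-- Hypothetical variant: order the word counts, highest first.
select word, count(1) cnt
from (
  select explode(split(sentence,',')) word from feidata_wc
) t
group by word
order by cnt desc;
```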
This article takes a closer look at advanced Hive SQL features, including complex data types, the details of JOIN operations, row/column transposition techniques, and the use of functions and sorting. Through concrete examples, it shows how to put these features to work efficiently in big data processing.