一、SELECT
Join
inner join
left join | left outer join
right join | right outer join
full outer join
其中 join 与 inner join 的结果是一样的
** MySQL中演示 **
例如有下边两张表

1、内连接
** 两表关联,保留两表中交集的记录 **


# Put the join predicate in ON instead of filtering a bare JOIN with
# WHERE (same result in MySQL, but ON states the join intent and is
# consistent with the left/right join examples below).
SELECT a.id,a.sum,b.id,b.sum
from t_a a join t_b b
on a.id = b.id;
# 简写 (shorthand: select all columns from both tables)
SELECT a.*,b.*
from t_a a join t_b b
on a.id = b.id;

2、左连接
** 左表全部保留,左表中关联不上的记录,其右表字段用null表示 **

# Left join: every t_a row is kept; t_b columns are NULL when unmatched.
SELECT a.*, b.*
FROM t_a AS a
LEFT JOIN t_b AS b
    ON a.id = b.id;

3、右连接
** 右表全部保留,右表中关联不上的记录,其左表字段用null表示 **

# Right join: every t_b row is kept; t_a columns are NULL when unmatched.
SELECT a.*, b.*
FROM t_a AS a
RIGHT JOIN t_b AS b
    ON a.id = b.id;

left outer join 与left join 结果一样
right outer join 与 right join结果一样
4、笛卡尔积
两表关联,把左表的每一行与右表的每一行两两组合,以笛卡尔积的形式表达出来
# Explicit CROSS JOIN: identical to a bare JOIN with no ON clause in
# MySQL, but makes the cartesian-product intent unmistakable.
SELECT * from t_a a cross join t_b b;

5、左表独有
两表关联,查询左表独有的数据

# Rows unique to t_a (left anti join): keep the left join, then keep
# only rows where the right-side key failed to match.
SELECT a.*, b.*
FROM t_a AS a
LEFT JOIN t_b AS b
    ON a.id = b.id
WHERE b.id IS NULL;

6、右表独有
两表关联,查询右表独有的数据

# Rows unique to t_b (right anti join): keep the right join, then keep
# only rows where the left-side key failed to match.
SELECT a.*, b.*
FROM t_a AS a
RIGHT JOIN t_b AS b
    ON a.id = b.id
WHERE a.id IS NULL;

7、全连接(两表并集)
两表关联,查询它们的所有记录

#oracle里面有full join,但是在mysql中没有full join,我们可以使用union来达到目的
# MySQL has no FULL JOIN; emulate it with LEFT JOIN + RIGHT JOIN.
# Use UNION ALL with an anti-join filter on the second branch instead of
# plain UNION: UNION's implicit DISTINCT would also drop legitimate
# duplicate rows from the tables, and it pays an extra dedup/sort cost.
# The WHERE a.id IS NULL keeps only right-only rows, so no inner-match
# row is emitted twice.
SELECT a.*,b.* from t_a a left join t_b b on a.id=b.id
union all
SELECT a.*,b.* from t_a a right join t_b b on a.id=b.id
where a.id is null;

在 Hive 里有 full outer join,MySQL 中没有

8、并集去交集

# Symmetric difference: rows unique to either table.
# The two branches are disjoint by construction (left-only rows have
# b.id NULL, right-only rows have a.id NULL), so UNION ALL is correct
# and skips UNION's unnecessary dedup pass.
SELECT a.*,b.* from t_a a LEFT JOIN t_b b ON a.id=b.id where b.id is NULL
UNION ALL
SELECT a.*,b.* from t_a a RIGHT JOIN t_b b on a.id=b.id WHERE a.id IS NULL;

补充:
Hive里有一个特殊的
left semi join 可以提高exist | in 这种查询需求的效率
本查询中,无法取到右表的数据
select a.* from t_a a left semi join t_b b on a.id=b.id;

select a.*,b.* from t_a a left semi join t_b b on a.id=b.id; 会报错
所以叫左半链接,等同于
select a.* from t_a a join t_b b on a.id=b.id;

这样效率会更高,直接 join 会产生笛卡尔积,再过滤满足条件的数据,而 left semi join 直接过滤出需要的数据
老版本中,不支持非等值的join
在新版中:1.2.0后,都支持非等值join,不过写法应该如下:
select a.*,b.* from t_a a,t_b b where a.id>b.id;
不支持的语法: select a.*,b.* from t_a a join t_b b on a.id>b.id;
二、小技巧
#在hive中执行linux命令
hive> !ls /root;
aaa
anaconda-ks.cfg
haha.txt
hdp-blocks
hdp-meta
hdptmp
install.log
install.log.syslog
#在hive中显示字段名
hive> set hive.cli.print.header=true;
hive> set hive.resultset.use.unique.column.names=false;
三、函数
#建表(以备后续测试函数)
hive> create table dual(id string);
hive> insert into table dual values(1);
#测试函数substr怎么使用
hive> select substr("abcd",0,2) from dual;
OK
_c0
ab
1、日期函数
1)时间戳:
从格林威治时间1970年01月01日00时00分00秒(北京时间1970年01月01日08时00分00秒)起至现在的总秒数(unix_timestamp 返回的是秒,不是毫秒)
#获取日期
hive> select current_date from dual;
OK
_c0
2019-08-29
#获取时间
hive> select current_timestamp from dual;
OK
_c0
2019-08-29 10:56:44.692
#获取当前时间戳
hive> select unix_timestamp() from dual;
OK
_c0
1567047462
#获取设定时间时间戳
hive> select unix_timestamp('2019-05-07 13:01:03') from dual;
OK
_c0
1557205263
hive> select unix_timestamp('20190507 13:01:03','yyyyMMdd HH:mm:ss') from dual;
OK
_c0
1557205263
#将时间戳转换为时间
hive> select from_unixtime(1557205263,'yyyy-MM-dd HH:mm:ss') from dual;
OK
_c0
2019-05-07 13:01:03
2)获取时间
#获取年份
hive> select year ('2011-12-08 10:03:01')from dual;
OK
_c0
2011
hive> select year ('2011-12-08')from dual;
OK
_c0
2011
#获取月份
hive> select month ('2011-12-08 10:03:01')from dual;
OK
_c0
12
hive> select month ('2011-12-08')from dual;
OK
_c0
12
#获取日期
hive> select day ('2011-12-08 10:03:01')from dual;
OK
_c0
8
hive> select day ('2011-12-08')from dual;
OK
_c0
8
#获取小时
hive> select hour ('2011-12-08 10:03:01')from dual;
OK
_c0
10
#获取分钟
hive> select minute ('2011-12-08 10:03:01')from dual;
OK
_c0
3
#获取秒数
hive> select second ('2011-12-08 10:03:01')from dual;
OK
_c0
1
3)日期增减
#获取十天后的日期
hive> select date_add('2012-12-08',10)from dual;
OK
_c0
2012-12-18
#获取十天前的日期
hive> select date_sub('2012-12-08',10)from dual;
OK
_c0
2012-11-28
2、Json函数(电影实例)
#本地运行hive
hive> set hive.exec.mode.local.auto=true;(默认为false)

1)准备
#将数据rating.json上传到hdp03的/home下
[root@node01 home]# rz
#在hive中先创建一张表,将一行的json看做一个字段
hive> create table t_rate_json(line string) row format delimited;
#导入数据
hive> load data local inpath "/home/rating.json" into table t_rate_json;
#创建一张表,存储解析后的数据
hive> create table t_rate(movie string,rate int,ts string,uid string)
> row format delimited fields terminated by '\001';
#解析json函数使用get_json_object函数
#测试(解析出电影名,并取前两行)
hive> select get_json_object(line,"$.movie") from t_rate_json limit 2;
OK
_c0
1193
661
2)解析
#解析并存入准备的表中
hive> insert into table t_rate
> select get_json_object(line,'$.movie'),
> get_json_object(line,'$.rate'),
> get_json_object(line,'$.timeStamp'),
> get_json_object(line,'$.uid')
> from t_rate_json;
#查看解析后文件的前十行(文件过大)
hive> select * from t_rate limit 10;
OK
1193 5 978300760 1
661 3 978302109 1
914 3 978301968 1
3408 4 978300275 1
2355 5 978824291 1
1197 3 978302268 1
1287 5 978302039 1
2804 5 978300719 1
594 4 978302268 1
919 4 978301368 1
3)具体操作
#统计评分大于3的所有记录
hive> select * from t_rate where rate > 3;

#统计每个人的评分总数
hive> select uid,sum(rate)from t_rate group by uid;

#统计每个人的评分总数,倒序排列
hive> select uid,sum(rate)rate_sum from t_rate group by uid order by rate_sum desc;

#统计每个人的评分总数,取前十(注意:此处未加 desc,实际按升序取评分总数最低的前十)
hive> select uid,sum(rate)rate_sum from t_rate group by uid order by rate_sum limit 10;
OK
4636 50
4365 51
4056 51
4349 53
5828 54
3642 54
4486 54
4547 56
3488 59
5159 59
4)另一种解析json的方法(tuple)
①测试
hive> select json_tuple(line,"movie","rate","timeStamp","uid")
> as(movie,rate,ts,uid)
> from t_rate_json
> limit 10;
OK
1193 5 978300760 1
661 3 978302109 1
914 3 978301968 1
3408 4 978300275 1
2355 5 978824291 1
1197 3 978302268 1
1287 5 978302039 1
2804 5 978300719 1
594 4 978302268 1
919 4 978301368 1
②解析
hive> create table t_rate_a
> as
> select uid,movie,rate,
> year(from_unixtime(cast(ts as bigint)))as year,
> month(from_unixtime(cast(ts as bigint)))as month,
> day(from_unixtime(cast(ts as bigint)))as day,
> hour(from_unixtime(cast(ts as bigint)))as hour,
> minute(from_unixtime(cast(ts as bigint)))as minute,
> from_unixtime(cast(ts as bigint))as ts
> from
> (select
> json_tuple(line,'movie','rate','timeStamp','uid')as(movie,rate,ts,uid)
> from t_rate_json)tmp;
hive> select * from t_rate_a limit 10;
OK
1 1193 5 2001 1 1 6 12 2001-01-01 06:12:40
1 661 3 2001 1 1 6 35 2001-01-01 06:35:09
1 914 3 2001 1 1 6 32 2001-01-01 06:32:48
1 3408 4 2001 1 1 6 4 2001-01-01 06:04:35
1 2355 5 2001 1 7 7 38 2001-01-07 07:38:11
1 1197 3 2001 1 1 6 37 2001-01-01 06:37:48
1 1287 5 2001 1 1 6 33 2001-01-01 06:33:59
1 2804 5 2001 1 1 6 11 2001-01-01 06:11:59
1 594 4 2001 1 1 6 37 2001-01-01 06:37:48
1 919 4 2001 1 1 6 22 2001-01-01 06:22:48
③分组topn
hive> select *,row_number()over(partition by uid order by rate desc)as rank from t_rate;

hive> select uid,movie,rate,ts
> from
> (select uid,movie,rate,ts,row_number() over(partition by uid order by rate desc) as rank from t_rate) tmp
> where rank<=3;

3、网址解析
例如网址:http://www.baidu.com/find?cookieid=4234234234
解析成:www.baidu.com /find cookieid 4234234234
hive> select parse_url_tuple("http://www.baidu.com/find?cookieid=4234234234",'HOST','PATH','QUERY','QUERY:cookieid')
> from dual;
OK
www.baidu.com /find cookieid=4234234234 4234234234
4、explode 和 lateral view
1)准备
#创建文本
[root@node01 home]# vi student.txt
1,zhangsan,数学:语文:英语:生物
2,lisi,数学:语文
3,wangwu,化学:计算机:java
#创建表
hive> create table t_xuanxiu(uid string,name string,kc array<string>)
> row format delimited
> fields terminated by','
> collection items terminated by':';
#加载数据
hive> load data local inpath "/home/student.txt"into table t_xuanxiu;
#查看
hive> select * from t_xuanxiu;
OK
1 zhangsan ["数学","语文","英语","生物"]
2 lisi ["数学","语文"]
3 wangwu ["化学","计算机","java"]
hive> select uid,name,kc[0] from t_xuanxiu;
OK
1 zhangsan 数学
2 lisi 数学
3 wangwu 化学
2)测试
希望得到:
1,zhangsan,数学
1,zhangsan,语文
1,zhangsan,英语
1,zhangsan,生物
2,lisi,数学
2,lisi,语文
…
#将数组变成一个列
hive> select explode(kc) from t_xuanxiu where uid=1;
OK
数学
语文
英语
生物
引申为:select uid,name,explode(kc) from t_xuanxiu where uid=1;
但是会报错
我们可以用lateral view(表生成函数)
lateral view 为侧视图,意义是为了配合 UDTF 来使用,把某一行数据拆分成多行数据。不加 lateral view 的 UDTF 只能提取单个字段拆分,并不能塞回原来的数据表中;加上 lateral view 就可以将拆分后的单个字段数据与原始表数据关联上。
在使用lateral view的时候需要指定视图别名和生成的新列别名
代码如下:
#lateral view 将 explode(kc) 看成一个表是 tmp 就一个字段as course;
hive> select uid,name,tmp.course from t_xuanxiu
> lateral view explode(kc)tmp as course;
OK
1 zhangsan 数学
1 zhangsan 语文
1 zhangsan 英语
1 zhangsan 生物
2 lisi 数学
2 lisi 语文
3 wangwu 化学
3 wangwu 计算机
3 wangwu java
3)wordcount示例
①准备
[root@node01 home]# vi words.txt
a b c d e f g
a b c
e f g a
b c d b
hive> create table t_juzi(line string) row format delimited;
hive> load data local inpath"/home/words.txt"into table t_juzi;
hive> select * from t_juzi;
OK
a b c d e f g
a b c
e f g a
b c d b
hive> select explode(split(line,' '))from t_juzi;
OK
a
b
c
d
e
f
g
a
b
c
e
f
g
a
b
c
d
b
#将打散的结果看成一个表
hive> select tmp.* from t_juzi lateral view explode(split(line,' ')) tmp as word;
OK
a
b
c
d
e
f
g
a
b
c
e
f
g
a
b
c
d
b
#计数
hive> select a.word,count(1) sum
> from
> (select tmp.* from t_juzi lateral view explode(split(line,' ')) tmp as word) a
> group by a.word
> Order by sum desc;
OK
b 4
c 3
a 3
g 2
f 2
e 2
d 2
#倒序排序
hive> select a.word,count(1) sum
> from
> (select tmp.* from t_juzi lateral view explode(split(line,' ')) tmp as word) a
> group by a.word
> order by sum desc;
OK
b 4
c 3
a 3
g 2
f 2
e 2
d 2
5、rownumber() 和 over()函数
需求:求每个人前两高的分数
1)准备
[root@node01 home]# vi score.txt
zhangsan,1,90,2
zhangsan,2,95,1
zhangsan,3,68,3
lisi,1,88,3
lisi,2,95,2
lisi,3,98,1
hive> create table t_score(name string,kcid string,score int)
> row format delimited
> fields terminated by',';
hive> load data local inpath'/home/score.txt'into table t_score;
2)测试
hive> select *,row_number() over(partition by name order by score desc) rank from t_score;
OK
lisi 3 98 1
lisi 2 95 2
lisi 1 88 3
zhangsan 2 95 1
zhangsan 1 90 2
zhangsan 3 68 3
hive> select name,kcid,score
> from
> (select *,row_number() over(partition by name order by score desc)as rank from t_score)tmp
> where rank<3;
OK
lisi 3 98
lisi 2 95
zhangsan 2 95
zhangsan 1 90
#每个用户评分最高的10部电影(rank<11)
hive> create table t_rate_topn_uid
> as
> select uid,movie,rate,ts
> from
> (select *,row_number() over(partition by uid order by rate desc) as rank from t_rate) tmp
> where rank<11;
hive> select * from t_rate_topn_uid where uid=1;
OK
1 1035 5 978301753
1 1 5 978824268
1 2028 5 978301619
1 3105 5 978301713
1 527 5 978824195
1 1961 5 978301590
1 1029 5 978302205
1 1022 5 978300055
1 595 5 978824268
1 48 5 978824351
6、自定义函数
1)准备
[root@node01 home]# vi user.txt
1,zhangsan:20-1999063017:30:00-beijing
2,lisi:30-1989063017:30:00-shanghai
3,wangwu:22-1997063017:30:00-neimeng
hive> create table user_info(info string);
hive> load data local inpath'/home/user.txt'into table user_info;
需求: 利用上表生成表t_user(uid,name,age,birthday,address)
思路: 可以自定义一个函数parse_user_info,能传入一行数据,返回切分好的字段
2)步骤
①编写自定义函数
写一个java类实现函数所需要的功能
public class UserInfoParser extends UDF{
	/**
	 * Hive UDF: parses one raw user_info line of the form
	 *   "uid,name:age-yyyyMMddHH:mm:ss-address"
	 * into six fields and returns the field selected by {@code index}:
	 *   0 = uid, 1 = name, 2 = age,
	 *   3 = birthday date (yyyyMMdd), 4 = birthday time (HHmmss), 5 = address.
	 *
	 * @param line  one raw record from table user_info
	 * @param index which parsed field to return (0..5)
	 * @return the requested field as a string
	 */
	public String evaluate(String line, int index) {
		// Normalize all three delimiters (',' ':' '-') to \001 in a single
		// regex pass instead of three chained replaceAll calls, then split
		// once. Note the ':' inside the time part is replaced too, which is
		// why the time is reassembled from parts[3..5] below.
		String[] parts = line.replaceAll("[,:-]", "\001").split("\001");
		// Build the six logical fields directly, instead of joining them into
		// a tab-separated string and re-splitting it on every call (the
		// original StringBuffer round-trip did full-record work per field and
		// would break if any field ever contained a tab).
		String[] fields = {
				parts[0],                                        // uid
				parts[1],                                        // name
				parts[2],                                        // age
				parts[3].substring(0, 8),                        // yyyyMMdd
				parts[3].substring(8, 10) + parts[4] + parts[5], // HHmmss
				parts[6]                                         // address
		};
		return fields[index];
	}
	// Quick manual check: prints "20" (the age field, index 2).
	public static void main(String[] args) {
		UserInfoParser parser = new UserInfoParser();
		String evaluate = parser.evaluate("1,zhangsan:20-1999063017:30:00-beijing",2);
		System.out.println(evaluate);
	}
}
②将java类打成jar包,上传到hive所在机器上
③在hive提示符下添加jar包
hive> add jar /home/hiveudf.jar;
④运行
hive> select
>parse_user_info(info,0) as uid,
>parse_user_info(info,1) as uname,
>parse_user_info(info,2) as age,
>parse_user_info(info,3) as birthday_date,
>parse_user_info(info,4) as birthday_time,
>parse_user_info(info,5) as address
>from user_info;

567

被折叠的 条评论
为什么被折叠?



