hive02

最新推荐文章于 2023-11-15 16:49:41 发布

原创最新推荐文章于 2023-11-15 16:49:41 发布 · 653 阅读

CC 4.0 BY-SA版权

本文介绍了Hive的数据操作，包括添加和修改列、设置本地运行模式、清空表数据、插入数据及多重插入。此外，还讲解了内连接、日期函数、Json解析、网址解析等，提供了提高数据处理效率的方法。

添加列
alter table 表名 add columns(address string,age int);
全部替换
alter table 表名 reqlace columns(表结构）
修改已存在的列定义
alter table 表名 change id uid string;
设置本地运行hive的mapreduce，不提交给yarn
hive>set hive.exec.mode.local.auto=true;
清空表数据，保留表结构
hive> truncate table test_4_st_200;
插入单条数据
insert into table 表名values(‘10’,‘xx’,‘beijing’,28);
9.2 多重插入
假如有一个需求：
从test_4中筛选出不同的数据，插入另外两张表中；
hive> create table test_4_st_200 like test_4;
hive> alter table test_4_st_200 add partition(condition=‘lt200’);
FAILED: ValidationFailureSemanticException Partition spec {condition=lt200} contains non-partition columns
如果添加分区只能是day
hive> alter table test_4_st_200 add partition(day=‘lt200’);

hive> insert into table test_4_st_200 partition(day=‘lt200’)
select ip,url,staylong from test_4 where staylong<200;
hive> select * from test_4_st_200;
我们将staylong小于200的数据添加到test_4_st_200 ，day=‘lt200’，这分区中
我们再将staylong大于200的数据添加到test_4_st_200 ，day=‘gt200’，这分区中，如下：

hive> insert into table test_4_st_200 partition(day=‘gt200’)
select ip,url,staylong from test_4 where staylong>200;
hive> select * from test_4_st_200;
但是以上实现方式有一个弊端，两次筛选job，要分别启动两次mr过程，要对同一份源表数据进行两次读取
如果使用多重插入语法，则可以避免上述弊端，提高效率：源表只要读取一次即可
hive> from test_4
insert into table test_4_st_200 partition(day=‘lt200’)
select ip,url,staylong where staylong<200
insert into table test_4_st_200 partition(day=‘gt200’)
select ip,url,staylong where staylong>200;

hive> select * from test_4_st_200;

内连接

在这里插入图片描述
笛卡尔积
把所有可能都输出出来
SELECT * from t_a a JOIN t_b b ;

在hive中显示字段名
hive> set hive.cli.print.header=true;
hive> set hive.resultset.use.unique.column.names=false;
日期函数
hive> select current_date from dual;
hive> select current_timestamp from dual;
hive> select unix_timestamp() from dual;
hive> select unix_timestamp(‘2019-05-07 13:01:03’) from dual;
hive> select unix_timestamp(‘20190507 13:01:03’,‘yyyyMMdd HH:mm:ss’) from dual;
hive> select from_unixtime(1557205263,‘yyyy-MM-dd HH:mm:ss’) from dual;
获取日期、时间
hive> select year(‘2011-12-08 10:03:01’) from dual;
hive> select year(‘2012-12-08’) from dual;
select month(‘2011-12-08 10:03:01’) from dual;
12
select month(‘2011-08-08’) from dual;
8
select day(‘2011-12-08 10:03:01’) from dual;
8
select day(‘2011-12-24’) from dual;
24
select hour(‘2011-12-08 10:03:01’) from dual;
10
select minute(‘2011-12-08 10:03:01’) from dual;
3
select second(‘2011-12-08 10:03:01’) from dual;
1

日期增减

select date_add(‘2012-12-08’,10) from dual;
2012-12-18

date_sub (string startdate, int days) : string
例：
select date_sub(‘2012-12-08’,10) from dual;
2012-11-28

Json函数解析

电影topn
将数据rating.json上传到hdp03的/home下
在hive中先创建一张不表，将一行的json看做一个字段
hive> create table t_rate_json(line string) row format delimited;
导入数据
hive> load data local inpath ‘/home/rating.json’ into table t_rate_json;
创建一张表，存储解析后的数据
hive> create table t_rate(movie string,rate int,ts string,uid string) row format delimited fields terminated by ‘\001’;
解析json函数使用get_json_object函数
测试：
hive> select get_json_object(line," $movie") from t_rate_json limit 2; hive> insert into table t_rate select get_json_object(line,'$ .movie’),
get_json_object(line,’ $rate'), get_json_object(line,'$ .timeStamp’),
get_json_object(line,’$.uid’)
from t_rate_json;
hive> select * from t_rate limit 10;
统计评分大于3的所有评分记录
hive> select * from t_rate where rate > 3;
统计每个人评分的总分数
hive> select uid,sum(rate) from t_rate group by uid;
统计每个人评分的总分数倒序排
hive> select uid,sum(rate) rate_sum from t_rate group by uid order by rate_sum desc;
统计每个人评分的总分数倒序排，前10个；
hive> select uid,sum(rate) rate_sum from t_rate group by uid order by rate_sum desc limit 10;
另外一种json解析的方法：
测试：
hive> select
json_tuple(line,“movie”,“rate”,“timeStamp”,“uid”)
as(movie,rate,ts,uid)
from t_rate_json
limit 10;
hive> create table t_rate_a
as
select uid,movie,rate,year(from_unixtime(cast(ts as bigint))) as year,month(from_unixtime(cast(ts as bigint))) as month,day(from_unixtime(cast(ts as bigint))) as day,hour(from_unixtime(cast(ts as bigint))) as hour,
minute(from_unixtime(cast(ts as bigint))) as minute,from_unixtime(cast(ts as bigint)) as ts
from
(select
json_tuple(line,‘movie’,‘rate’,‘timeStamp’,‘uid’) as(movie,rate,ts,uid)
from t_rate_json) tmp;
分组topn
hive> select *,row_number() over(partition by uid order by rate desc) as rank from t_rate;
hive> select uid,movie,rate,ts
from
(select uid,movie,rate,ts,row_number() over(partition by uid order by rate desc) as rank from t_rate) tmp
where rank<=3;

网址解析

*例如有网址：http://www.baidu.com/find?cookieid=
解析成：www.baidu.com /find cookieid 4234234234
测试
hive> select parse_url_tuple(“http://www.baidu.com/find?cookieid=4234234234”,‘HOST’,‘PATH’,‘QUERY’,‘QUERY:cookieid’)
from dual;
explode 和 lateral view
vi sutdent.txt
1,zhangsan,数学:语文:英语:生物
2,lisi,数学:语文
3,wangwu,化学:计算机:java
hive> create table t_xuanxiu(uid string,name string,kc array)
row format delimited
fields terminated by ‘,’
collection items terminated by ‘:’;
加载数据：
hive> load data local inpath “/home/student.txt” into table t_xuanxiu;
hive> select uid,name,kc[0] from t_xuanxiu;
lateral view 表生成函数
hive> select uid,name,tmp.course from t_xuanxiu
lateral view explode(kc) tmp as course;
根据单词的数量倒序排序
hive> select a.word,count(1) sum
from
(select tmp. from t_juzi lateral view explode(split(line,’ ')) tmp as word) a
group by a.word
order by sum desc;