–自定义函数
有如下json数据:rating.json
{“movie”:“1193”,“rate”:“5”,“timeStamp”:“978300760”,“uid”:“1”}
{“movie”:“661”,“rate”:“3”,“timeStamp”:“978302109”,“uid”:“1”
{“movie”:“914”,“rate”:“3”,“timeStamp”:“978301968”,“uid”:“1”}
{“movie”:“3408”,“rate”:“4”,“timeStamp”:“978300275”,“uid”:“1”}
–需要导入hive中进行分析
建表映射
create table t_ratingjson(json string);
load data local inpath ‘/homoe/had/rating.json’ into table t_ratingjson;
–想把上面的原始数据变成如下形式
1193,5,978300760,1
661,3,978302109,1
–思路:如果能够定义一个json解析函数,则很方便了
–hive 中如何定义自己的函数
1,先写一个java程序 ,实现你所想要的函数功能(传入一个json字符串和一个脚标,返回一个值)
package cn.edu360.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDF;
public class MyJsonParser extends UDF{
//重载父类中的一个方法 evaluate()
public String evaluate(String json,int index) {
String[] fields = json.split("\"");
return fields[4*index-1];
}
}
2,将java程序打成一个jar包,上传到hive所在的机器
3. 在hive命令中将jar包添加到classpath add jar /home/had/myjson.jar;
4,在hive命令中用命令创建的一个函数叫做myjson,关联你所写的这个java类 包名一定是全名
create temporary function myjson as ‘cn.edu360.hive.udf.MyJsonParser’;
5.把解析的表放到一张新的表里去
create table t_rate
as
select myjson(json,1) as movie,cast (myjson(json,2) as int) as rate,myjson(json,3) as ts
,myjson(json,4) as uid from t_ratingjson;
select * from t_rate;
±--------------±-------------±-----------±------------+
| t_rate.movie | t_rate.rate | t_rate.ts | t_rate.uid |
±--------------±-------------±-----------±------------+
| 1610 | 4 | 978245645 | 5 |
| 2058 | 1 | 978245740 | 5 |
| 3799 | 3 | 978243937 | 5 |
| 2997 | 5 | 978241556 | 5 |
| 47 | 3 | 978245334 | 5 |
| 2700 | 4 | 978243085 | 5 |
| 296 | 4 | 978244177 | 5 |
| 581 | 3 | 978244808 | 5 |
| 1617 | 3 | 978244025 | 5 |
| 728 | 4 | 978244759 | 5 |
| 299 | 3 | 978242934 | 5 |
| 3079 | 2 | 978246162 | 5 |
| 2560 | 4 | 978242977 | 5 |
| 1909 | 3 | 978246479 | 5 |
| 150 | 2 | 978245763 | 5 |
| 224 | 3 | 978245829 | 5 |
| 3728 | 2 | 978244568 | 5 |
| 229 | 3 | 978246528 | 5 |
| 6 | 2 | 978245916 | 5 |
| 3006 | 3 | 978245422 | 5 |
| 2858 | 4 | 978241390 | 5 |
| 1046 | 5 | 978244114 | 5 |
| 515 | 4 | 978245891 | 5 |
| 800 | 2 | 978244540 | 5 |
±--------------±-------------±-----------±------------+
json 解析函数:表生成函数 自动解析函数 非常的强大
create table t_rate_2
as
select json_tuple(json,‘movie’,‘rate’,‘timeStamp’,‘uid’) as(movie,rate,ts,uid)
from t_ratingjson;
±----------------±---------------±-------------±--------------+
| t_rate_2.movie | t_rate_2.rate | t_rate_2.ts | t_rate_2.uid |
±----------------±---------------±-------------±--------------+
| 1610 | 4 | 978245645 | 5 |
| 2058 | 1 | 978245740 | 5 |
| 3799 | 3 | 978243937 | 5 |
| 2997 | 5 | 978241556 | 5 |
| 47 | 3 | 978245334 | 5 |
| 2700 | 4 | 978243085 | 5 |
| 296 | 4 | 978244177 | 5 |
| 581 | 3 | 978244808 | 5 |
| 1617 | 3 | 978244025 | 5 |
| 728 | 4 | 978244759 | 5 |
| 299 | 3 | 978242934 | 5 |
| 3079 | 2 | 978246162 | 5 |
| 2560 | 4 | 978242977 | 5 |
| 1909 | 3 | 978246479 | 5 |
| 150 | 2 | 978245763 | 5 |
| 224 | 3 | 978245829 | 5 |
| 3728 | 2 | 978244568 | 5 |
| 229 | 3 | 978246528 | 5 |
| 6 | 2 | 978245916 | 5 |
| 3006 | 3 | 978245422 | 5 |
| 2858 | 4 | 978241390 | 5 |
| 1046 | 5 | 978244114 | 5 |
| 515 | 4 | 978245891 | 5 |
| 800 | 2 | 978244540 | 5 |
±----------------±---------------±-------------±--------------+