Custom UDF functions in Hive
A UDF is a one-in, one-out function: evaluate() receives one value (here, one raw log line) and returns a single value.
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;

public class BaseFieldUDF extends UDF {
    // Sample input line: server timestamp, a '|' separator, then the JSON body:
    // 1549728171111|
    // {"cm":{"ln":"-40.8","sv":"V2.0.8","os":"8.0.7","g":"3XR6889M@gmail.com","mid":"1","nw":"3G","l":"es","vc":"14","hw":"1080*1920","ar":"MX","uid":"1","t":"1549663498641","la":"-3.2","md":"sumsung-10","vn":"1.3.4","ba":"Sumsung","sr":"P"},
    // "ap":"gmall",
    // "et":[
    //   {"ett":"1549660448248","en":"display","kv":{"goodsid":"0","action":"1","extend1":"2","place":"3","category":"65"}},
    //   {"ett":"1549706874270","en":"newsdetail","kv":{"entry":"1","goodsid":"1","news_staytime":"8","loading_time":"0","action":"3","showtype":"3","category":"3","type1":""}},
    //   {"ett":"1549721920683","en":"loading","kv":{"extend2":"","loading_time":"16","action":"3","extend1":"","type":"3","type1":"325","loading_way":"1"}},
    //   {"ett":"1549646667671","en":"active_background","kv":{"active_source":"1"}},
    public String evaluate(String line, String jsonkeysString) {
        // jsonkeysString: 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'
        String[] jsonkeys = jsonkeysString.split(",");
        String[] logContents = line.split("\\|");
        // Discard malformed lines: we expect exactly "timestamp|json"
        if (logContents.length != 2 || StringUtils.isBlank(logContents[1])) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        try {
            // Parse the JSON body
            JSONObject jsonObject = new JSONObject(logContents[1]);
            // Pull out the common-field object "cm"
            JSONObject cm = jsonObject.getJSONObject("cm");
            for (int i = 0; i < jsonkeys.length; i++) {
                String jsonkey = jsonkeys[i];
                if (cm.has(jsonkey)) {
                    // Append the value for this key, e.g. "ln":"-40.8" -> -40.8
                    sb.append(cm.getString(jsonkey)).append("\t");
                } else {
                    // Keep the column count stable when a key is missing
                    sb.append("\t");
                }
            }
            // Append the raw "et" event array, then the server timestamp
            // (getJSONArray(...).toString() is safe across org.json versions,
            // whereas getString("et") may throw because "et" is not a string)
            sb.append(jsonObject.getJSONArray("et").toString()).append("\t");
            sb.append(logContents[0]);
        } catch (JSONException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }
}
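Once the function is registered as base_analizer (step 1 below), a query typically splits the tab-separated result back into columns. A minimal sketch, assuming a hypothetical staging table ods_event_log with a single string column line; with the 17-key list below, indexes 0-16 are the cm fields, index 17 is the raw "et" array, and index 18 is the server timestamp:

    -- index 0 is mid because 'mid' comes first in the key list
    select split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'), '\t')[0]  as mid_id,
           split(base_analizer(line, 'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'), '\t')[17] as event_array
    from ods_event_log
    limit 10;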
Custom UDTF functions in Hive
A UDTF turns one input row into any number of output rows: process(Object[] args) is called once per input row, and each call to forward() emits one output row.
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

public class EventJsonUDTF extends GenericUDTF {

    // This overload is deprecated in newer Hive releases but is the one
    // to implement on Hive 1.2.1, which this walkthrough uses
    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        // Declare the names and types of the output columns
        List<String> fieldNames = new ArrayList<>();
        List<ObjectInspector> fieldTypes = new ArrayList<>();
        fieldNames.add("event_name");
        fieldTypes.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("event_json");
        fieldTypes.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldTypes);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        String input = args[0].toString();
        if (StringUtils.isBlank(input)) {
            return;
        }
        try {
            // Parse the "et" JSON array, e.g.
            // [{"ett":"1549660448248","en":"display","kv":{"goodsid":"0","action":"1","extend1":"2","place":"3","category":"65"}}]
            JSONArray jsonArray = new JSONArray(input);
            for (int i = 0; i < jsonArray.length(); i++) {
                JSONObject event = jsonArray.getJSONObject(i);
                String[] result = new String[2];
                // Column 1: the event name "en"
                result[0] = event.getString("en");
                // Column 2: the whole event object as a JSON string
                result[1] = event.toString();
                // Emit one output row per event in the array
                forward(result);
            }
        } catch (JSONException e) {
            e.printStackTrace();
        }
    }

    @Override
    public void close() throws HiveException {
        // nothing to clean up
    }
}
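A UDTF is normally applied through LATERAL VIEW, which joins each input row to the rows the function forwards. A sketch, assuming the class has been registered under the hypothetical name flat_analyzer, and that dwd_base_event_log, mid_id, and event_array are placeholder table and column names, where event_array holds the "et" array extracted by the UDF above:

    -- one output row per event in each row's "et" array
    select mid_id, event_name, event_json
    from dwd_base_event_log
    lateral view flat_analyzer(event_array) tmp_flat as event_name, event_json;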
1. Create a temporary function in Hive (it only lives for the current session)
- First add the jar: add jar <path to the jar>
hive (gmall)> add jar /opt/module/hive-1.2.1/hivefunction-1.0-SNAPSHOT.jar;
Added [/opt/module/hive-1.2.1/hivefunction-1.0-SNAPSHOT.jar] to class path
Added resources: [/opt/module/hive-1.2.1/hivefunction-1.0-SNAPSHOT.jar]
- Then create the temporary function:
hive (gmall)> create temporary function base_analizer as 'com.udf.BaseFieldUDF';
OK
Time taken: 0.021 seconds
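A quick smoke test on a string literal, with no table required; with the key list 'mid,uid' the output is the two cm values, the raw "et" array, and the timestamp, tab-separated:

    select base_analizer('1549728171111|{"cm":{"mid":"1","uid":"1"},"ap":"gmall","et":[]}', 'mid,uid');
    -- expected (tab-separated): 1  1  []  1549728171111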
2. Create a permanent function in Hive
- Register the jar locations in hive-site.xml:
<property>
<name>hive.aux.jars.path</name>
<value>file:///opt/module/hive/lib/json-serde-1.3.8-jar-with-dependencies.jar,file:///opt/module/hive/lib/app_logs_hive.jar</value>
</property>
- Then create the function in the Hive client:
create function getdaybegin AS 'com.hive.DayBeginUDF';
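Unlike a temporary function, a permanent function is recorded in the metastore under whichever database is current when it is created, so it survives new sessions; from any other database it must be qualified with that database name (e.g. gmall.getdaybegin, assuming it was created while gmall was current). DayBeginUDF's argument list is not shown here, so only the registration check is sketched:

    -- verify the function is visible; permanent functions persist across sessions
    describe function getdaybegin;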
3. Create a Hive table that reads JSON-formatted files
CREATE EXTERNAL TABLE ext_startup_logs(
    userId string,
    appPlatform string,
    appId string,
    startTimeInMs bigint,
    activeTimeInMs bigint,
    appVersion string,
    city string)
PARTITIONED BY (y string, m string, day string)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS TEXTFILE;
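Each partition then just points at a directory of files holding one JSON object per line; the SerDe maps JSON keys to the matching columns. A sketch with a hypothetical HDFS path:

    -- the location path is a placeholder; point it at the real log directory
    alter table ext_startup_logs add partition (y='2019', m='02', day='09')
    location '/user/hive/ext_startup_logs/2019/02/09';

    select userId, city
    from ext_startup_logs
    where y='2019' and m='02' and day='09'
    limit 10;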