Hive: Parsing Event Logs with Custom UDF and UDTF Functions

1 Sample Data

1566461617106|{
    "cm":{
        "ln":"-96.6",
        "sv":"V2.1.6",
        "os":"8.1.3",
        "g":"REX44511@gmail.com",
        "mid":"992",
        "nw":"WIFI",
        "l":"pt",
        "vc":"5",
        "hw":"750*1134",
        "ar":"MX",
        "uid":"992",
        "t":"1566442996963",
        "la":"13.9",
        "md":"HTC-8",
        "vn":"1.1.2",
        "ba":"HTC",
        "sr":"R"
    },
    "ap":"app",
    "et":[
        {
            "ett":"1566438762478",
            "en":"newsdetail",
            "kv":{
                "entry":"3",
                "goodsid":"235",
                "news_staytime":"6",
                "loading_time":"3",
                "action":"1",
                "showtype":"2",
                "category":"48",
                "type1":""
            }
        },
        {
            "ett":"1566449474310",
            "en":"ad",
            "kv":{
                "entry":"2",
                "show_style":"5",
                "action":"3",
                "detail":"",
                "source":"3",
                "behavior":"2",
                "content":"1",
                "newstype":"7"
            }
        },
        {
            "ett":"1566389641833",
            "en":"notification",
            "kv":{
                "ap_time":"1566387527480",
                "action":"1",
                "type":"2",
                "content":""
            }
        },
        {
            "ett":"1566398913612",
            "en":"active_foreground",
            "kv":{
                "access":"1",
                "push_id":"1"
            }
        },
        {
            "ett":"1566430994142",
            "en":"praise",
            "kv":{
                "target_id":6,
                "id":0,
                "type":2,
                "add_time":"1566371286636",
                "userid":4
            }
        }
    ]
}

timestamp|{
cm:{}   // common fields
ap:{}   // data source
et:[{}] // collection of event logs
}

2 Requirements Analysis

2.1 Define a UDF to Extract the Value of a Field from the JSON

UDF: a user-defined function that maps one row in to one row out.

package com.gc.function;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * Created by guochao on 2019/8/24.
 */
public class GetValueUDF extends UDF {
    // Parse the event-log JSON and return the value for the given key.
    public String evaluate(String json, String jsonkey) {
        String result = "";
        if (StringUtils.isEmpty(json) || StringUtils.isEmpty(jsonkey)) {
            return result;
        }
        String[] split = json.split("\\|");
        try {
            JSONObject jsonObject = new JSONObject(split[1].trim());

            if (jsonObject.has(jsonkey)) { // top-level fields of the log
                result = jsonObject.getString(jsonkey);
            } else if ("st".equals(jsonkey)) {
                result = split[0]; // the timestamp field
            } else {
                // Look the key up among the common fields.
                JSONObject cm = jsonObject.getJSONObject("cm"); // common fields
                if (cm.has(jsonkey)) {
                    result = cm.getString(jsonkey);
                }
            }

        } catch (JSONException e) {
            e.printStackTrace();
        }
        return result;
    }

    public static void main(String[] args) {
        String key="1566461617106|{\"cm\":{\"ln\":\"-92.7\",\"sv\":\"V2.6.0\",\"os\":\"8.1.2\",\"g\":\"H8694YIR@gmail.com\",\"mid\":\"988\",\"nw\":\"3G\",\"l\":\"es\",\"vc\":\"2\",\"hw\":\"1080*1920\",\"ar\":\"MX\",\"uid\":\"988\",\"t\":\"1566432661347\",\"la\":\"-42.1\",\"md\":\"Huawei-4\",\"vn\":\"1.2.0\",\"ba\":\"Huawei\",\"sr\":\"O\"},\"ap\":\"app\",\"et\":[{\"ett\":\"1566413607222\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"1\",\"goodsid\":\"234\",\"news_staytime\":\"6\",\"loading_time\":\"0\",\"action\":\"3\",\"showtype\":\"3\",\"category\":\"56\",\"type1\":\"\"}},{\"ett\":\"1566452826270\",\"en\":\"loading\",\"kv\":{\"extend2\":\"\",\"loading_time\":\"9\",\"action\":\"3\",\"extend1\":\"\",\"type\":\"2\",\"type1\":\"\",\"loading_way\":\"2\"}},{\"ett\":\"1566367597434\",\"en\":\"notification\",\"kv\":{\"ap_time\":\"1566424725932\",\"action\":\"4\",\"type\":\"4\",\"content\":\"\"}},{\"ett\":\"1566370724555\",\"en\":\"active_background\",\"kv\":{\"active_source\":\"3\"}},{\"ett\":\"1566413177347\",\"en\":\"comment\",\"kv\":{\"p_comment_id\":1,\"addtime\":\"1566460762369\",\"praise_count\":401,\"other_id\":7,\"comment_id\":3,\"reply_count\":87,\"userid\":5,\"content\":\"遗甜港蹦辐铣\"}},{\"ett\":\"1566369789031\",\"en\":\"praise\",\"kv\":{\"target_id\":1,\"id\":7,\"type\":4,\"add_time\":\"1566440220563\",\"userid\":6}}]}";
        String et = new GetValueUDF().evaluate(key, "et");
        System.out.println("et = " + et);
    }
}

Note: the method must be named evaluate and it must have a return value (returning null is allowed). You can trace this requirement in the UDF class: UDF -> constructor -> DefaultUDFMethodResolver -> the code below.
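For reference, here is the relevant resolver code, lightly abbreviated from Hive 1.2.x's org.apache.hadoop.hive.ql.exec.DefaultUDFMethodResolver; it resolves the UDF's implementation by reflection against a hard-coded method name:

public class DefaultUDFMethodResolver implements UDFMethodResolver {

    // The class of the UDF being resolved.
    private final Class<? extends UDF> udfClass;

    public DefaultUDFMethodResolver(Class<? extends UDF> udfClass) {
        this.udfClass = udfClass;
    }

    @Override
    public Method getEvalMethod(List<TypeInfo> argClasses) throws UDFArgumentException {
        // "evaluate" is hard-coded here, which is why the UDF method must use that exact name.
        return FunctionRegistry.getMethodInternal(udfClass, "evaluate", false, argClasses);
    }
}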

2.2 Define a UDTF to Extract Each Event from the Event Array

package com.gc.function;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by guochao on 2019/8/24.
 */
public class GetValueUDTF extends GenericUDTF {
    // Core processing logic.
    // The input is the JSON array of events.
    @Override
    public void process(Object[] objects) {
        // Multiple arguments may be passed in; this UDTF is one-in-many-out, so only the first is used.
        if (objects != null && objects.length != 0) {
            try {
                JSONArray array = new JSONArray(objects[0].toString());
                String[] result = null;
                for (int i = 0; i < array.length(); i++) {

                    JSONObject jsonObject = array.getJSONObject(i);
                    // Get the event type of this event.
                    if (null != jsonObject && jsonObject.has("en")) {
                        String event_name = jsonObject.getString("en");
                        result = new String[]{event_name, array.getString(i)};
                        try {
                            System.out.println("event_name = " + event_name);
                            System.out.println("jsonObject = " + jsonObject);
                            forward(result);
                        } catch (HiveException e) {
                            System.out.println(e.getMessage());
                            continue;
                        }
                    }
                    }
                }
            } catch (JSONException e) {
                System.out.println(e.getMessage());
                e.printStackTrace();
            }
        }
    }
    @Override
    public void close() {

    }

    // Initialize: define the names and types of the output columns.
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        List<String> fieldNames = new ArrayList<String>();
        List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("event_name");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("event_json");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

//    public static void main(String[] args) {
//        new GetValueUDTF().process(new Object[]{"[{\"ett\":\"1566413607222\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"1\",\"goodsid\":\"234\",\"news_staytime\":\"6\",\"loading_time\":\"0\",\"action\":\"3\",\"showtype\":\"3\",\"category\":\"56\",\"type1\":\"\"}},{\"ett\":\"1566452826270\",\"en\":\"loading\",\"kv\":{\"extend2\":\"\",\"loading_time\":\"9\",\"action\":\"3\",\"extend1\":\"\",\"type\":\"2\",\"type1\":\"\",\"loading_way\":\"2\"}},{\"ett\":\"1566367597434\",\"en\":\"notification\",\"kv\":{\"ap_time\":\"1566424725932\",\"action\":\"4\",\"type\":\"4\",\"content\":\"\"}},{\"ett\":\"1566370724555\",\"en\":\"active_background\",\"kv\":{\"active_source\":\"3\"}},{\"ett\":\"1566413177347\",\"en\":\"comment\",\"kv\":{\"p_comment_id\":1,\"addtime\":\"1566460762369\",\"praise_count\":401,\"other_id\":7,\"comment_id\":3,\"reply_count\":87,\"userid\":5,\"content\":\"遗甜港蹦辐铣\"}},{\"ett\":\"1566369789031\",\"en\":\"praise\",\"kv\":{\"target_id\":1,\"id\":7,\"type\":4,\"add_time\":\"1566440220563\",\"userid\":6}}]"});
//    }
}

Note: the initialize method must also be implemented; it defines the names and types of the output columns.
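Incidentally, the commented-out main method above would fail with a NullPointerException if enabled: forward hands each row to a collector that Hive normally injects at runtime. For a quick local check outside Hive, a collector can be set by hand. A minimal sketch (the test class and sample event strings are illustrative, assuming it sits in the same package as GetValueUDTF):

package com.gc.function;

import org.apache.hadoop.hive.ql.udf.generic.Collector;

import java.util.Arrays;

public class GetValueUDTFTest {
    public static void main(String[] args) {
        GetValueUDTF udtf = new GetValueUDTF();
        // forward() delegates to this collector, so it must be set before process() is called.
        udtf.setCollector(new Collector() {
            @Override
            public void collect(Object input) {
                // Each row is the String[]{event_name, event_json} built in process().
                System.out.println(Arrays.toString((Object[]) input));
            }
        });
        udtf.process(new Object[]{
                "[{\"ett\":\"1566438762478\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"3\"}},"
                        + "{\"ett\":\"1566449474310\",\"en\":\"ad\",\"kv\":{\"action\":\"3\"}}]"
        });
    }
}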

3 Function Testing

3.1 Package the Project and Upload the Jar to HDFS

 hadoop fs -mkdir /user/hive/jars
 hadoop fs -put /opt/module/hive1.2.1/lib/hive-function.jar /user/hive/jars
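Listing the directory confirms the jar landed where the create function statements below expect it:

 hadoop fs -ls /user/hive/jars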

3.2 Register the Functions

3.2.1 Start HDFS and YARN

Using the Hadoop cluster group-start script (hadoop-shell is a custom script for this cluster):

hadoop-shell start

3.2.2 Start Hive

3.2.3 Create the Functions

create [temporary] function [dbname.]function_name AS class_name;

-- Function for extracting the common fields
create function gmall.base_analizer as 'com.gc.function.GetValueUDF' using jar 'hdfs://hadoop102:9000/user/hive/jars/hive-function.jar';
-- Function for exploding the event array
create function gmall.event_analizer as 'com.gc.function.GetValueUDTF' using jar 'hdfs://hadoop102:9000/user/hive/jars/hive-function.jar';
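To confirm the functions were registered, describing them should show the bound class (a quick check; output format varies by Hive version):

desc function gmall.base_analizer;
desc function gmall.event_analizer;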

3.3 Test the Functions

select base_analizer(line, "sv") as t,
       event_name,
       event_json
from gmall.ods_event_log
lateral view event_analizer(base_analizer(line, "et")) tmp as event_name, event_json
limit 3;

The test run produces the following output:

+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+
|    t    |     event_name     |                                                                     event_json                                                                      |
+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+
| V2.0.3  | display            | {"ett":"1566522643775","en":"display","kv":{"goodsid":"0","action":"1","extend1":"1","place":"0","category":"58"}}                                  |
| V2.0.3  | loading            | {"ett":"1566496875003","en":"loading","kv":{"extend2":"","loading_time":"2","action":"3","extend1":"","type":"3","type1":"433","loading_way":"1"}}  |
| V2.0.3  | active_foreground  | {"ett":"1566462987798","en":"active_foreground","kv":{"access":"1","push_id":"3"}}                                                                  |
+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+

Summary:

  • event_analizer is one-in-many-out (a UDTF) and must be used together with lateral view
  • base_analizer is one-in-one-out (a UDF)
  • as "t" fails to execute; the alias must be unquoted, i.e. as t
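One last practical note: if the jar is rebuilt, the old definitions can be dropped and recreated; drop function if exists is standard HiveQL:

drop function if exists gmall.base_analizer;
drop function if exists gmall.event_analizer;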