Hive: Parsing Event Logs with Custom UDF and UDTF Functions

1 Sample Data

1566461617106|{
    "cm":{
        "ln":"-96.6",
        "sv":"V2.1.6",
        "os":"8.1.3",
        "g":"REX44511@gmail.com",
        "mid":"992",
        "nw":"WIFI",
        "l":"pt",
        "vc":"5",
        "hw":"750*1134",
        "ar":"MX",
        "uid":"992",
        "t":"1566442996963",
        "la":"13.9",
        "md":"HTC-8",
        "vn":"1.1.2",
        "ba":"HTC",
        "sr":"R"
    },
    "ap":"app",
    "et":[
        {
            "ett":"1566438762478",
            "en":"newsdetail",
            "kv":{
                "entry":"3",
                "goodsid":"235",
                "news_staytime":"6",
                "loading_time":"3",
                "action":"1",
                "showtype":"2",
                "category":"48",
                "type1":""
            }
        },
        {
            "ett":"1566449474310",
            "en":"ad",
            "kv":{
                "entry":"2",
                "show_style":"5",
                "action":"3",
                "detail":"",
                "source":"3",
                "behavior":"2",
                "content":"1",
                "newstype":"7"
            }
        },
        {
            "ett":"1566389641833",
            "en":"notification",
            "kv":{
                "ap_time":"1566387527480",
                "action":"1",
                "type":"2",
                "content":""
            }
        },
        {
            "ett":"1566398913612",
            "en":"active_foreground",
            "kv":{
                "access":"1",
                "push_id":"1"
            }
        },
        {
            "ett":"1566430994142",
            "en":"praise",
            "kv":{
                "target_id":6,
                "id":0,
                "type":2,
                "add_time":"1566371286636",
                "userid":4
            }
        }
    ]
}

timestamp|{
cm:{}   // common fields
ap:{}   // data source
et:[{}] // collection of event logs
}

2 Requirements Analysis

2.1 Define a UDF to Extract the Value of a Field from the JSON

UDF: a user-defined function that maps one row in to one row out.

package com.gc.function;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * Created by guochao on 2019/8/24.
 */
public class GetValueUDF extends UDF {
    // Parse the event-log JSON and return the value for the given key.
    public String evaluate(String json, String jsonkey) {
        String result = "";
        if (StringUtils.isEmpty(json) || StringUtils.isEmpty(jsonkey)) {
            return result;
        }
        String[] split = json.split("\\|");
        try {
            JSONObject jsonObject = new JSONObject(split[1].trim());

            if (jsonObject.has(jsonkey)) { // top-level fields of the log
                result = jsonObject.getString(jsonkey);
            } else if ("st".equals(jsonkey)) {
                result = split[0]; // the timestamp field
            } else {
                // Look the key up among the common fields.
                JSONObject cm = jsonObject.getJSONObject("cm"); // common fields
                if (cm.has(jsonkey)) {
                    result = cm.getString(jsonkey);
                }
            }

        } catch (JSONException e) {
            e.printStackTrace();
        }
        return result;
    }

    public static void main(String[] args) {
        String key="1566461617106|{\"cm\":{\"ln\":\"-92.7\",\"sv\":\"V2.6.0\",\"os\":\"8.1.2\",\"g\":\"H8694YIR@gmail.com\",\"mid\":\"988\",\"nw\":\"3G\",\"l\":\"es\",\"vc\":\"2\",\"hw\":\"1080*1920\",\"ar\":\"MX\",\"uid\":\"988\",\"t\":\"1566432661347\",\"la\":\"-42.1\",\"md\":\"Huawei-4\",\"vn\":\"1.2.0\",\"ba\":\"Huawei\",\"sr\":\"O\"},\"ap\":\"app\",\"et\":[{\"ett\":\"1566413607222\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"1\",\"goodsid\":\"234\",\"news_staytime\":\"6\",\"loading_time\":\"0\",\"action\":\"3\",\"showtype\":\"3\",\"category\":\"56\",\"type1\":\"\"}},{\"ett\":\"1566452826270\",\"en\":\"loading\",\"kv\":{\"extend2\":\"\",\"loading_time\":\"9\",\"action\":\"3\",\"extend1\":\"\",\"type\":\"2\",\"type1\":\"\",\"loading_way\":\"2\"}},{\"ett\":\"1566367597434\",\"en\":\"notification\",\"kv\":{\"ap_time\":\"1566424725932\",\"action\":\"4\",\"type\":\"4\",\"content\":\"\"}},{\"ett\":\"1566370724555\",\"en\":\"active_background\",\"kv\":{\"active_source\":\"3\"}},{\"ett\":\"1566413177347\",\"en\":\"comment\",\"kv\":{\"p_comment_id\":1,\"addtime\":\"1566460762369\",\"praise_count\":401,\"other_id\":7,\"comment_id\":3,\"reply_count\":87,\"userid\":5,\"content\":\"遗甜港蹦辐铣\"}},{\"ett\":\"1566369789031\",\"en\":\"praise\",\"kv\":{\"target_id\":1,\"id\":7,\"type\":4,\"add_time\":\"1566440220563\",\"userid\":6}}]}";
        String et = new GetValueUDF().evaluate(key, "et");
        System.out.println("et = " + et);
    }
}

Note: the method must be named evaluate and it must have a return value (returning null is allowed). You can trace this requirement in the UDF class: UDF -> constructor -> DefaultUDFMethodResolver -> the code below.
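For reference, here is the relevant resolver code, lightly abbreviated from Hive 1.2.x's org.apache.hadoop.hive.ql.exec.DefaultUDFMethodResolver; it resolves the UDF's implementation by reflection against a hard-coded method name:

public class DefaultUDFMethodResolver implements UDFMethodResolver {

    // The class of the UDF being resolved.
    private final Class<? extends UDF> udfClass;

    public DefaultUDFMethodResolver(Class<? extends UDF> udfClass) {
        this.udfClass = udfClass;
    }

    @Override
    public Method getEvalMethod(List<TypeInfo> argClasses) throws UDFArgumentException {
        // "evaluate" is hard-coded here, which is why the UDF method must use that exact name.
        return FunctionRegistry.getMethodInternal(udfClass, "evaluate", false, argClasses);
    }
}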

2.2 Define a UDTF to Extract Each Event from the Event Array

package com.gc.function;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by guochao on 2019/8/24.
 */
public class GetValueUDTF extends GenericUDTF {
    // Core processing logic.
    // The input is the JSON array of events.
    @Override
    public void process(Object[] objects) {
        // Multiple arguments may be passed in; this UDTF is one-in-many-out, so only the first is used.
        if (objects != null && objects.length != 0) {
            try {
                JSONArray array = new JSONArray(objects[0].toString());
                String[] result = null;
                for (int i = 0; i < array.length(); i++) {

                    JSONObject jsonObject = array.getJSONObject(i);
                    // Get the event type of this event.
                    if (null != jsonObject && jsonObject.has("en")) {
                        String event_name = jsonObject.getString("en");
                        result = new String[]{event_name, array.getString(i)};
                        try {
                            System.out.println("event_name = " + event_name);
                            System.out.println("jsonObject = " + jsonObject);
                            forward(result);
                        } catch (HiveException e) {
                            System.out.println(e.getMessage());
                            continue;
                        }
                    }
                    }
                }
            } catch (JSONException e) {
                System.out.println(e.getMessage());
                e.printStackTrace();
            }
        }
    }
    @Override
    public void close() {

    }

    // Initialize: define the names and types of the output columns.
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        List<String> fieldNames = new ArrayList<String>();
        List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("event_name");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("event_json");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

//    public static void main(String[] args) {
//        new GetValueUDTF().process(new Object[]{"[{\"ett\":\"1566413607222\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"1\",\"goodsid\":\"234\",\"news_staytime\":\"6\",\"loading_time\":\"0\",\"action\":\"3\",\"showtype\":\"3\",\"category\":\"56\",\"type1\":\"\"}},{\"ett\":\"1566452826270\",\"en\":\"loading\",\"kv\":{\"extend2\":\"\",\"loading_time\":\"9\",\"action\":\"3\",\"extend1\":\"\",\"type\":\"2\",\"type1\":\"\",\"loading_way\":\"2\"}},{\"ett\":\"1566367597434\",\"en\":\"notification\",\"kv\":{\"ap_time\":\"1566424725932\",\"action\":\"4\",\"type\":\"4\",\"content\":\"\"}},{\"ett\":\"1566370724555\",\"en\":\"active_background\",\"kv\":{\"active_source\":\"3\"}},{\"ett\":\"1566413177347\",\"en\":\"comment\",\"kv\":{\"p_comment_id\":1,\"addtime\":\"1566460762369\",\"praise_count\":401,\"other_id\":7,\"comment_id\":3,\"reply_count\":87,\"userid\":5,\"content\":\"遗甜港蹦辐铣\"}},{\"ett\":\"1566369789031\",\"en\":\"praise\",\"kv\":{\"target_id\":1,\"id\":7,\"type\":4,\"add_time\":\"1566440220563\",\"userid\":6}}]"});
//    }
}

Note: the initialize method must also be implemented; it defines the names and types of the output columns.
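Incidentally, the commented-out main method above would fail with a NullPointerException if enabled: forward hands each row to a collector that Hive normally injects at runtime. For a quick local check outside Hive, a collector can be set by hand. A minimal sketch (the test class and sample event strings are illustrative, assuming it sits in the same package as GetValueUDTF):

package com.gc.function;

import org.apache.hadoop.hive.ql.udf.generic.Collector;

import java.util.Arrays;

public class GetValueUDTFTest {
    public static void main(String[] args) {
        GetValueUDTF udtf = new GetValueUDTF();
        // forward() delegates to this collector, so it must be set before process() is called.
        udtf.setCollector(new Collector() {
            @Override
            public void collect(Object input) {
                // Each row is the String[]{event_name, event_json} built in process().
                System.out.println(Arrays.toString((Object[]) input));
            }
        });
        udtf.process(new Object[]{
                "[{\"ett\":\"1566438762478\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"3\"}},"
                        + "{\"ett\":\"1566449474310\",\"en\":\"ad\",\"kv\":{\"action\":\"3\"}}]"
        });
    }
}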

3 Function Testing

3.1 Package the Project and Upload the Jar to HDFS

 hadoop fs -mkdir /user/hive/jars
 hadoop fs -put /opt/module/hive1.2.1/lib/hive-function.jar /user/hive/jars
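Listing the directory confirms the jar landed where the create function statements below expect it:

 hadoop fs -ls /user/hive/jars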

3.2 Register the Functions

3.2.1 Start HDFS and YARN

Using the Hadoop cluster group-start script (hadoop-shell is a custom script for this cluster):

hadoop-shell start

3.2.2 Start Hive

3.2.3 Create the Functions

create [temporary] function [dbname.]function_name AS class_name;

-- Function for extracting the common fields
create function gmall.base_analizer as 'com.gc.function.GetValueUDF' using jar 'hdfs://hadoop102:9000/user/hive/jars/hive-function.jar';
-- Function for exploding the event array
create function gmall.event_analizer as 'com.gc.function.GetValueUDTF' using jar 'hdfs://hadoop102:9000/user/hive/jars/hive-function.jar';
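To confirm the functions were registered, describing them should show the bound class (a quick check; output format varies by Hive version):

desc function gmall.base_analizer;
desc function gmall.event_analizer;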

3.3 Test the Functions

select base_analizer(line, "sv") as t,
       event_name,
       event_json
from gmall.ods_event_log
lateral view event_analizer(base_analizer(line, "et")) tmp as event_name, event_json
limit 3;

The test run produces the following output:

+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+
|    t    |     event_name     |                                                                     event_json                                                                      |
+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+
| V2.0.3  | display            | {"ett":"1566522643775","en":"display","kv":{"goodsid":"0","action":"1","extend1":"1","place":"0","category":"58"}}                                  |
| V2.0.3  | loading            | {"ett":"1566496875003","en":"loading","kv":{"extend2":"","loading_time":"2","action":"3","extend1":"","type":"3","type1":"433","loading_way":"1"}}  |
| V2.0.3  | active_foreground  | {"ett":"1566462987798","en":"active_foreground","kv":{"access":"1","push_id":"3"}}                                                                  |
+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+

Summary:

  • event_analizer is one-in-many-out (a UDTF) and must be used together with lateral view
  • base_analizer is one-in-one-out (a UDF)
  • as "t" fails to execute; the alias must be unquoted, i.e. as t
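One last practical note: if the jar is rebuilt, the old definitions can be dropped and recreated; drop function if exists is standard HiveQL:

drop function if exists gmall.base_analizer;
drop function if exists gmall.event_analizer;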