A pitfall with custom UDTFs on spark-hive

A while ago I migrated some scripts from Hive to spark-hive. The business logic needs a UDTF to process the data, and our company runs Spark 2.4.

The UDTF code is as follows:

package cn.qihoo360.web.udf;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class ProScanLeft extends GenericUDTF {

	@Override
	public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
		List<? extends StructField> allStructFieldRefs = argOIs.getAllStructFieldRefs();
		if (allStructFieldRefs.size() != 1) {
			throw new UDFArgumentException("This function takes exactly one argument");
		}

		if (!"string".equals(allStructFieldRefs.get(0).getFieldObjectInspector().getTypeName())) {
			throw new UDFArgumentException("The argument must be a string");
		}

		ArrayList<String> fieldNames = new ArrayList<String>();
		ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
		fieldNames.add("p0");
		fieldNames.add("p1");
		fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
	}

	@Override
	public void process(Object[] args) throws HiveException {

		String srcString = args[0] + "";

		// The input looks like "p0#a_b_c": a key, then '#', then an
		// underscore-separated list.
		String[] p = srcString.trim().split("#");

		String p0 = p[0];

		if (p.length >= 2) {
			String strList1 = p[1];

			String[] p1Arr = strList1.split("_");

			// Build the cumulative prefixes: "p0#", "p0#_a", "p0#_a_b", ...
			List<String> list = new LinkedList<String>();
			list.add(p0 + "#");

			for (String str : p1Arr) {
				list.add(list.get(list.size() - 1) + "_" + str);
			}

			// Normalize each prefix: "p0#_a" becomes "p0#a", and a trailing
			// '#' is stripped so the bare key "p0#" becomes "p0".
			List<String> list2 = new LinkedList<String>();
			for (String str : list) {
				list2.add(str.replace("#_", "#").replaceFirst("#$", ""));
			}

			// Emit one output row per prefix.
			for (String p1 : list2) {
				String[] res = new String[2];
				res[0] = p0;
				res[1] = p1;

				forward(res);
			}
		} else {
			// No list part: emit the key itself in both columns.
			String[] res = new String[2];
			res[0] = p0;
			res[1] = p0;

			forward(res);
		}
	}

	@Override
	public void close() throws HiveException {
		// no resources to release
	}
}
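
To make the intended behavior concrete: for an input like a#b_c_d, process() splits off the key a and emits one (p0, p1) row per cumulative prefix of the underscore-separated list:

p0	p1
a	a
a	a#b
a	a#b_c
a	a#b_c_d

(The first row comes from the bare key "a#", which the normalization step reduces to "a".)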

I packaged this code into a jar, uploaded it to HDFS, and successfully created the function.
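For context, registration and invocation looked roughly like this (the jar path, function name, and table/column names here are illustrative, not the real ones):

CREATE FUNCTION pro_scan_left AS 'cn.qihoo360.web.udf.ProScanLeft'
USING JAR 'hdfs:///path/to/udfs.jar';  -- illustrative path

SELECT s.p0, s.p1
FROM some_table t
LATERAL VIEW pro_scan_left(t.col) s AS p0, p1;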
Running the query then failed with:

Please make sure your function overrides `public StructObjectInspector initialize(ObjectInspector[] args)`.; line 6 pos 0

With Hive as the execution engine, the same query ran without error. Digging in, I found that GenericUDTF declares two initialize methods. Hive calls initialize(StructObjectInspector argOIs), whose default implementation unwraps the struct's field inspectors and delegates to the deprecated initialize(ObjectInspector[] argOIs). Spark, however, invokes the deprecated initialize(ObjectInspector[] argOIs) directly, and its default implementation throws immediately. So under Spark you must override initialize(ObjectInspector[] argOIs); my override of the StructObjectInspector variant was simply never called.
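
The relevant part of Hive's GenericUDTF looks roughly like this (paraphrased; details vary slightly across Hive versions):

	// Deprecated variant. Its default implementation just throws, and this
	// is the method Spark's HiveGenericUDTF wrapper invokes directly.
	@Deprecated
	public StructObjectInspector initialize(ObjectInspector[] argOIs)
			throws UDFArgumentException {
		throw new IllegalStateException("Should not be called directly");
	}

	// The variant Hive calls. By default it unwraps the struct's field
	// inspectors and delegates to the deprecated array variant above.
	public StructObjectInspector initialize(StructObjectInspector argOIs)
			throws UDFArgumentException {
		List<? extends StructField> inputFields = argOIs.getAllStructFieldRefs();
		ObjectInspector[] udtfInputOIs = new ObjectInspector[inputFields.size()];
		for (int i = 0; i < inputFields.size(); i++) {
			udtfInputOIs[i] = inputFields.get(i).getFieldObjectInspector();
		}
		return initialize(udtfInputOIs);
	}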
So I changed the code as follows:

package cn.qihoo360.web.udf;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;


public class ProScanLeft2 extends GenericUDTF {

	@Override
	public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
		if (argOIs.length != 1) {
			throw new UDFArgumentException("This function takes exactly one argument");
		}

		// Inspect the first (and only) argument.
		ObjectInspector firstArg = argOIs[0];
		if (!(firstArg instanceof StringObjectInspector)) {
			throw new UDFArgumentException("The argument must be a string");
		}

		ArrayList<String> fieldNames = new ArrayList<String>();
		ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
		fieldNames.add("p0");
		fieldNames.add("p1");
		fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
	}

	@Override
	public void process(Object[] args) throws HiveException {

		String srcString = args[0] + "";

		// The input looks like "p0#a_b_c": a key, then '#', then an
		// underscore-separated list.
		String[] p = srcString.trim().split("#");

		String p0 = p[0];

		if (p.length >= 2) {
			String strList1 = p[1];

			String[] p1Arr = strList1.split("_");

			// Build the cumulative prefixes: "p0#", "p0#_a", "p0#_a_b", ...
			List<String> list = new LinkedList<String>();
			list.add(p0 + "#");

			for (String str : p1Arr) {
				list.add(list.get(list.size() - 1) + "_" + str);
			}

			// Normalize each prefix: "p0#_a" becomes "p0#a", and a trailing
			// '#' is stripped so the bare key "p0#" becomes "p0".
			List<String> list2 = new LinkedList<String>();
			for (String str : list) {
				list2.add(str.replace("#_", "#").replaceFirst("#$", ""));
			}

			// Emit one output row per prefix.
			for (String p1 : list2) {
				String[] res = new String[2];
				res[0] = p0;
				res[1] = p1;

				forward(res);
			}
		} else {
			// No list part: emit the key itself in both columns.
			String[] res = new String[2];
			res[0] = p0;
			res[1] = p0;

			forward(res);
		}
	}

	@Override
	public void close() throws HiveException {
		// no resources to release
	}
}
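
After repackaging the jar and re-creating the function, the same query runs under both Hive and Spark. A quick sanity check against a literal input (function name again illustrative):

SELECT s.p0, s.p1
FROM (SELECT 'a#b_c_d' AS col) t
LATERAL VIEW pro_scan_left2(col) s AS p0, p1;

This produces the four prefix rows shown earlier.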

