A script that previously ran on Hive was migrated to run on spark-hive.
The business logic requires a UDTF to process the data.
Our Spark version is 2.4.
The UDTF code is as follows:
package cn.qihoo360.web.udf;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class ProScanLeft extends GenericUDTF {

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Expect exactly one string argument.
        List<? extends StructField> allStructFieldRefs = argOIs.getAllStructFieldRefs();
        if (allStructFieldRefs.size() != 1) {
            throw new UDFArgumentException("This function takes exactly one parameter");
        }
        if (!"string".equals(allStructFieldRefs.get(0).getFieldObjectInspector().getTypeName())) {
            throw new UDFArgumentException("The parameter must be a string");
        }
        // Output schema: two string columns, p0 and p1.
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("p0");
        fieldNames.add("p1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // Input format "p0#seg1_seg2_...": emit one row (p0, prefix) for each
        // "_"-separated prefix of the part after "#".
        String srcString = args[0] + ""; // null-safe toString
        String[] p = srcString.trim().split("#");
        String p0 = p[0];
        if (p.length >= 2) {
            String[] p1Arr = p[1].split("_");
            // Build progressively longer strings: "p0#", "p0#_seg1", "p0#_seg1_seg2", ...
            List<String> list = new LinkedList<String>();
            list.add(p0 + "#");
            for (String str : p1Arr) {
                list.add(list.get(list.size() - 1) + "_" + str);
            }
            // Normalize the separators: "p0#_seg1" -> "p0#seg1"; a bare "p0#" -> "p0".
            List<String> list2 = new LinkedList<String>();
            for (String str : list) {
                list2.add(str.replace("#_", "#").replaceFirst("#$", ""));
            }
            for (String p1 : list2) {
                forward(new String[] { p0, p1 });
            }
        } else {
            // No "#" segment: emit the value as both columns.
            forward(new String[] { p0, p0 });
        }
    }

    @Override
    public void close() throws HiveException {
        // Nothing to clean up.
    }
}
The code was packaged into a jar, uploaded to HDFS, and the function was created successfully.
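For context, creating and invoking such a function typically looks like the sketch below; the function name, table name, and jar path here are hypothetical placeholders, not the actual ones used:

-- Hypothetical names and jar path, for illustration only.
CREATE FUNCTION pro_scan_left AS 'cn.qihoo360.web.udf.ProScanLeft'
USING JAR 'hdfs:///path/to/udf.jar';

-- A UDTF is typically invoked via LATERAL VIEW:
SELECT t.src, v.p0, v.p1
FROM some_table t
LATERAL VIEW pro_scan_left(t.src) v AS p0, p1;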
Executing the SQL fails with:
Please make sure your function overrides `public StructObjectInspector initialize(ObjectInspector[] args)`.; line 6 pos 0
With Hive as the execution engine, the same SQL runs without error.
Digging into this, I found that
GenericUDTF declares two initialize methods:
Spark calls the deprecated initialize(ObjectInspector[] argOIs), whose default implementation throws an exception outright; that is why one normally overrides initialize(StructObjectInspector argOIs) instead, which is exactly what I had done. But since Spark invokes the array overload directly, my override is never called, and the default implementation's exception is what surfaces.
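For reference, the relevant part of Hive's GenericUDTF looks roughly like this (paraphrased from the Hive source; the exact code varies by Hive version, but the shape is the point):

// Paraphrased sketch of org.apache.hadoop.hive.ql.udf.generic.GenericUDTF.
import java.util.List;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

public abstract class GenericUDTFSketch {

    // Non-deprecated overload: unpacks the struct's field inspectors and
    // delegates to the array overload. Hive's own execution path calls this.
    public StructObjectInspector initialize(StructObjectInspector argOIs)
            throws UDFArgumentException {
        List<? extends StructField> inputFields = argOIs.getAllStructFieldRefs();
        ObjectInspector[] udtfInputOIs = new ObjectInspector[inputFields.size()];
        for (int i = 0; i < inputFields.size(); i++) {
            udtfInputOIs[i] = inputFields.get(i).getFieldObjectInspector();
        }
        return initialize(udtfInputOIs);
    }

    // Deprecated overload: throws unless overridden. Spark 2.4's
    // HiveGenericUDTF calls this one directly, skipping the overload above.
    @Deprecated
    public StructObjectInspector initialize(ObjectInspector[] argOIs)
            throws UDFArgumentException {
        throw new IllegalStateException("Should not be called directly");
    }
}

So under Hive, the struct overload is the entry point and my override worked; under Spark 2.4, the array overload is the entry point, and its default body is where the exception above comes from.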
So I modified the code as follows:
package cn.qihoo360.web.udf;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;

public class ProScanLeft2 extends GenericUDTF {

    // Override the deprecated array-based overload: this is the one
    // Spark 2.4 actually invokes.
    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        if (argOIs.length != 1) {
            throw new UDFArgumentException("This function takes exactly one parameter");
        }
        // Validate the first (and only) argument.
        ObjectInspector firstArg = argOIs[0];
        if (!(firstArg instanceof StringObjectInspector)) {
            throw new UDFArgumentException("The parameter must be a string");
        }
        // Output schema: two string columns, p0 and p1.
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("p0");
        fieldNames.add("p1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // Same prefix-expansion logic as before: "p0#seg1_seg2_..." produces
        // one row (p0, prefix) per "_"-separated prefix of the part after "#".
        String srcString = args[0] + ""; // null-safe toString
        String[] p = srcString.trim().split("#");
        String p0 = p[0];
        if (p.length >= 2) {
            String[] p1Arr = p[1].split("_");
            List<String> list = new LinkedList<String>();
            list.add(p0 + "#");
            for (String str : p1Arr) {
                list.add(list.get(list.size() - 1) + "_" + str);
            }
            List<String> list2 = new LinkedList<String>();
            for (String str : list) {
                list2.add(str.replace("#_", "#").replaceFirst("#$", ""));
            }
            for (String p1 : list2) {
                forward(new String[] { p0, p1 });
            }
        } else {
            forward(new String[] { p0, p0 });
        }
    }

    @Override
    public void close() throws HiveException {
        // Nothing to clean up.
    }
}
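With the array-based overload overridden, Spark resolves and runs the function. As a quick sanity check of the prefix-expansion logic (hypothetical function name, literal input), the expected rows can be derived directly from the process() code:

-- Hypothetical function name; input is the literal 'a#x_y_z'.
SELECT v.p0, v.p1
FROM (SELECT 'a#x_y_z' AS src) t
LATERAL VIEW pro_scan_left2(t.src) v AS p0, p1;
-- Expected output:
--   a    a
--   a    a#x
--   a    a#x_y
--   a    a#x_y_z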