需求
统计一个字符串在另一个字符串中出现的次数,比如:"a or b or c"中出现了多少个"or"
分析
Hive内置函数中没有可以直接统计子串出现次数的函数,用多个内置函数嵌套来实现又比较麻烦,所以需要自定义UDF来实现
代码
package udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
/**
 * Hive generic UDF that counts the non-overlapping occurrences of one string
 * inside another, e.g. {@code UDFCntStr('a or b or c', 'or')} yields {@code 2}.
 *
 * <p>Returns NULL when either argument is NULL and 0 when the search string is
 * empty.
 *
 * @author xuezhouyi
 * @version V1.0
 */
public class UDFCntStr extends GenericUDF {
    /** Inspector used to read the first argument (the text to search in). */
    StringObjectInspector line;
    /** Inspector used to read the second argument (the text to search for). */
    StringObjectInspector word;

    /**
     * Validates the argument list and stores the inspectors needed by
     * {@link #evaluate(DeferredObject[])}.
     *
     * @param arguments inspectors for the call's arguments; exactly two are required
     * @return the Java int object inspector, describing the result type
     * @throws UDFArgumentLengthException if the number of arguments is not 2
     * @throws UDFArgumentException if either argument is not inspected as a string
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        if (arguments.length != 2) {
            throw new UDFArgumentLengthException("UDFCntStr takes exactly 2 arguments: string, string");
        }
        /* Check that both arguments are of the expected type. */
        ObjectInspector a = arguments[0];
        ObjectInspector b = arguments[1];
        if (!(a instanceof StringObjectInspector) || !(b instanceof StringObjectInspector)) {
            throw new UDFArgumentException("first argument must be a string, second argument must be a string");
        }
        this.line = (StringObjectInspector) a;
        this.word = (StringObjectInspector) b;
        /* The result is an int, so hand back the matching object inspector. */
        return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
    }

    /**
     * Counts how many times the second argument occurs in the first.
     *
     * @param arguments the deferred values of the two string arguments
     * @return the occurrence count as an {@code Integer}, or {@code null} when
     *         either argument is NULL
     * @throws HiveException if a deferred argument cannot be evaluated
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        /* Use the stored object inspectors to extract the two Java strings. */
        String line = this.line.getPrimitiveJavaObject(arguments[0].get());
        String word = this.word.getPrimitiveJavaObject(arguments[1].get());
        /* NULL in, NULL out: avoids a NullPointerException on NULL column values. */
        if (line == null || word == null) {
            return null;
        }
        return fun(line, word, 0, 0);
    }

    /**
     * Builds the text shown for this call in query plans.
     *
     * @param children display strings of the argument expressions
     * @return the function name followed by its comma-separated arguments
     */
    @Override
    public String getDisplayString(String[] children) {
        StringBuilder sb = new StringBuilder("UDFCntStr(");
        if (children != null) {
            for (int i = 0; i < children.length; i++) {
                if (i > 0) {
                    sb.append(", ");
                }
                sb.append(children[i]);
            }
        }
        return sb.append(')').toString();
    }

    /**
     * Counts non-overlapping occurrences of {@code s2} in {@code s1}.
     *
     * @param s1 the text to search in
     * @param s2 the text to search for
     * @param start index in {@code s1} at which the search begins
     * @param count number of occurrences already counted; added to the result
     * @return {@code count} plus the occurrences found from {@code start} onwards
     */
    private static int fun(String s1, String s2, int start, int count) {
        /* An empty pattern matches at every index without advancing, so it would never terminate. */
        if (s2.isEmpty()) {
            return count;
        }
        /* Iterate instead of recursing: no stack growth and no substring copies. */
        int from = start;
        int idx;
        while ((idx = s1.indexOf(s2, from)) != -1) {
            count++;
            /* Resume just past this match so occurrences never overlap. */
            from = idx + s2.length();
        }
        return count;
    }
}
使用Maven打包并上传到服务器,然后在Hive中添加jar包
add jar /home/hadoop/tmp/hive-1.0-SNAPSHOT.jar;
创建临时函数
create temporary function UDFCntStr as 'udf.UDFCntStr';
最终效果

Hive自定义UDF统计字符串出现次数
本文介绍了如何在Hive中自定义UDF函数来统计一个字符串在另一个字符串中出现的次数,详细讲解了从代码编写、Maven打包到Hive中创建临时函数的整个过程。
1496

被折叠的 条评论
为什么被折叠?



