Group-wise Most Frequent Element: A Custom Hive UDAF

This article walks through implementing a Hive UDAF (User Defined Aggregate Function) that returns the most frequently occurring value in a group, covering the implementation, parameter handling, and caveats.
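Before the implementation, here is a sketch of how the finished function is meant to be called; the table and column names below are made up for illustration:

```sql
-- Hypothetical table employees(dept, city): for each department,
-- return the city that occurs most often among its rows.
SELECT dept, mostOccrItem(city) AS top_city
FROM employees
GROUP BY dept;
```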

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

@Description(name = "mostOccrItem", value = "_FUNC_(x) - Returns a  object that occures most. "
		+ "CAUTION will easily cause Out Of Memmory Exception on large data sets")
/**
 * 
 * @author houzhizhen
 * create temporary function mostOccrItem as com.letv.bigdata.hive.udaf.MostOccuItem
 * 
   *
 
  public static enum Mode {
   
     * PARTIAL1: from original data to partial aggregation data: iterate() and
     * terminatePartial() will be called.
    
    PARTIAL1,
       
     * PARTIAL2: from partial aggregation data to partial aggregation data:
     * merge() and terminatePartial() will be called.
     
    PARTIAL2,
        
     * FINAL: from partial aggregation to full aggregation: merge() and
     * terminate() will be called.
   
    FINAL,
       
     * COMPLETE: from original data directly to full aggregation: iterate() and
     * terminate() will be called.
    
    COMPLETE
  };

 */
public class MostOccuItem extends AbstractGenericUDAFResolver {
	static final Log LOG = LogFactory
			.getLog(MostOccuItem.class.getName());

	public MostOccuItem() {
	}

	@Override
	public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
			throws SemanticException {
		if (parameters.length != 1) {
			throw new UDFArgumentTypeException(parameters.length - 1,
					"Exactly one argument is expected.");
		}
		if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
			throw new UDFArgumentTypeException(0,
					"Only primitive type arguments are accepted but "
							+ parameters[0].getTypeName()
							+ " was passed as parameter 1.");
		}
		return new MostOccuItemEvaluator();
	}

	public static class MostOccuItemEvaluator extends GenericUDAFEvaluator {

		// inspector for the partial map<string,int> received in PARTIAL2 and FINAL
		private StandardMapObjectInspector mapOI;

		@Override
		public ObjectInspector init(Mode m, ObjectInspector[] parameters)
				throws HiveException {
			super.init(m, parameters);
			if (m == Mode.PARTIAL1) {
				// map side: emit the partial counts as a map<string,int>
				return ObjectInspectorFactory.getStandardMapObjectInspector(
						PrimitiveObjectInspectorFactory.javaStringObjectInspector,
						PrimitiveObjectInspectorFactory.javaIntObjectInspector);
			} else if (m == Mode.PARTIAL2) {
				// combiner: input and output are both partial count maps
				mapOI = (StandardMapObjectInspector) parameters[0];
				return ObjectInspectorFactory.getStandardMapObjectInspector(
						PrimitiveObjectInspectorFactory.javaStringObjectInspector,
						PrimitiveObjectInspectorFactory.javaIntObjectInspector);
			} else if (m == Mode.FINAL) {
				// reducer: merge partial maps, emit the winning value as a string
				mapOI = (StandardMapObjectInspector) parameters[0];
				return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
			} else if (m == Mode.COMPLETE) {
				// no partial stage: iterate() then terminate() run in one step
				return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
			} else {
				throw new HiveException("Unknown aggregation mode: " + m);
			}
		}

		// per-group buffer: running value -> count map, one entry per distinct value
		static class MkArrayAggregationBuffer implements AggregationBuffer {
			Map<String, Integer> container;
		}

		@Override
		public void reset(AggregationBuffer agg) throws HiveException {
			((MkArrayAggregationBuffer) agg).container = new HashMap<String,Integer>();
		}

		@Override
		public AggregationBuffer getNewAggregationBuffer() throws HiveException {
			MkArrayAggregationBuffer ret = new MkArrayAggregationBuffer();
			reset(ret);
			return ret;
		}

		// map side: count one occurrence of each non-null input value
		@Override
		public void iterate(AggregationBuffer agg, Object[] parameters)
				throws HiveException {
			assert (parameters.length == 1);
			Object p = parameters[0];

			if (p != null) {
				MkArrayAggregationBuffer myagg = (MkArrayAggregationBuffer) agg;
				putIntoMap(p.toString(), myagg, 1);
			}
		}

		// map side: hand the shuffle a copy of the partial counts
		@Override
		public Object terminatePartial(AggregationBuffer agg)
				throws HiveException {
			MkArrayAggregationBuffer myagg = (MkArrayAggregationBuffer) agg;
			Map<String,Integer> ret = new HashMap<String,Integer>(
					myagg.container);
			
			return ret;
		}

		@Override
		public void merge(AggregationBuffer agg, Object partial)
				throws HiveException {
			if (partial == null) {
				return;
			}
			MkArrayAggregationBuffer myagg = (MkArrayAggregationBuffer) agg;
			Map<?, ?> partialResult = mapOI.getMap(partial);
			// fold each partial count into this buffer's running totals
			for (Map.Entry<?, ?> entry : partialResult.entrySet()) {
				putIntoMap(entry.getKey().toString(), myagg,
						Integer.valueOf(entry.getValue().toString()));
			}
		}

		@Override
		public String terminate(AggregationBuffer agg) throws HiveException {
			MkArrayAggregationBuffer myagg = (MkArrayAggregationBuffer) agg;
			int num = Integer.MIN_VALUE;
			String key = null;
			// linear scan for the entry with the highest count
			for (Map.Entry<String, Integer> entry : myagg.container.entrySet()) {
				if (num < entry.getValue()) {
					num = entry.getValue();
					key = entry.getKey();
				}
			}
			return key;
		}
		
		// add num occurrences of p to the running count map
		private void putIntoMap(String p, MkArrayAggregationBuffer myagg, int num) {
			Integer i = myagg.container.get(p);
			myagg.container.put(p, i == null ? num : i + num);
		}
	}
}
```
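To run the UDAF, the class has to be compiled against the Hive libraries, packaged into a jar, and registered in the session. A minimal sketch; the jar path is a placeholder, and the fully qualified class name is taken from the author's comment in the javadoc:

```sql
ADD JAR /path/to/most-occu-item.jar;
CREATE TEMPORARY FUNCTION mostOccrItem AS 'com.letv.bigdata.hive.udaf.MostOccuItem';
```

As the `@Description` warns, the aggregation buffer keeps one map entry per distinct input value, so a very high-cardinality column can exhaust the heap.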
### Customizing Hive: Related Material on the Touge (头歌) Platform

When looking for material on customizing Hive on the Touge platform, the following areas are worth attention:

#### 1. **Custom Hive functions (UDF/UDAF/UDTF)**

Hive lets users extend it with custom functions written in Java. They come in three kinds: User Defined Function (UDF), User Defined Aggregation Function (UDAF), and User Defined Table Generating Function (UDTF) [^1].

- A UDF takes a single row as input and returns a single output value.
- A UDAF performs aggregation, such as sums or averages.
- A UDTF turns one input row into multiple output rows (a sketch appears at the end of this section).

Here is an example of a simple UDF:

```java
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class LowerCase extends UDF {
    public Text evaluate(Text str) {
        if (str == null) {
            return null;
        }
        return new Text(str.toString().toLowerCase());
    }
}
```

Once compiled, package it into a jar and load it into the Hive environment [^2]:

```sql
ADD JAR /path/to/lowercase.jar;
CREATE TEMPORARY FUNCTION lower_case AS 'LowerCase';
SELECT lower_case(column_name) FROM table_name;
```

---

#### 2. **Customizing the Hive configuration**

Besides functional extensions, Hive's behavior can be customized by editing `hive-site.xml` or other environment settings. For example, `hive-env.sh` can set the JVM memory allocation or point to external dependencies [^2]:

```bash
export HADOOP_HEAPSIZE=2048
export HIVE_AUX_JARS_PATH=/custom/path/to/libs
```

Runtime parameters can be adjusted dynamically with the `SET` command:

```sql
SET hive.exec.dynamic.partition=true;  -- enable dynamic partitioning
SET mapreduce.job.reduces=10;          -- set the number of reduce tasks
```

---

#### 3. **Locating resources on the Touge platform**

For tutorials or documentation specific to the Touge platform:

- Log in to the platform and open the course management area;
- Use the built-in search with keywords such as "Hive custom functions" or "advanced Hive usage" to find related lab projects;
- If no ready-made material turns up, contact the course administrator and ask for the topic to be added.

In addition, some advanced features involve integration with distributed computing frameworks (such as Spark SQL), which also counts as customization in the broad sense [^1].

---
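The list above describes UDTFs without showing one, so here is a minimal sketch: a UDTF that splits a comma-separated string into one row per element. The class name `SplitToRows`, the output column name `item`, and the single-string-argument signature are illustrative assumptions, not something prescribed by the platform material:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class SplitToRows extends GenericUDTF {

	@Override
	public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
		// declare one output column named "item" of type string
		List<String> fieldNames = Arrays.asList("item");
		List<ObjectInspector> fieldOIs = Arrays.<ObjectInspector>asList(
				PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
	}

	@Override
	public void process(Object[] args) throws HiveException {
		if (args[0] == null) {
			return;
		}
		// emit one output row per comma-separated element
		for (String part : args[0].toString().split(",")) {
			forward(new Object[] { part });
		}
	}

	@Override
	public void close() throws HiveException {
		// nothing buffered, nothing to flush
	}
}
```

Registered the same way as the UDF above (`ADD JAR ...; CREATE TEMPORARY FUNCTION split_to_rows AS 'SplitToRows';`), it can then be called as, for example, `SELECT split_to_rows(tags) FROM some_table;`.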