Hive UDTF Development Example

This article presents an implementation of a Hive UDTF (user-defined table-generating function) that computes a cycle count over a date range. The function takes a begin date and an end date as its two arguments and returns two columns, one holding the month and the other the cycle count, which is useful in big-data processing scenarios.

Case:
Given two arguments, a begin date and an end date (8-digit YYYYMMDD strings, of which only the YYYYMM part is used), output every month between the two dates together with a cycle count that starts at 0 and increases by 1 for every full 12 months elapsed (e.g., 201802 is 12 months after 201702, so the count rises from 0 to 1).

For example:
Input: UDT_10('20170201', '20321001')
Output:
+---------+--------+
|  col1   |  col2  |
+---------+--------+
| 201702  | 0      |
| 201703  | 0      |
| 201704  | 0      |
| 201705  | 0      |
| 201706  | 0      |
| 201707  | 0      |
| 201708  | 0      |
| 201709  | 0      |
| 201710  | 0      |
| 201711  | 0      |
| 201712  | 0      |
| 201801  | 0      |
| 201802  | 1      |
| 201803  | 1      |
| 201804  | 1      |
| 201805  | 1      |
| 201806  | 1      |
| 201807  | 1      |
| 201808  | 1      |
| 201809  | 1      |
| 201810  | 1      |
+---------+--------+
(remaining rows up to 203210 omitted)

```java
package io.transwarp.udtf;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class DateProcess extends GenericUDTF {

	@Override
	public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
		if (args.length < 2) {
			throw new UDFArgumentLengthException("DateProcess takes at least two arguments: a begin date and an end date");
		}
		// Declare the two output columns, col1 (month) and col2 (cycle count), both as strings.
		ArrayList<String> fieldNames = new ArrayList<String>();
		ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
		fieldNames.add("col1");
		fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		fieldNames.add("col2");
		fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
	}

	@Override
	public void process(Object[] text) throws HiveException {
		String beginTime = text[0].toString();
		String endTime = text[1].toString();
		// Any arguments beyond the two dates are passed through as leading output columns.
		List<String> linepre = new ArrayList<String>();
		for (int i = 2; i < text.length; i++) {
			linepre.add(text[i].toString());
		}

		// Each entry looks like "201702 0": the month and its cycle count separated by a space.
		List<String> allMonthList = getMonthBetween(beginTime, endTime);
		List<String> row = new ArrayList<String>();
		row.addAll(linepre);
		for (String line : allMonthList) {
			String[] str = line.split(" ");
			row.add(str[0]); // e.g. 201702
			row.add(str[1]); // e.g. 0
			forward(row.toArray());
			// Drop the two columns just emitted so the next iteration starts from the pass-through prefix.
			row.remove(row.size() - 1);
			row.remove(row.size() - 1);
		}
	}

	@Override
	public void close() throws HiveException {
		// Nothing to clean up: every row is forwarded in process().
	}

	public static void main(String[] args) {
		// Local smoke test of the month/cycle enumeration, outside of Hive.
		System.out.println(getMonthBetween("20170201", "20321001"));
		System.out.println(getMonthBetween("20170201", "20321001").size());
	}

	/**
	 * Returns every month between startDate and endDate (YYYYMMDD, both inclusive)
	 * as "YYYYMM n", where n increases by 1 for every full 12 months since the start.
	 */
	public static List<String> getMonthBetween(String startDate, String endDate) {
		int startYear = Integer.parseInt(startDate.substring(0, 4));
		int startMonth = Integer.parseInt(startDate.substring(4, 6));
		int endYear = Integer.parseInt(endDate.substring(0, 4));
		int endMonth = Integer.parseInt(endDate.substring(4, 6));
		int k = -1;
		List<String> list = new ArrayList<String>();
		while (endYear > startYear || (endYear == startYear && endMonth >= startMonth)) {
			k++;
			int outNum = k / 12; // cycle count: +1 for every 12 months elapsed
			if (startMonth < 10) {
				list.add(startYear + "0" + startMonth + " " + outNum); // zero-pad single-digit months
			} else {
				list.add(startYear + "" + startMonth + " " + outNum);
			}
			// Advance to the next month.
			if (startMonth == 12) {
				startYear++;
				startMonth = 1;
			} else {
				startMonth++;
			}
		}

		return list;
	}

}
```
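The hand-rolled year/month arithmetic above can also be expressed with java.time.YearMonth. The following is a minimal sketch of that alternative (the class and method names are made up for illustration; it is not part of the UDTF):

```java
import java.time.YearMonth;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.List;

public class YearMonthSketch {

	// Equivalent month enumeration using java.time, assuming YYYYMMDD inputs.
	static List<String> monthsBetween(String start, String end) {
		YearMonth from = YearMonth.of(Integer.parseInt(start.substring(0, 4)), Integer.parseInt(start.substring(4, 6)));
		YearMonth to = YearMonth.of(Integer.parseInt(end.substring(0, 4)), Integer.parseInt(end.substring(4, 6)));
		List<String> out = new ArrayList<String>();
		for (YearMonth m = from; !m.isAfter(to); m = m.plusMonths(1)) {
			long elapsed = ChronoUnit.MONTHS.between(from, m); // whole months since the start
			out.add(String.format("%04d%02d %d", m.getYear(), m.getMonthValue(), elapsed / 12));
		}
		return out;
	}

	public static void main(String[] args) {
		System.out.println(monthsBetween("20170201", "20180301")); // [201702 0, ..., 201802 1, 201803 1]
	}
}
```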

Register the packaged JAR as a permanent function and call it:

```sql
create permanent function UDTF as 'io.transwarp.udtf.DateProcess' using jar 'hdfs://nameservice1/tmp/testjar/udtf_date_process.jar';
select UDTF('20170201','20321001') from system.dual;
```
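Beyond the constant-argument call above, such a UDTF is typically applied per row with LATERAL VIEW. A minimal sketch, assuming a hypothetical table contracts with contract_id, begin_date and end_date columns:

```sql
-- contracts, contract_id, begin_date and end_date are hypothetical names used for illustration.
SELECT c.contract_id, t.col1 AS period_month, t.col2 AS cycle_no
FROM contracts c
LATERAL VIEW UDTF(c.begin_date, c.end_date) t AS col1, col2;
```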


A Hive UDTF (User-Defined Table-Generating Function) is a custom function that produces table-shaped output: zero or more rows per call. The basic steps for writing one are:

1. Extend the GenericUDTF base class (org.apache.hadoop.hive.ql.udf.generic.GenericUDTF).
2. Implement the initialize(), process() and close() methods.
3. Generate the output rows in process() and emit them to Hive with forward().
4. Declare the output column names and types in initialize(); optional metadata such as determinism and a help text can be attached with the @UDFType and @Description annotations.
5. Package the UDTF into a JAR file and add it to Hive's CLASSPATH.
6. Create the function in Hive and use it.

Below is a sample UDTF that splits an input string into words and emits one row per word:

```java
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class SplitUDTF extends GenericUDTF {

	@Override
	public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
		if (args.length != 1) {
			throw new UDFArgumentException("SplitUDTF takes exactly one argument");
		}
		if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE
				|| !args[0].getTypeName().equals("string")) {
			throw new UDFArgumentException("SplitUDTF takes a string as its argument");
		}
		// One output column named "word", of type string.
		List<String> fieldNames = new ArrayList<String>();
		List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
		fieldNames.add("word");
		fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
	}

	@Override
	public void process(Object[] args) throws HiveException {
		String input = args[0].toString();
		for (String word : input.split("\\s+")) {
			forward(new Object[] { word }); // one output row per word
		}
	}

	@Override
	public void close() throws HiveException {
		// nothing to clean up
	}
}
```

Optional metadata is declared with the @UDFType and @Description annotations:

```java
@UDFType(deterministic = true)
@Description(name = "split_words",
		value = "_FUNC_(str) - splits a string into words, one row per word",
		extended = "Example: SELECT split_words('hello world');")
public class SplitUDTF extends GenericUDTF { ... }
```

Create the function in Hive and use it (the name split_words avoids shadowing the built-in split function):

```sql
ADD JAR /path/to/split-udtf.jar;
CREATE TEMPORARY FUNCTION split_words AS 'SplitUDTF';
SELECT split_words('hello world');
```
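Once the function is registered, the help text supplied through @Description is what Hive surfaces when the function is described (split_words follows the annotation sketch above):

```sql
DESCRIBE FUNCTION EXTENDED split_words;
```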