Hive UDTF Development Example

This article presents an implementation of a Hive UDTF (user-defined table-generating function) that computes a cycle count over a date range. The function takes a begin date and an end date as its two arguments and returns two columns, one holding the month and the other the cycle count, which is useful in big-data processing scenarios.

Case:
Given two arguments, a begin date and an end date (8-digit YYYYMMDD strings, of which only the YYYYMM part is used), output every month between the two dates together with a cycle count that starts at 0 and increases by 1 for every full 12 months elapsed (e.g., 201802 is 12 months after 201702, so the count rises from 0 to 1).

For example:
Input: UDT_10('20170201', '20321001')
Output:
+---------+--------+
|  col1   |  col2  |
+---------+--------+
| 201702  | 0      |
| 201703  | 0      |
| 201704  | 0      |
| 201705  | 0      |
| 201706  | 0      |
| 201707  | 0      |
| 201708  | 0      |
| 201709  | 0      |
| 201710  | 0      |
| 201711  | 0      |
| 201712  | 0      |
| 201801  | 0      |
| 201802  | 1      |
| 201803  | 1      |
| 201804  | 1      |
| 201805  | 1      |
| 201806  | 1      |
| 201807  | 1      |
| 201808  | 1      |
| 201809  | 1      |
| 201810  | 1      |
+---------+--------+
(remaining rows up to 203210 omitted)

```java
package io.transwarp.udtf;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class DateProcess extends GenericUDTF {

	@Override
	public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
		if (args.length < 2) {
			throw new UDFArgumentLengthException("DateProcess takes at least two arguments: a begin date and an end date");
		}
		// Declare the two output columns, col1 (month) and col2 (cycle count), both as strings.
		ArrayList<String> fieldNames = new ArrayList<String>();
		ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
		fieldNames.add("col1");
		fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		fieldNames.add("col2");
		fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
	}

	@Override
	public void process(Object[] text) throws HiveException {
		String beginTime = text[0].toString();
		String endTime = text[1].toString();
		// Any arguments beyond the two dates are passed through as leading output columns.
		List<String> linepre = new ArrayList<String>();
		for (int i = 2; i < text.length; i++) {
			linepre.add(text[i].toString());
		}

		// Each entry looks like "201702 0": the month and its cycle count separated by a space.
		List<String> allMonthList = getMonthBetween(beginTime, endTime);
		List<String> row = new ArrayList<String>();
		row.addAll(linepre);
		for (String line : allMonthList) {
			String[] str = line.split(" ");
			row.add(str[0]); // e.g. 201702
			row.add(str[1]); // e.g. 0
			forward(row.toArray());
			// Drop the two columns just emitted so the next iteration starts from the pass-through prefix.
			row.remove(row.size() - 1);
			row.remove(row.size() - 1);
		}
	}

	@Override
	public void close() throws HiveException {
		// Nothing to clean up: every row is forwarded in process().
	}

	public static void main(String[] args) {
		// Local smoke test of the month/cycle enumeration, outside of Hive.
		System.out.println(getMonthBetween("20170201", "20321001"));
		System.out.println(getMonthBetween("20170201", "20321001").size());
	}

	/**
	 * Returns every month between startDate and endDate (YYYYMMDD, both inclusive)
	 * as "YYYYMM n", where n increases by 1 for every full 12 months since the start.
	 */
	public static List<String> getMonthBetween(String startDate, String endDate) {
		int startYear = Integer.parseInt(startDate.substring(0, 4));
		int startMonth = Integer.parseInt(startDate.substring(4, 6));
		int endYear = Integer.parseInt(endDate.substring(0, 4));
		int endMonth = Integer.parseInt(endDate.substring(4, 6));
		int k = -1;
		List<String> list = new ArrayList<String>();
		while (endYear > startYear || (endYear == startYear && endMonth >= startMonth)) {
			k++;
			int outNum = k / 12; // cycle count: +1 for every 12 months elapsed
			if (startMonth < 10) {
				list.add(startYear + "0" + startMonth + " " + outNum); // zero-pad single-digit months
			} else {
				list.add(startYear + "" + startMonth + " " + outNum);
			}
			// Advance to the next month.
			if (startMonth == 12) {
				startYear++;
				startMonth = 1;
			} else {
				startMonth++;
			}
		}

		return list;
	}

}
```
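The hand-rolled year/month arithmetic above can also be expressed with java.time.YearMonth. The following is a minimal sketch of that alternative (the class and method names are made up for illustration; it is not part of the UDTF):

```java
import java.time.YearMonth;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.List;

public class YearMonthSketch {

	// Equivalent month enumeration using java.time, assuming YYYYMMDD inputs.
	static List<String> monthsBetween(String start, String end) {
		YearMonth from = YearMonth.of(Integer.parseInt(start.substring(0, 4)), Integer.parseInt(start.substring(4, 6)));
		YearMonth to = YearMonth.of(Integer.parseInt(end.substring(0, 4)), Integer.parseInt(end.substring(4, 6)));
		List<String> out = new ArrayList<String>();
		for (YearMonth m = from; !m.isAfter(to); m = m.plusMonths(1)) {
			long elapsed = ChronoUnit.MONTHS.between(from, m); // whole months since the start
			out.add(String.format("%04d%02d %d", m.getYear(), m.getMonthValue(), elapsed / 12));
		}
		return out;
	}

	public static void main(String[] args) {
		System.out.println(monthsBetween("20170201", "20180301")); // [201702 0, ..., 201802 1, 201803 1]
	}
}
```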

Register the packaged JAR as a permanent function and call it:

```sql
create permanent function UDTF as 'io.transwarp.udtf.DateProcess' using jar 'hdfs://nameservice1/tmp/testjar/udtf_date_process.jar';
select UDTF('20170201','20321001') from system.dual;
```
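Beyond the constant-argument call above, such a UDTF is typically applied per row with LATERAL VIEW. A minimal sketch, assuming a hypothetical table contracts with contract_id, begin_date and end_date columns:

```sql
-- contracts, contract_id, begin_date and end_date are hypothetical names used for illustration.
SELECT c.contract_id, t.col1 AS period_month, t.col2 AS cycle_no
FROM contracts c
LATERAL VIEW UDTF(c.begin_date, c.end_date) t AS col1, col2;
```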


A Hive UDTF (User-Defined Table-Generating Function) is a custom function that produces table-shaped output: zero or more rows per call. The basic steps for writing one are:

1. Extend the GenericUDTF base class (org.apache.hadoop.hive.ql.udf.generic.GenericUDTF).
2. Implement the initialize(), process() and close() methods.
3. Generate the output rows in process() and emit them to Hive with forward().
4. Declare the output column names and types in initialize(); optional metadata such as determinism and a help text can be attached with the @UDFType and @Description annotations.
5. Package the UDTF into a JAR file and add it to Hive's CLASSPATH.
6. Create the function in Hive and use it.

Below is a sample UDTF that splits an input string into words and emits one row per word:

```java
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class SplitUDTF extends GenericUDTF {

	@Override
	public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
		if (args.length != 1) {
			throw new UDFArgumentException("SplitUDTF takes exactly one argument");
		}
		if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE
				|| !args[0].getTypeName().equals("string")) {
			throw new UDFArgumentException("SplitUDTF takes a string as its argument");
		}
		// One output column named "word", of type string.
		List<String> fieldNames = new ArrayList<String>();
		List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
		fieldNames.add("word");
		fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
	}

	@Override
	public void process(Object[] args) throws HiveException {
		String input = args[0].toString();
		for (String word : input.split("\\s+")) {
			forward(new Object[] { word }); // one output row per word
		}
	}

	@Override
	public void close() throws HiveException {
		// nothing to clean up
	}
}
```

Optional metadata is declared with the @UDFType and @Description annotations:

```java
@UDFType(deterministic = true)
@Description(name = "split_words",
		value = "_FUNC_(str) - splits a string into words, one row per word",
		extended = "Example: SELECT split_words('hello world');")
public class SplitUDTF extends GenericUDTF { ... }
```

Create the function in Hive and use it (the name split_words avoids shadowing the built-in split function):

```sql
ADD JAR /path/to/split-udtf.jar;
CREATE TEMPORARY FUNCTION split_words AS 'SplitUDTF';
SELECT split_words('hello world');
```
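Once the function is registered, the help text supplied through @Description is what Hive surfaces when the function is described (split_words follows the annotation sketch above):

```sql
DESCRIBE FUNCTION EXTENDED split_words;
```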