Create the Hive table
drop table if exists testUDF;
create table testUDF(
    id string,
    field1 string,
    field2 string,
    name string,
    rsrp string,
    address string,
    comm1 string,
    comm2 string,
    comm3 string
)
row format delimited fields terminated by ',';
Prepare the test data
1000,01,01,jack1,1331,shanghai1,123,456,789
1000,01,01,jack2,1332,shanghai2,123,456,789
1000,01,01,jack3,1333,shanghai3,001,002,003
1000,01,01,jack4,-1334,shanghai4,004,005,006
1001,01,01,wade1,1221,beijing1,123,456,789
1001,01,01,wade2,1222,beijing2,123,456,789
1001,01,01,wade3,-1223,beijing3,123,456,789
1001,01,01,wade4,-1224,beijing4,123,456,789
Load the data and verify
hadoop fs -put testUdf.dat /user/hive/warehouse/testudf
select * from testudf;
Run the column-to-row aggregation; note that collect_set returns an array of strings:
SELECT max(id), concat_ws(',',collect_set(concat_ws(',',name,rsrp,address))) as value1 from testudf group by concat(id,comm1,comm2,comm3);
Custom UDF
Use bubble sort to order the array produced by the collect_set column-to-row step in descending order by one of its fields, then return the sorted array.
For example, given:
String[] arr1 = {"n_cgi_1,-2,shanghai1","n_cgi_2,-1,shanghai1","n_cgi_3,1,shanghai1","n_cgi_4,3,shanghai1","n_cgi_5,5,shanghai1"};
sort by the second field of each element (-2, -1, 1, 3, 5 in the sample) and return the new array; in descending order the result starts with "n_cgi_5,5,shanghai1" and ends with "n_cgi_1,-2,shanghai1".
package cn.com.dtmobile.hiveUdf;

import java.util.ArrayList;
import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.UDF;

public class MyUDF extends UDF {

    public ArrayList<String> evaluate(ArrayList<String> arraylist) {
        // Guard against null or empty input. Note ||: the original
        // non-short-circuit & would call size() on a null list.
        if (null == arraylist || 0 == arraylist.size()) {
            return null;
        }
        // Copy via Object[] and toString(): at runtime the list elements are
        // not guaranteed to be java.lang.String instances, so avoid a cast.
        Object[] arr = arraylist.toArray();
        String[] arr2 = new String[arr.length];
        for (int i = 0; i < arr.length; i++) {
            arr2[i] = arr[i].toString();
        }
        // Bubble sort, descending by the second comma-separated field.
        int i, j;
        String temp;
        for (i = 0; i < arr2.length; i++) {
            for (j = 0; j < arr2.length - 1 - i; j++) {
                if (Integer.valueOf(arr2[j].split(",")[1]) < Integer.valueOf(arr2[j + 1].split(",")[1])) {
                    temp = arr2[j];
                    arr2[j] = arr2[j + 1];
                    arr2[j + 1] = temp;
                }
            }
        }
        return new ArrayList<String>(Arrays.asList(arr2));
    }
}
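For a quick local sanity check of the sort logic, the class can be exercised on a plain JVM (a minimal sketch; MyUDFTest is a hypothetical driver, not part of the jar, and assumes hive-exec is on the classpath for the UDF base class):

package cn.com.dtmobile.hiveUdf;

import java.util.ArrayList;
import java.util.Arrays;

// Hypothetical test driver: feeds the sample array from above through
// evaluate() and prints the rows in sorted order.
public class MyUDFTest {
    public static void main(String[] args) {
        ArrayList<String> input = new ArrayList<String>(Arrays.asList(
                "n_cgi_1,-2,shanghai1", "n_cgi_2,-1,shanghai1",
                "n_cgi_3,1,shanghai1", "n_cgi_4,3,shanghai1",
                "n_cgi_5,5,shanghai1"));
        // Expected order, descending by the second field: 5, 3, 1, -1, -2.
        for (String row : new MyUDF().evaluate(input)) {
            System.out.println(row);
        }
    }
}

Collections.sort with a custom Comparator would be a more idiomatic alternative to the hand-written bubble sort.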
Build the jar and register the Hive UDF
hadoop fs -put myudf.jar /user/kong/data
hive> add jar hdfs://192.168.33.104:8020/user/kong/data/myudf.jar;
The fully qualified class name is required here:
hive> create temporary function based_rsrp_sort as 'cn.com.dtmobile.hiveUdf.MyUDF';
show functions;
Since this is only for testing, a temporary function is created; it is valid only for the current session and is gone once the session is closed. (A permanent function could instead be registered with CREATE FUNCTION ... AS ... USING JAR.)
Verification
SELECT max(id), based_rsrp_sort(collect_set(concat_ws(',',name,rsrp,address))) as value1 from testudf group by concat(id,comm1,comm2,comm3);
The first query returns the sorted array itself; wrapping the result in concat_ws flattens it back into a single comma-separated string:
SELECT max(id), concat_ws(',',based_rsrp_sort(collect_set(concat_ws(',',name,rsrp,address)))) as value1 from testudf group by concat(id,comm1,comm2,comm3);
Points to note
Although collect_set logically yields a String[], the evaluate parameter cannot be declared as String[] here; it must be ArrayList<String>, otherwise Hive cannot resolve the method and fails with:
hive (default)> SELECT max(id), based_rsrp_sort(collect_set(concat_ws(',',name,rsrp,address)))as value1 from testudf group by concat(id,comm1,comm2,comm3);
FAILED: SemanticException [Error 10014]: Line 1:16 Wrong arguments 'address':
No matching method for class cn.com.dtmobile.hiveUdf.MyUDF with (array<string>). Possible choices: _FUNC_(struct<>)
(A UDF class may define several evaluate methods, i.e. overloads; see the sketch after the tests below.)
The error means that no evaluate method accepting an array<string> argument was found in the custom UDF.
To verify this interpretation, the following tests were run:
hive (default)> SELECT based_rsrp_sort('1','2',100);
FAILED: SemanticException [Error 10014]: Line 1:7 Wrong arguments '100':
No matching method for class cn.com.dtmobile.hiveUdf.MyUDF with (string, string, int). Possible choices: _FUNC_(struct<>)
hive (default)> SELECT based_rsrp_sort('1','2',100,0.8);
FAILED: SemanticException [Error 10014]: Line 1:7 Wrong arguments '0.8':
No matching method for class cn.com.dtmobile.hiveUdf.MyUDF with (string, string, int, double). Possible choices: _FUNC_(struct<>)
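Consistent with these errors, Hive resolves a call by matching the argument types against the declared evaluate overloads. A minimal sketch of what such overloads could look like (MyOverloadedUDF is hypothetical, for illustration only; it is not in the jar registered above):

package cn.com.dtmobile.hiveUdf;

import org.apache.hadoop.hive.ql.exec.UDF;

// Illustration only: with these overloads present, the two failing calls
// above would each find a matching evaluate method.
public class MyOverloadedUDF extends UDF {

    // Would match based_rsrp_sort('1','2',100)
    public String evaluate(String a, String b, int c) {
        return a + "," + b + "," + c;
    }

    // Would match based_rsrp_sort('1','2',100,0.8)
    public String evaluate(String a, String b, int c, double d) {
        return a + "," + b + "," + c + "," + d;
    }
}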