UDF
自定义UDF
● 继承org.apache.hadoop.hive.ql.exec.UDF类
● 必须含有一个evaluate()方法,可以重载多个,但至少有一个evaluate方法。
● 我写的这个很简单:只要判断输入的数据不为空,就用 Java 的正则表达式,把双引号(")替换为空字符串。
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * Hive UDF that strips all double-quote characters from an input string.
 * Hive requires at least one evaluate() method; overloads are permitted.
 */
public class QuoteUDF extends UDF {

    /**
     * @param str input column value; Hive passes SQL NULL as a Java null
     * @return the input text with every '"' removed, or null when input is null
     */
    public Text evaluate(Text str) {
        if (str != null) {
            // replaceAll takes a regex; '"' is not a regex metacharacter,
            // so only the Java string escape \" is needed.
            return new Text(str.toString().replaceAll("\"", ""));
        }
        return null;
    }

    // Local smoke test only; never invoked by Hive.
    public static void main(String[] args) {
        System.out.println(new Text("\"" + "abfcd" + "\""));
        // Bug fix: original code instantiated LiangmanQuoteUDF, a class that
        // does not exist in this file — the class is named QuoteUDF.
        System.out.println(new QuoteUDF().evaluate(new Text("1111111111111")));
        System.out.println(new QuoteUDF().evaluate(new Text("\"" + "abfcd" + "\"")));
    }
}
在hive上创建数据库和表
-- 创建日志库并切换为当前数据库
CREATE DATABASE liangman_log;
USE liangman_log;
-- 原始访问日志表:字段以制表符分隔,文本格式存储。
-- 加 IF NOT EXISTS 保证脚本可重复执行,与下文 onedata 表的写法保持一致。
-- NOTE(review): 列名沿用源日志字段(含 camelCase),刻意不改为 snake_case,
-- 以免破坏后续依赖这些列名的查询。
CREATE TABLE IF NOT EXISTS yhd_source (
    id              string,
    url             string,
    referer         string,
    keyword         string,
    type            string,
    guid            string,
    pageId          string,
    moduleId        string,
    linkId          string,
    attachedInfo    string,
    sessionId       string,
    trackerU        string,
    trackerType     string,
    ip              string,
    trackerSrc      string,
    cookie          string,
    orderCode       string,
    trackTime       string,
    endUserId       string,
    firstLink       string,
    sessionViewNo   string,
    productId       string,
    curMerchantId   string,
    provinceId      string,
    cityId          string,
    fee             string,
    edmActivity     string,
    edmEmail        string,
    edmJobId        string,
    ieVersion       string,
    platform        string,
    internalKeyword string,
    resultSum       string,
    currentPage     string,
    linkPosition    string,
    buttonPosition  string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
导入数据(两个小时段的日志文件)
-- 从本地文件系统(LOCAL)导入两个小时段的日志文件。
-- 修复:原第一条语句在路径字面量与 into 之间缺少空格。
LOAD DATA LOCAL INPATH '/home/liangman/2015082818' INTO TABLE yhd_source;
LOAD DATA LOCAL INPATH '/home/liangman/2015082819' INTO TABLE yhd_source;
-- 测试数据是否导入成功(HiveQL 注释用 --,不是 //)
数据选择,只选择ID列
-- 数据选择:只抽取 id 一列,生成 onedata 表
CREATE TABLE IF NOT EXISTS onedata
AS
SELECT id
FROM yhd_source;
-- 查看导入了多少条数据(可用 select count(*) from onedata; 验证)
把编写的自定义类打成jar包,添加到hive中
-- 将打好的 jar 包加入当前 Hive 会话;路径必须是绝对路径。
-- 修复:原语句把中文说明直接写在路径后面,会导致语句解析失败;并补上分号。
ADD JAR /opt/quote.jar;
创建临时函数(as 后填 UDF 类的全限定类名)
-- 注册临时函数;'qualified name' 需替换为 QuoteUDF 的全限定类名。
CREATE TEMPORARY FUNCTION my_udf AS 'qualified name';
-- 查看自定义 UDF 的运行结果。
-- 修复:原查询调用了未注册的 my_lm(应为 my_udf),且 limit 与 10 之间缺空格。
SELECT my_udf(id) FROM onedata LIMIT 10;