1、UDF(时间格式转换)
// 1、定义一个类继承UDF,然后添加一个方法evaluate,这个方法的参数和返回值类型和函数的输入输出一致
// 2、把项目打包成jar,然后放到hive的classpath下,或者add jar
// 3、在hive里面新建一个function,然后指定到我们新建的类 create function MyDateParser as 'hadoop.Hive.UDF.MyDateParser'
// 4、使用方法:select MyDateParser(time) from apache_log limit 10 ;
/**
 * Hive UDF that reformats an Apache access-log timestamp.
 *
 * <p>A class extending {@code UDF} must expose an {@code evaluate} method whose
 * parameter count/types and return type match the arguments and result of the
 * Hive function.
 *
 * <p>Example input: {@code [29/April/2016:17:38:20 +0800]}<br>
 * Example output: {@code 2016-04-29 17:38:20} (rendered in the JVM's default
 * time zone, so this exact value assumes a +08:00 default zone).
 */
public class MyDateParser extends UDF {

    /** Layout of the log timestamp once the surrounding brackets are removed. */
    private static final String INPUT_PATTERN = "dd/MMMM/yyyy:HH:mm:ss Z";

    /** Layout of the string handed back to Hive. */
    private static final String OUTPUT_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Converts a bracketed access-log timestamp into {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param s the raw timestamp, with or without the surrounding {@code [ ]}
     * @return the reformatted timestamp; {@code null} when {@code s} is
     *         {@code null}; an empty string when {@code s} cannot be parsed
     */
    public String evaluate(String s) {
        // A NULL column value arrives as null; propagate it instead of throwing.
        if (s == null) {
            return null;
        }

        // replace() is a no-op when the character is absent, so no indexOf guard is needed.
        String cleaned = s.replace("[", "").replace("]", "").trim();

        // SimpleDateFormat is not thread-safe, so the formatters stay local to the call.
        SimpleDateFormat inputFormat = new SimpleDateFormat(INPUT_PATTERN, Locale.ENGLISH);
        SimpleDateFormat outputFormat = new SimpleDateFormat(OUTPUT_PATTERN);
        try {
            Date parsed = inputFormat.parse(cleaned);
            return outputFormat.format(parsed);
        } catch (ParseException e) {
            // Malformed rows must not fail the query: report the problem and return "".
            e.printStackTrace();
            return "";
        }
    }
}
2、UDTF(对Apache Access log 的url解析)
public class MyUdtf extends GenericUDTF {
@Override
public void process(Object[] args) throws HiveException {
String input = args[0].toString() ;
input = input.replace("\"", "");
String[] result = input.split(" ") ;
// 如果解析错误或者失败,则返回三个字段都是"--"
if(result.length!=3){
result[0] ="--" ;
result[1] ="--" ;
result[2] ="--" ;
}else{
// 用于打印输出
forward(result) ;
}
}
@Override
public void close() throws HiveException {
// 用于关闭资源
}
@Override
public StructObjectInspector initialize(ObjectInspector[] argOIs)
throws UDFArgumentException {
if(argOIs.length!=1){
throw new UDFArgumentException("参数不正确") ;
}
ArrayList fileName = new ArrayList() ;
ArrayList fileOis = new ArrayList() ;
// 添加返回字段设置
fileName.add("rcol1") ;
fileOis.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector) ;
fileName.add("rcol2") ;
fileOis.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector) ;
fileName.add("rcol3") ;
fileOis.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector) ;
// 将返回字段设置到UDTF的返回值类型中
return ObjectInspectorFactory.getStandardStructObjectInspector(fileName, fileOis) ;
}
}
fileName = new ArrayList() ;
ArrayList fileOis = new ArrayList() ;
// 添加返回字段设置
fileName.add("rcol1") ;
fileOis.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector) ;
fileName.add("rcol2") ;
fileOis.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector) ;
fileName.add("rcol3") ;
fileOis.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector) ;
// 将返回字段设置到UDTF的返回值类型中
return ObjectInspectorFactory.getStandardStructObjectInspector(fileName, fileOis) ;
}
}
3、UDAF(对流量进行聚合)
public class MyUDAF extends UDAF {
public static class MaxNumberUDAFEvaluator implements UDAFEvaluator {
private IntWritable result;
public void init() {
result = null;
}
// 聚合的多行的被聚合的值都会调用一次iterate方法,所以这个方法里面我们定义聚合规则
public boolean iterate(IntWritable value) {
if (value == null) {
return false;
}
if (result == null) {
result = new IntWritable(value.get());
} else {
// 需求是求出流量最大值,在这里进行流量值得比较,将最大值放入result
result.set(Math.max(result.get(), value.get()));
}
return true;
}
// hive需要部分聚合结果时会调用该方法,返回当前的result作为hive取部分聚合值得结果
public IntWritable terminatePartial() {
return result;
}
// 聚合值,新行未被处理的值会调用merge加入聚合,在这里直接调用上面定义的聚合规则方法iterate
public boolean merge(IntWritable other) {
return iterate(other);
}
// hive需要最终聚合结果是调用方法,返回聚合的最终个结果
public IntWritable terminate() {
return result;
}
}
}
附:部分数据格式
27.19.74.143 - - [29/April/2016:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
110.52.250.126 - - [29/April/2016:17:38:20 +0800] "GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1" 200 1292
27.19.74.143 - - [29/April/2016:17:38:20 +0800] "GET /static/image/common/hot_1.gif HTTP/1.1" 200 680
27.19.74.143 - - [29/April/2016:17:38:20 +0800] "GET /static/image/common/hot_2.gif HTTP/1.1" 200 682
27.19.74.143 - - [29/April/2016:17:38:20 +0800] "GET /static/image/filetype/common.gif HTTP/1.1" 200 90
110.52.250.126 - - [29/April/2016:17:38:20 +0800] "GET /source/plugin/wsh_wx/img/wsh_zk.css HTTP/1.1" 200 1482
110.52.250.126 - - [29/April/2016:17:38:20 +0800] "GET /data/cache/style_1_forum_index.css?y7a HTTP/1.1" 200 2331
110.52.250.126 - - [29/April/2016:17:38:20 +0800] "GET /source/plugin/wsh_wx/img/wx_jqr.gif HTTP/1.1" 200 1770
27.19.74.143 - - [29/April/2016:17:38:20 +0800] "GET /static/image/common/recommend_1.gif HTTP/1.1" 200 1028
110.52.250.126 - - [29/April/2016:17:38:20 +0800] "GET /static/image/common/logo.png HTTP/1.1" 200 4542