UDF 类:
package com.test.film;
import org.apache.hadoop.hive.ql.exec.UDF;
/*
* 功能:获取电影新闻
*/
public class GetFilmNews extends UDF {
public GetFilmNews() {
}
public String evaluate(String id, String name, String title, String author,
String publish_time_string, String release_date,
String release_status, String url, String summary, String body) {
StringBuilder line_result_sb = new StringBuilder();
// url
if(null == url || url.trim().equals("") || url.equals("-")){
return "";
}
// title
if(null == title || title.trim().equals("") || title.equals("-")){
return "";
}
title = title.trim();
title = title.replaceAll("·", "·");
title = title.replaceAll(""", "\"");
// summary
if(null == summary || summary.trim().equals("")){
summary = "";
}
summary = summary.trim();
summary = summary.replaceAll("\n|\r", "").replaceAll("·", "·").trim();
// body
if(null == body || body.trim().equals("")){
body = "";
}
body = body.trim();
body = body.replaceAll("\n|\r", "").replaceAll("·", "·").trim();
if(summary.equals("") && body.equals("")){
return "";
}
// 如果body为空,summary不为空,则用summary作为body
if(body.equals("") && !summary.equals("") ){
body = summary;
}
if(body.equals("") || body.equals("-")){
return "";
}
line_result_sb.append(id);
line_result_sb.append("\t");
line_result_sb.append(name);
line_result_sb.append("\t");
line_result_sb.append(title);
line_result_sb.append("\t");
line_result_sb.append(body);
return line_result_sb.toString();
}
}
Hive SQL:
# Jar that is added to the Hive session before the UDF is registered.
UDF_JAR="/bigdata/Jar_GetDataFromHive/getdatafromhive-0.0.1-SNAPSHOT.jar"
# File that receives the query output, relative to the current directory.
OUTPUT_FILE="./filmnews.txt"

# Statements handed to hive: register getFilmNews, set the job options, then
# run the UDF over every web_data row joined to movie_info on id, keeping
# rows with media_type != -1 and release_status != 2.
# Only the jar path is substituted by the shell; the rest is passed verbatim.
HIVE_STATEMENTS="add jars $UDF_JAR;
create temporary function getFilmNews as 'com.test.film.GetFilmNews';
set mapred.reduce.tasks=100;
set hive.map.aggr=true;
set mapred.job.priority=NORMAL;
use dmm;
select getFilmNews(t.id, t.name, t.title, t.author, t.publish_time_string, t.release_date, t.release_status, t.url, t.summary, t.body)
from
(select e.id as id, b.name as name, e.title as title, e.author as author, e.publish_time_string as publish_time_string, b.release_date as release_date, b.release_status as release_status, e.url as url, e.summary as summary, e.body as body
from
dmm.web_data e join dmm.movie_info b on (e.id=b.id) where e.media_type !=-1 and b.release_status != 2) t;
"

# Run the statements and capture everything hive prints on stdout.
hive -e "$HIVE_STATEMENTS" > "$OUTPUT_FILE"