在MR的时候经常会遇到多数据源join的问题,如果简单的分析任务采用hive处理就好,如果复杂一点需要自己写MR。
多数据源采用MultipleInputs类的addInputPath方法添加。
Job类
public class EfcOrderProRangeOdJob extends Configured implements Tool {
//TODO 路径
private final static String INTPUT_A = "D:/order/order/";
private final static String INTPUT_B = "D:/order/address/";
private final static String OUTPUT = "D:/testAAAAA/";
// private final static String OUTPUT = "/warehouse/tmp/pt_eft_order_pro_range/";
private final static String OUTPUT_TABLE = "fct_pt_icr_trade_day";
public static void main(String[] args) {
try {
int res = ToolRunner.run(new Configuration(), new EfcOrderProRangeOdJob(), args);
System.exit(res);
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
public int run(String[] args) throws Exception {
try {
String start = "20130217"; //TODO
Configuration conf = ConfUtil.getConf(getConf());
conf.set("start", start);
Job job1 = Job.getInstance(conf, "pt_eft_order_pro_range_first");
Path pathOrder = new Path(INTPUT_A);
Path pathAddress = new Path(INTPUT_B);
Path output = new Path(OUTPUT + start + "/");
FileSystem fs = FileSystem.get(conf);
if(fs.exists(output)){
fs.delete(output,true);
}
job1.setMapOutputKeyClass(TextPair.class);
job1.setMapOutputValueClass(Text.class);
FileOutputFormat.setOutputPath(job1, output);
MultipleInputs.addInputPath(job1, pathOrder, TextInputFormat.class, EfcOrderProRangeOrderMapper.class);
MultipleInputs.addInputPath(job1, pathAddress, TextInputFormat.class, EfcOrderProRangeAddressMapper.class);
job1.setReducerClass(EfcOrderProRangeReducer.class);
job1.setJarByClass(EfcOrderProRangeOdJob.class);
Job job2 = Job.getInstance(conf,"pt_eft_order_pro_range_second");
FileInputFormat.setInputPaths(job2, output);
job2.setMapperClass(EfcOrderProRangeSecondMapper.class);
job2.setMapOutputKeyClass(Text.class);
job2.setMapOutputValueClass(IntWritable.class);
TableMapReduceUtil.initTableReducerJob(OUTPUT_TABLE, EfcOrderProRangeSecondReducer.class, job2);
return JobChainHandler.handleJobChain(job1, job2, "pt_eft_order_pro_range");
} catch (Exception e) {
e.printStackTrace();
return 0;
}
}
public static class TextPair implements WritableComparable<TextPair> {
private Text first;
private Text second;
public TextPair() {
set(new Text(), new Text());
}
public TextPair(String first, String second) {
set(new Text(first), new Text(second));
}
public TextPair(Text first, Text second) {
set(first, second);
}
public void set(Text first, Text second) {
this.first = first;
this.second = second;
}
public Text getFirst() {
return first;
}
public Text getSecond() {
return second;
}
public void write(DataOutput out) throws IOException {
first.write(out);
second.write(out);
}
public void readFields(DataInput in) throws IOException {
first.readFields(in);
second.readFields(in);
}
public int compareTo(TextPair tp) {
return first.compareTo(tp.first);
}
}
}
mapper1类
public class EfcOrderProRangeOrderMapper extends Mapper<LongWritable, Text, TextPair, Text>{
private static final int ORDER_ID_INDEX = 2;
private static final int ORDER_STATUS_INDEX = 5;
private static final String EFFECTIVE_STATUS = "3";
private static final String COL_SPLITER = "\001";
@Override
public void map(LongWritable key, Text value, Context context) {
try {
String [] order = value.toString().split(COL_SPLITER);
String orderId = order[ORDER_ID_INDEX];
String status = order[ORDER_STATUS_INDEX];
if(!EFFECTIVE_STATUS.equals(status)){
return;
}
TextPair textPair = new TextPair(new Text(orderId),new Text("order"));
context.write(textPair, new Text(status));
} catch (Exception e) {
e.printStackTrace();
}
}
}
mapper2类
public class EfcOrderProRangeAddressMapper extends Mapper<LongWritable, Text, TextPair, Text>{
//TODO 通过hivemeta去取index
private static final int ORDER_ID_INDEX = 0;
private static final int PROVINCE_ID_INDEX = 1;
private static final String COL_SPLITER = "\001";
@Override
public void map(LongWritable key, Text value, Context context) {
try {
String [] address = value.toString().split(COL_SPLITER);
String orderId = address[ORDER_ID_INDEX];
String province = address[PROVINCE_ID_INDEX];
TextPair textPair = new TextPair(new Text(orderId),new Text("address"));
context.write(textPair, new Text(province));
} catch (Exception e) {
e.printStackTrace();
}
}
}
reducer端做join操作,通过TextPair中的second来获取来源,取得需要取得的维度。
public class EfcOrderProRangeReducer extends Reducer<TextPair,Text,Text,Text>{
private static final String COL_SPLITER = "\001";
@Override
protected void reduce(TextPair key, Iterable<Text> values, Context context) {
try {
Text tag = key.getSecond();
Text orderId = key.getFirst();
String status = null;String province = null;
StringBuilder out = new StringBuilder();
for (Text value : values) {
if(tag.toString().equals("order")){
status = value.toString();
}
if(tag.toString().equals("address")){
province = value.toString();
}
}
if (province != null && status != null){
out.append(orderId.toString()).append(COL_SPLITER).append(status).append(COL_SPLITER).append(province);
context.write(null, new Text(out.toString()));
}
} catch (Exception e) {
e.printStackTrace();
}
}
}