1. Chaining MapReduce jobs in sequence
This is similar to a pipeline of Unix commands:
mapreduce-1 | mapreduce-2 | mapreduce-3 ...
A separate job is created for each stage, and its input path is set to the previous stage's output path. After the final stage completes, the intermediate data generated along the chain is deleted.
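A minimal sketch of this pattern with the org.apache.hadoop.mapreduce API, assuming a hypothetical SequentialChainDriver class and hypothetical Step1Mapper/Step1Reducer and Step2Mapper/Step2Reducer classes; the only points it illustrates are that stage 2's input path is stage 1's output path, and that the intermediate directory is removed at the end:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SequentialChainDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path in  = new Path(args[0]);
        Path tmp = new Path(args[1] + "_tmp");   // intermediate output of stage 1
        Path out = new Path(args[1]);
        // Stage 1 (mapreduce-1): reads the original input, writes to the intermediate path
        Job job1 = Job.getInstance(conf, "mapreduce-1");
        job1.setJarByClass(SequentialChainDriver.class);
        job1.setMapperClass(Step1Mapper.class);    // hypothetical mapper
        job1.setReducerClass(Step1Reducer.class);  // hypothetical reducer
        FileInputFormat.addInputPath(job1, in);
        FileOutputFormat.setOutputPath(job1, tmp);
        if (!job1.waitForCompletion(true)) System.exit(1);
        // Stage 2 (mapreduce-2): its input path is stage 1's output path
        Job job2 = Job.getInstance(conf, "mapreduce-2");
        job2.setJarByClass(SequentialChainDriver.class);
        job2.setMapperClass(Step2Mapper.class);    // hypothetical mapper
        job2.setReducerClass(Step2Reducer.class);  // hypothetical reducer
        FileInputFormat.addInputPath(job2, tmp);
        FileOutputFormat.setOutputPath(job2, out);
        int rc = job2.waitForCompletion(true) ? 0 : 1;
        // Delete the intermediate data generated along the chain
        tmp.getFileSystem(conf).delete(tmp, true);
        System.exit(rc);
    }
}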
2. Chaining MapReduce jobs with complex dependencies
Suppose mapreduce-1 processes one data set, mapreduce-2 processes another data set, and mapreduce-3 performs an inner join over the outputs of the first two.
In this situation, dependencies between non-linear jobs are managed with the Job and JobControl classes: x.addDependingJob(y) means that x will not start until y has completed.
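A minimal sketch of this pattern, assuming job1, job2 and job3 are already fully configured Job instances for mapreduce-1/2/3; in the org.apache.hadoop.mapreduce API the dependency wrapper is ControlledJob, which plays the role of the Job class mentioned above:
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
public class DependentJobsDriver {
    // job1/job2/job3 are assumed to be fully configured Job instances for
    // mapreduce-1, mapreduce-2 and mapreduce-3 (paths, mappers, reducers already set).
    public static int runChain(Job job1, Job job2, Job job3) throws Exception {
        ControlledJob cJob1 = new ControlledJob(job1, null);
        ControlledJob cJob2 = new ControlledJob(job2, null);
        ControlledJob cJob3 = new ControlledJob(job3, null);
        // mapreduce-3 joins the outputs of mapreduce-1 and mapreduce-2,
        // so it must not start until both of them have finished.
        cJob3.addDependingJob(cJob1);
        cJob3.addDependingJob(cJob2);
        JobControl control = new JobControl("join-chain");
        control.addJob(cJob1);
        control.addJob(cJob2);
        control.addJob(cJob3);
        // JobControl implements Runnable: run it in a background thread
        // and poll until every job in the group has finished.
        Thread t = new Thread(control);
        t.setDaemon(true);
        t.start();
        while (!control.allFinished()) {
            Thread.sleep(1000);
        }
        control.stop();
        return control.getFailedJobList().isEmpty() ? 0 : 1;
    }
}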
3. Chaining preprocessing and postprocessing steps
Preprocessing and postprocessing steps are usually written as Mapper tasks. They can either be chained by hand or chained with the ChainMapper and ChainReducer classes; the resulting job matches the pseudo-regular expression:
MAP+ | REDUCE | MAP*
4. The following example illustrates chaining the preprocessing and postprocessing stages of a single MapReduce job in Hadoop.
Description: there are 4 mappers (Map1, Map2, Map3, Map4) and one reducer (Reduce), chained together into a single MapReduce job.
The order is: Map1 | Map2 | Reduce | Map3 | Map4
In this arrangement, Map2 and Reduce form the core of the MapReduce job, with the standard partitioning and shuffling between the mapper and the reducer. Map1 acts as a preprocessing step, while Map3 and Map4 act as postprocessing steps.
In the driver, the ChainMapper and ChainReducer classes are used to define how this sequence of mapper and reducer classes is composed.
ChainMapper usage pattern: add the preprocessing mappers that run before the Reducer.
ChainReducer usage pattern: set the Reducer and add the postprocessing mappers that run after it.
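For reference, the signatures of these static methods in the org.apache.hadoop.mapreduce.lib.chain package look roughly as follows; the input/output key and value classes declare the key/value types that each chained step consumes and produces:
// ChainMapper: add a Mapper that runs in the map phase, before the Reducer
public static void addMapper(Job job, Class<? extends Mapper> klass,
        Class<?> inputKeyClass, Class<?> inputValueClass,
        Class<?> outputKeyClass, Class<?> outputValueClass,
        Configuration mapperConf) throws IOException
// ChainReducer: set the single Reducer of the job
public static void setReducer(Job job, Class<? extends Reducer> klass,
        Class<?> inputKeyClass, Class<?> inputValueClass,
        Class<?> outputKeyClass, Class<?> outputValueClass,
        Configuration reducerConf)
// ChainReducer: add a Mapper that runs in the reduce phase, after the Reducer
public static void addMapper(Job job, Class<? extends Mapper> klass,
        Class<?> inputKeyClass, Class<?> inputValueClass,
        Class<?> outputKeyClass, Class<?> outputValueClass,
        Configuration mapperConf) throws IOException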
The source code is as follows:
package com.yc.link;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MyJobLink extends Configured implements Tool{
/**
* Reducer task: Reduce
* @author hadoop
*
*/
public static class Reduce extends Reducer<LongWritable,Text,Text,Text>{
@Override
public void reduce(LongWritable key, Iterable<Text> values,Context context) throws IOException, InterruptedException {
context.write(new Text("1"), new Text("1"));
}
}
/**
* Mapper task: Map1 (preprocessing step)
* @author hadoop
*
*/
public static class Map1 extends Mapper<LongWritable,Text,Text,Text>{
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(value, new Text(key.toString()));// the record V1 becomes the output key K2, and the byte offset K1 becomes the output value V2
}
}
/**
* Mapper task: Map2
* @author hadoop
*
*/
public static class Map2 extends Mapper<Text,Text,LongWritable,Text>{
@Override
public void map(Text key, Text value, Context context)
throws IOException, InterruptedException {
context.write(new LongWritable(Long.valueOf(value.toString())), key);// swap the input key and value: the offset string is parsed back into a LongWritable key, and the record becomes the value
}
}
/**
* Mapper task: Map3 (postprocessing step)
* @author hadoop
*
*/
public static class Map3 extends Mapper<Text,Text,LongWritable,Text>{
@Override
public void map(Text key, Text value, Context context)
throws IOException, InterruptedException {
context.write(new LongWritable(Long.valueOf("1")), key);// emit the constant key 1 with the input key as the value
}
}
/**
* Mapper task: Map4 (postprocessing step)
* @author hadoop
*
*/
public static class Map4 extends Mapper<LongWritable,Text,LongWritable,Text>{
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(new LongWritable(Long.valueOf("1")), new Text("1"));// emit the constant key 1 with the constant value "1"
}
}
/**
* Driver: builds the chain and runs the job
*/
@Override
public int run(String[] args) throws Exception {
// 1. Create the job instance
Configuration conf=this.getConf();
Job job=Job.getInstance(conf,"ChainJob");
job.setJarByClass(MyJobLink.class);
// 2. Set the job's input and output paths
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// 3. Set the job's input and output formats
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// 4. Set the job's Mapper and Reducer classes
//(1) Add the Map1 stage: ChainMapper.addMapper() adds a step that runs before the Reduce
//(job, klass, inputKeyClass, inputValueClass, outputKeyClass, outputValueClass, mapperConf)
Configuration map1Conf=new Configuration(false);
ChainMapper.addMapper( job,
Map1.class,
LongWritable.class,
Text.class,
Text.class,
Text.class,
map1Conf);
// (2) Add the Map2 stage: ChainMapper.addMapper() adds a step that runs before the Reduce
Configuration map2Conf=new Configuration(false);
ChainMapper.addMapper( job,
Map2.class,
Text.class,
Text.class,
LongWritable.class,
Text.class,
map2Conf);
//(3) Add the Reduce stage: ChainReducer.setReducer() sets the job's Reducer
Configuration reduceConf=new Configuration(false);
//(job, klass, inputKeyClass, inputValueClass, outputKeyClass, outputValueClass, reducerConf)
ChainReducer.setReducer(job,
Reduce.class,
LongWritable.class,
Text.class,
Text.class,
Text.class,
reduceConf);
//(4) Add the Map3 stage: ChainReducer.addMapper() adds a step that runs after the Reducer
Configuration map3Conf=new Configuration(false);
ChainReducer.addMapper( job,
Map3.class,
Text.class,
Text.class,
LongWritable.class,
Text.class,
map3Conf);
// (5) Add the Map4 stage: ChainReducer.addMapper() adds a step that runs after the Reducer
Configuration map4Conf=new Configuration(false);
ChainReducer.addMapper( job,
Map4.class,
LongWritable.class,
Text.class,
LongWritable.class,
Text.class,
map4Conf);
// 5. Launch the job and wait for it to finish
return (job.waitForCompletion(true)?0:1);
}
/**
* Main entry point
* @param args
* @throws Exception
*/
public static void main(String [] args) throws Exception{
int res=ToolRunner.run(new Configuration(), new MyJobLink(), args);
System.exit(res);
}
}
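Assuming the compiled classes are packaged into a jar named, hypothetically, joblink.jar, the job can be submitted with a command along these lines (the HDFS input and output paths are passed as args[0] and args[1]):
hadoop jar joblink.jar com.yc.link.MyJobLink <input path> <output path>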
The result of running the job is: