Hadoop Multi-Job Chaining
Use case
Some business logic is too complex for a single MapReduce program; in those cases several MapReduce programs have to be chained together and run in sequence.
Example
- Several data files first need to be merged into one
- The merged data file is then processed by the business logic
Implementation approach
Chaining multiple jobs can be implemented with the JobControl class provided by the MapReduce framework, as shown in the case study below.
Case study
Depend.java
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import java.io.IOException;
/**
* HadoopLesson
* Depend
* 2020/2/28 10:53
**/
public class Depend {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Requirement: first sort the pets with the serializable Cat type and write the
        // result out (CatDriver), then count the words in that output (WordCountDriver).
        // Build the Cat and WordCount jobs through the factory class.
        Job job1 = JobFactory.getCatJob();
        Job job2 = JobFactory.getWordCountJob();
        // ControlledJob wraps a Job so that dependencies between jobs can be declared;
        // its constructor takes the configuration of the job being wrapped.
        ControlledJob controlledJob1 = new ControlledJob(job1.getConfiguration());
        ControlledJob controlledJob2 = new ControlledJob(job2.getConfiguration());
        // Attach the original Job to each ControlledJob
        controlledJob1.setJob(job1);
        controlledJob2.setJob(job2);
        // Declare the dependency between the two jobs: job2 only runs after job1 succeeds
        controlledJob2.addDependingJob(controlledJob1);
        // JobControl groups the ControlledJobs, similar to a job group
        JobControl jobControl = new JobControl("UserJobControl");
        jobControl.addJob(controlledJob1);
        jobControl.addJob(controlledJob2);
        // JobControl implements Runnable, so pass it to a Thread and start it;
        // the thread submits the jobs in dependency order
        Thread thread = new Thread(jobControl);
        thread.start();
        // Wait until all jobs have finished before stopping the JobControl
        while (!jobControl.allFinished()) {
            Thread.sleep(800);
        }
        // Shut down the JobControl
        jobControl.stop();
    }
}
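Note that allFinished() only tells you that every job has stopped running, not whether it succeeded. If you also want to report failures before stopping, JobControl exposes the list of failed jobs; a small optional addition before jobControl.stop() could look like this sketch:

// Optional: after allFinished() returns true, report any jobs that failed
if (!jobControl.getFailedJobList().isEmpty()) {
    for (ControlledJob failed : jobControl.getFailedJobList()) {
        System.err.println("Job failed: " + failed.getJobName() + " - " + failed.getMessage());
    }
}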
JobFactory.java
import com.jxyy.hadopp.mr.my.wordcount.WordCountDriver;
import com.jxyy.hadopp.mr.my.wordcount.WordCountMapper;
import com.jxyy.hadopp.mr.my.wordcount.WordCountReducer;
import com.jxyy.hadopp.mr.sample.cat.Cat;
import com.jxyy.hadopp.mr.sample.cat.CatDriver;
import com.jxyy.hadopp.mr.sample.cat.CatMapper;
import com.jxyy.hadopp.mr.sample.cat.CatReducer;
import com.jxyy.hadopp.mr.sample.partition.PhonePartitioner;
import com.jxyy.hadopp.mr.sample.util.FolderUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* HadoopLesson
* JobFactory
* 2020/2/28 23:57
* Job factory class
**/
public class JobFactory {
    public static Job getCatJob() throws IOException {
        // 1. Create the configuration (job properties could be set on it here)
        Configuration configuration = new Configuration();
        // configuration.set("key", "value");
        // 2. Get a Job instance from the configuration
        Job job = Job.getInstance(configuration);
        // 3. Set the jar location
        job.setJarByClass(CatDriver.class);
        // 4. Set the Mapper class
        job.setMapperClass(CatMapper.class);
        // 5. Set the Mapper output key type
        job.setMapOutputKeyClass(Cat.class);
        // 6. Set the Mapper output value type
        job.setMapOutputValueClass(NullWritable.class);
        // 7. Set the Reducer class
        job.setReducerClass(CatReducer.class);
        // 8. Set the Reducer output key type
        job.setOutputKeyClass(Cat.class);
        // 9. Set the Reducer output value type
        job.setOutputValueClass(NullWritable.class);
        // Register the custom partitioner on the job
        job.setPartitionerClass(PhonePartitioner.class);
        // Set the number of reduce tasks.
        // The reduce-task count decides how many output partitions are produced and takes
        // precedence over the number of partitions defined by the custom partitioner:
        // - partitioner defines 4 partitions, reduce tasks > 4: the job runs, the extra
        //   output partitions are simply empty;
        // - partitioner defines 4 partitions, reduce tasks < 4 (e.g. 2): the job fails;
        // - reduce tasks = 1 (or left unset, which defaults to 1): the custom partitioner
        //   is never consulted and everything goes to partition 1 - 1 = 0.
        job.setNumReduceTasks(4);
        String outputString = "d://catoutput";
        String inputString = "d://catinput";
        // Delete the output folder if it already exists, otherwise the job would fail
        FolderUtil.delFolder(outputString);
        // 10. Set the input folder path
        FileInputFormat.setInputPaths(job, new Path(inputString));
        // 11. Set the output folder path
        FileOutputFormat.setOutputPath(job, new Path(outputString));
        return job;
    }
    public static Job getWordCountJob() throws IOException {
        // 1. Create the configuration
        Configuration configuration = new Configuration();
        // 2. Get a Job instance from the configuration
        Job job = Job.getInstance(configuration);
        // 3. Set the jar location
        job.setJarByClass(WordCountDriver.class);
        // 4. Set the Mapper class
        job.setMapperClass(WordCountMapper.class);
        // 5. Set the Mapper output key type
        job.setMapOutputKeyClass(Text.class);
        // 6. Set the Mapper output value type
        job.setMapOutputValueClass(IntWritable.class);
        // 7. Set the Reducer class
        job.setReducerClass(WordCountReducer.class);
        // 8. Set the Reducer output key type
        job.setOutputKeyClass(Text.class);
        // 9. Set the Reducer output value type
        job.setOutputValueClass(IntWritable.class);
        // The input of this job is the output folder of the Cat job
        String outputString = "d://wordcountOutput";
        String inputString = "d://catoutput";
        // Delete the output folder if it already exists, otherwise the job would fail
        FolderUtil.delFolder(outputString);
        // 10. Set the input folder path
        FileInputFormat.setInputPaths(job, new Path(inputString));
        // 11. Set the output folder path
        FileOutputFormat.setOutputPath(job, new Path(outputString));
        return job;
    }
}
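The PhonePartitioner referenced in getCatJob is not shown in this lesson. Purely as an illustration of the reduce-task discussion above, a hypothetical 4-way partitioner could look like the sketch below; the class name and the hash-based routing are assumptions, not the author's actual implementation.

import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical stand-in for a custom partitioner that defines 4 partitions
public class FourWayPartitioner<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        // Always returns an index in 0..3, so the job should be configured with
        // setNumReduceTasks(4) or more; with fewer reduce tasks the job fails,
        // and with exactly 1 the partitioner is never consulted.
        return (key.hashCode() & Integer.MAX_VALUE) % 4;
    }
}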
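FolderUtil.delFolder is a small helper from the author's sample project whose source is not included here. Assuming it simply removes a local output directory so the next run does not fail on an existing output path, a minimal sketch could look like this:

import java.io.File;

// Minimal sketch of the assumed behavior of FolderUtil.delFolder:
// recursively delete a local directory if it exists
public class FolderUtil {
    public static void delFolder(String path) {
        File dir = new File(path);
        if (!dir.exists()) {
            return;
        }
        File[] children = dir.listFiles();
        if (children != null) {
            for (File child : children) {
                if (child.isDirectory()) {
                    delFolder(child.getPath());
                } else {
                    child.delete();
                }
            }
        }
        dir.delete();
    }
}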