MapReduce元模式之基本作业链

最新推荐文章于 2024-05-30 15:15:00 发布

路人张的鱼生

最新推荐文章于 2024-05-30 15:15:00 发布

阅读量459

点赞数

CC 4.0 BY-SA版权

分类专栏： MapReduce 文章标签：大数据 MapReduce

本文链接：https://blog.youkuaiyun.com/zhangdy12307/article/details/93337257

MapReduce 专栏收录该内容

41 篇文章

订阅专栏

本文围绕MapReduce元模式之基本作业链展开。介绍了其模式描述、目的，分析了性能，指出处理多级作业需手工编码及考虑失败任务处理等。还给出基本作业链示例，包括使用的模式、问题描述、思路，以及各阶段任务、编码和驱动代码等。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

MapReduce元模式之基本作业链

模式描述

元模式：关于模式的模式
元模式不是解决某个特定问题的模式，而是处理模式的模式
作业连：该方法将多种模式结合起来以解决复杂的，多级的问题

目的

解决复杂的，多级的问题，针对于多个不同的mapper阶段和reducer阶段

性能分析

作业链是众多比较难处理的过程的一种，因为它并不是大多数MapReduce框架中包含的特性，Hadoop被设计成能够很好的处理单个MapReduce作业，但是在处理多级作业时则需要加入许多手工编码，在操作上还需要考虑作业的各个阶段如何处理失败任务以及清理中间结果
MapReduce链的主要问题是临时文件的大小，在某些情况下，临时文件可能极小，导致有大量的开销华为在启动很多的map任务去加载这些数据上
对于无链作业，reducer的个数通常更多的依赖于reducer接收到的数据量大小而不是输出的数据量，但当使用链时，输出文件的大小可能更重要，尽量是的输出文件的大小和分布式文件系统上的块的大小一样，这要只要调整reducer的个数就可以看到对性能的影响

基本作业链示例

使用到的模式

MapReduce之计数器计数
 MapReduce之分箱

问题描述

给出一份用户列表，其中包含用户的声望和每个用户的发帖数，并将用户分为两部分，一部分是发帖数在平均值之下，另一部分在平均值之上

问题思路

使用一个作业来执行计数，另一个作业根据用户发帖数将用户分配到来分配到不同的箱中，最终输出包含用户ID和用户的发帖次数

样例输入

创建数据集的代码如下：

import java.io.*;

public class create {
    public static void main(String[] args) throws IOException{
        String path="input/Basic_Chain_of_Work.txt";
        File file=new File(path);
        if(!file.exists()){
            file.getParentFile().mkdirs();
        }
        file.createNewFile();
        FileWriter fw=new FileWriter(file,true);
        BufferedWriter bw=new BufferedWriter(fw);

        for(int i=0;i<5000;i++){
            int id=(int)(Math.random()*1000+1000);
            bw.write("UserId="+id+" reputation="+(int)(Math.random()*5000+1000)+" inviataion="+(int)(Math.random()*3000+1000)+"\n");
        }
        bw.flush();
        bw.close();
        fw.close();
    }
}

在这里插入图片描述

运行结果如下

在这里插入图片描述

第一个mapper阶段任务

将用户ID作为键，计数值1为作业的输出值，AVERAGE_CALC_GROUP是驱动程序中的字符串，RECORDS_COUNTER_NAME作为总帖子数的计数器

第一个mapper阶段编码

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Map;

import static com.deng.Basic_chain_of_Work.Driver.AVERAGE_CALC_GROUP;
import static com.deng.Basic_chain_of_Work.UserIdBinningMapper.AVERAGE_POSTS_PER_USER;

public class UserIdCountMapper extends Mapper<LongWritable, Text,Text,LongWritable> {
    public static final String RECORDS_COUNTER_NAME="Records";
    private static final LongWritable ONE=new LongWritable(1);
    public void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException{
        Map<String,String> parsed=MRDPUtil.transInformation(value.toString());
        String userId=parsed.get("UserId");
        if(userId!=null){
            context.write(new Text(userId),ONE);
            context.getCounter(AVERAGE_CALC_GROUP,RECORDS_COUNTER_NAME).increment(1);
        }
    }
}

第一个reducer作业任务

统计所有用户的发帖数并记录总用户的人数

第一个reducer作业编码


import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

import static com.deng.Basic_chain_of_Work.Driver.AVERAGE_CALC_GROUP;
import static com.deng.Basic_chain_of_Work.UserIdCountMapper.RECORDS_COUNTER_NAME;

public class UserIdSumReducer extends Reducer<Text, LongWritable,Text,LongWritable> {
    public static final String USERS_COUNTER_NAME="Users";
    private LongWritable outvalue=new LongWritable();
    public void reduce(Text key,Iterable<LongWritable> values,Context context) throws IOException,InterruptedException{
        context.getCounter(AVERAGE_CALC_GROUP,USERS_COUNTER_NAME).increment(1);
        int sum=0;
        for(LongWritable value:values){
            sum+=value.get();
        }
        context.write(key,new LongWritable(sum));
    }
}

第二个作业的mapper阶段任务

在setup阶段的任务

从context对象中获取每个用户的平均发帖数
初始化MultipleOutputs对象

在map阶段的任务
获取用户的发帖数，将发帖数与平均发帖数对比并把用户放到合适的箱中
在clearup阶段
MultipleOutputs关闭

第二个作业的mapper阶段编码


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import java.io.*;

import static com.deng.Basic_chain_of_Work.Driver.MULTIPLE_OUTPUTS_ABOVE_NAME;
import static com.deng.Basic_chain_of_Work.Driver.MULTIPLE_OUTPUTS_BELOW_NAME;

public class UserIdBinningMapper extends Mapper<LongWritable,Text,Text,Text> {
    public static final String AVERAGE_POSTS_PER_USER="avg.posts.per.user";
    static void setAveragePostsPerUser(Job job,double avg){
        job.getConfiguration().set(AVERAGE_POSTS_PER_USER,Double.toHexString(avg));
    }

    static double getAveragePostsPerUser(Configuration conf){
        return Double.parseDouble(conf.get(AVERAGE_POSTS_PER_USER));
    }

    private double average=0.0;
    private MultipleOutputs<Text,Text> mos=null;
    private Text outKey=new Text(),outValue=new Text();

    protected void setup(Context context) throws IOException,InterruptedException{
        average=getAveragePostsPerUser(context.getConfiguration());
        mos=new MultipleOutputs<Text,Text>(context);
    }

    public void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException{
        String[] tokens=value.toString().split("\t");
        String userId=tokens[0];
        int posts=Integer.parseInt(tokens[1]);
        outKey.set("UserId = "+userId);
        outValue.set("total posts is "+(long) posts+"\t");
        if((double) posts<average){
            mos.write(MULTIPLE_OUTPUTS_BELOW_NAME,outKey,outValue,MULTIPLE_OUTPUTS_BELOW_NAME+"/part");
        }else{
            mos.write(MULTIPLE_OUTPUTS_ABOVE_NAME,outKey,outValue,MULTIPLE_OUTPUTS_ABOVE_NAME+"/part");
        }
    }

    protected void cleanup(Context context) throws IOException,InterruptedException{
        mos.close();
    }
}

驱动代码

在运行过程中会创建中间目录，该目录在整个作业链结束会被程序自动删除
在配置第二个作业前，先从第一个作业中获取计数器的值，并计算每个用户的平均发帖次数，然后添加到第二个作业的配置中

package com.deng.Basic_chain_of_Work;


import com.deng.FileUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class Driver {
    public static final String MULTIPLE_OUTPUTS_BELOW_NAME="BelowAveage";
    public static final String MULTIPLE_OUTPUTS_ABOVE_NAME="AboveAverage";
    public static final String AVERAGE_CALC_GROUP="Is_AVERAGE";
    public static void main(String[] args) throws Exception{
        Configuration conf=new Configuration();
        String[] otherArgs=new String[]{"input/Basic_Chain_of_Work.txt","output","output2"};
        Job countingJob=new Job(conf,"CountJob");
        countingJob.setJarByClass(Driver.class);
        countingJob.setMapperClass(UserIdCountMapper.class);
        countingJob.setReducerClass(UserIdSumReducer.class);
        countingJob.setOutputValueClass(LongWritable.class);
        countingJob.setOutputKeyClass(Text.class);
        FileInputFormat.addInputPath(countingJob,new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(countingJob,new Path(otherArgs[1]));
        int code=countingJob.waitForCompletion(true)?0:1;
        if(code==0){
            double numRecords=(double) countingJob.getCounters().findCounter(AVERAGE_CALC_GROUP,UserIdCountMapper.RECORDS_COUNTER_NAME).getValue();
            double numUsers=(double) countingJob.getCounters().findCounter(AVERAGE_CALC_GROUP, UserIdSumReducer.USERS_COUNTER_NAME).getValue();
            double averagePostsPerUser=numRecords/numUsers;
            System.out.println("the averagePostsPerUser is "+averagePostsPerUser);
            Job binningJob=new Job(new Configuration(),"binningJob");
            binningJob.setJarByClass(Driver.class);
            UserIdBinningMapper.setAveragePostsPerUser(binningJob,averagePostsPerUser);
            binningJob.setMapperClass(UserIdBinningMapper.class);
            binningJob.setNumReduceTasks(0);
            binningJob.setInputFormatClass(TextInputFormat.class);
            String ano=otherArgs[1]+"/part-r-00000";
            TextInputFormat.addInputPath(binningJob,new Path(ano));
            MultipleOutputs.addNamedOutput(binningJob,MULTIPLE_OUTPUTS_BELOW_NAME,TextOutputFormat.class,Text.class,Text.class);
            MultipleOutputs.addNamedOutput(binningJob,MULTIPLE_OUTPUTS_ABOVE_NAME,TextOutputFormat.class,Text.class,Text.class);
            TextOutputFormat.setOutputPath(binningJob,new Path(otherArgs[2]));
            code=binningJob.waitForCompletion(true)?0:1;
        }
        FileSystem.get(conf).delete(new Path("output"),true);
        System.exit(code);
    }
}