hadoop入门之mapreduce终结篇-Mapreduce案例（六）

最新推荐文章于 2025-08-19 01:33:02 发布

原创最新推荐文章于 2025-08-19 01:33:02 发布 · 236 阅读

0 ·

CC 4.0 BY-SA版权

hadoop 同时被 3 个专栏收录

16 篇文章

订阅专栏

hadoop

16 篇文章

订阅专栏

mapreduce

1 篇文章

订阅专栏

本文深入探讨了Hadoop MapReduce(MR)在实际业务场景中的应用，包括使用MR实现join操作、处理数据倾斜以及求解两两之间的共同好友等复杂问题。通过具体案例，文章详细解析了MR程序设计的关键点，尤其是对key的精准控制，展示了如何通过1-2个MR流程高效解决问题。

标签（空格分隔）： hadoop

简介

本节主要针对MapReduce在业务中的日常应用讲解一些案例，用于训练我们使用MapReduce解决问题的思路。（ps:mr程序的本质是根据规则做数据拆分，之后根据key做好reduce的分组操作）

1 案例

数据存放：链接：https://pan.baidu.com/s/1nsAcNdWE_glFqyx4AJ-GVg
提取码：lkdr

1.1 使用mr实现 join

1.数据准备：join主要包含两方面的数据，班级和学员信息
班级信息

班级id	班级
1	1班
2	2班

学员信息
10,yifang,15,1（字段依次为：学员id、学员名称、年龄、所属班级id）

2.案例分析
需要将这两种信息汇总到一起。如何实现join呢？只需要控制好key的流转即可：因为reduce本身会根据key做分组，如果我们把班级id作为key，让同一个班级id的班级信息和学员信息进入同一组，那么就可以实现学员信息和班级信息的join。
3.code
可去 github mr/join下查看代码

package com.lcy.hadoop.mr.join;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.lcy.hadoop.mr.flowsum.FlowBean;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Reduce-side join of class records and student records on the class id.
 *
 * <p>The mapper tags each record with its class id as the shuffle key, so the reducer receives
 * the class record together with all students of that class and can fill in the class name.
 *
 * <p>Usage: {@code JoinDriver <inputPath> <outputPath>}. The output path is deleted before the
 * job starts.
 *
 * Created by luo on 2019/6/2.
 */
public class JoinDriver {

    /**
     * Emits every input line as a {@link JoinBean} keyed by class id.
     *
     * <p>Lines are comma separated. Files whose name contains {@code "class"} are read as
     * {@code classId,className}; every other file is read as
     * {@code studentId,studentName,age,classId}.
     */
    static class JoinMapper extends Mapper<LongWritable, Text, IntWritable, JoinBean> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            // The input split exposes the file the current line came from; the file name decides
            // whether the line is a class record or a student record.
            FileSplit split = (FileSplit) context.getInputSplit();
            String fileName = split.getPath().getName();

            JoinBean joinBean;
            int cid;
            if (fileName.contains("class")) {
                // Class record: the student fields are filled with placeholders.
                cid = Integer.parseInt(fields[0]);
                joinBean = new JoinBean(-1, "", -1, cid, fields[1], true);
            } else {
                // Student record: the class name is unknown here and is filled in by the reducer.
                cid = Integer.parseInt(fields[3]);
                joinBean = new JoinBean(Integer.parseInt(fields[0]), fields[1], Integer.parseInt(fields[2]), cid, "", false);
            }
            context.write(new IntWritable(cid), joinBean);
        }
    }

    /**
     * Joins the records of one class id: writes every student of the group with the class name
     * taken from the class record of the same group.
     */
    static class JoinReducer extends Reducer<IntWritable, JoinBean, JoinBean, NullWritable> {

        @Override
        protected void reduce(IntWritable key, Iterable<JoinBean> joinBeans, Context context) throws IOException, InterruptedException {
            // Stays null when the group contains no class record; students are still written.
            String className = null;
            List<JoinBean> students = new ArrayList<>();
            for (JoinBean bean : joinBeans) {
                if (bean.isClassFlas()) {
                    className = bean.getCName();
                } else {
                    // The iterated bean must be copied before it is stored: the original code
                    // notes that each iteration overwrites the previous bean's data.
                    students.add(new JoinBean(bean.getSId(), bean.getSName(), bean.getSAge(),
                            bean.getCId(), bean.getCName(), false));
                }
            }
            for (JoinBean student : students) {
                student.setCName(className);
                context.write(student, NullWritable.get());
            }
        }
    }

    /**
     * Configures and runs the join job in local mode.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Local runs need no extra settings; to submit to YARN, set the framework to yarn and
        // the default file system to HDFS instead.
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        Job job = Job.getInstance(conf);
        job.setJarByClass(JoinDriver.class);

        job.setMapperClass(JoinMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(JoinBean.class);

        job.setReducerClass(JoinReducer.class);
        // Must match the key type JoinReducer actually emits.
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        deleteFIle(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }

    /**
     * Deletes the given local file or directory (including its contents) if it exists.
     *
     * @param arg local file system path to remove
     */
    private static void deleteFIle(String arg) {
        deleteRecursively(new File(arg));
    }

    /** Removes {@code target} and, when it is a real directory, everything below it. */
    private static void deleteRecursively(File target) {
        // Only descend into real directories; a symbolic link is removed itself, never followed.
        if (target.isDirectory() && !java.nio.file.Files.isSymbolicLink(target.toPath())) {
            File[] children = target.listFiles(); // null when the directory cannot be listed
            if (children != null) {
                for (File child : children) {
                    deleteRecursively(child);
                }
            }
        }
        if (target.exists() && !target.delete()) {
            System.err.println("Failed to delete " + target.getAbsolutePath());
        }
    }

}

JoinBean
package com.lcy.hadoop.mr.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import lombok.Data;
import org.apache.hadoop.io.Writable;

/**

Created by luo on 2019/6/2.
*/
@Data
public class JoinBean implements Writable {
//学员id 学员名称年龄所属班级id
private int sId;
private String sName;
private int sAge;
private int cId; //班级id
private String cName;//班级名称
private boolean isClassFlas;

public JoinBean(int sId, String sName, int sAge, int cId,String cName,boolean isClassFlas) {
this.sId = sId;
this.sName = sName;
this.sAge = sAge;
this.cId = cId;
this.isClassFlas = isClassFlas;
this.cName = cName;

}

public JoinBean() {
}

@Override
public void write(DataOutput output) throws IOException {
output.writeInt(sId);
output.writeUTF(sName);
output.writeInt(sAge);
output.writeInt(cId);
output.writeUTF(cName);
output.writeBoolean(isClassFlas);
}

@Override
public void readFields(DataInput input) throws IOException {
this.sId = input.readInt();
this.sName = input.readUTF();
this.sAge = input.readInt();
this.cId = input.readInt();
this.cName = input.readUTF();
this.isClassFlas = input.readBoolean();
}

@Override
public String toString() {
return “” + sId + ‘\t’ +
sName + ‘\t’ +
sAge + ‘\t’ +
cId + ‘\t’ +
cName;
}
}

1.2 数据倾斜处理思路之去除reduce（map端join）

1.数据准备：数据还是刚刚那个数据
2.案例分析：仍以上面的join程序为例。假如学生信息非常多，我们会适当调大reduce的数量；但按key做hash分区之后，可能出现某些reduce task要处理的数据特别多、而另一些特别少的情况（即数据倾斜），这该如何解决呢？（ps:这里假设班级信息有限，而学生信息量非常大。）思路就是：假如我们可以让map端直接实现join，那么就不需要reduce阶段，自然也就不用怕reduce端的数据倾斜。map task启动后会先调用setup，之后对每条数据调用map，最后调用cleanup；我们在setup中把班级信息加载进来，就能在map中直接完成join，从而实现这个针对数据倾斜的方案。

package com.lcy.hadoop.mr.mapjoin;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import com.lcy.hadoop.mr.join.JoinBean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Map-side join of student records with class names; the job runs without any reduce task.
 *
 * <p>The class table is loaded into memory once per mapper in {@code setup}, and every student
 * line is joined against it directly in {@code map}.
 *
 * <p>Usage: {@code MapJoinDriver <inputPath> <outputPath>}. The output path is deleted before
 * the job starts.
 *
 * Created by luo on 2019/6/2.
 */
public class MapJoinDriver {

    /**
     * Joins each student line with the class name looked up from the in-memory class table.
     */
    static class MapJoinMapper extends Mapper<LongWritable, Text, JoinBean, NullWritable> {
        /** Class id to class name, filled once in {@link #setup}. */
        private final Map<Integer, String> classMap = new HashMap<>();

        /**
         * Loads {@code join_class.txt} ({@code classId,className} per line) into
         * {@link #classMap}. The file is opened by its bare name relative to the working
         * directory; {@code main} registers a file of that name as a job cache file.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // try-with-resources closes the reader on every path and does not mask the original
            // exception when the file cannot be opened. UTF-8 is given explicitly so the class
            // names do not depend on the platform default charset.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream("join_class.txt"), "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] fields = line.split(",");
                    classMap.put(Integer.valueOf(fields[0]), fields[1]);
                }
            }
        }

        /**
         * Writes one joined {@link JoinBean} per student line. Lines from files whose name does
         * not contain {@code "student"} are ignored.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            // The input split exposes the file the current line came from; only student files
            // are processed here.
            FileSplit split = (FileSplit) context.getInputSplit();
            String fileName = split.getPath().getName();
            if (fileName.contains("student")) {
                JoinBean joinBean = new JoinBean(Integer.valueOf(fields[0]), fields[1],
                        Integer.valueOf(fields[2]), Integer.valueOf(fields[3]), "", false);
                // The class name is null when the class id is missing from the class table.
                joinBean.setCName(classMap.get(joinBean.getCId()));
                context.write(joinBean, NullWritable.get());
            }
        }
    }

    /**
     * Configures and runs the map-only join job in local mode.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        // Local runs need no extra settings; to submit to YARN, set the framework to yarn and
        // the default file system to HDFS instead.
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        Job job = Job.getInstance(conf);
        job.setJarByClass(MapJoinDriver.class);
        // The class table that MapJoinMapper.setup opens as "join_class.txt".
        job.addCacheFile(new URI("file:/E:/mr/join/input/join_class.txt"));
        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(JoinBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);
        // No reduce phase: the join is already complete in the mapper.
        job.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        deleteFIle(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }

    /**
     * Deletes the given local file or directory (including its contents) if it exists.
     *
     * @param arg local file system path to remove
     */
    private static void deleteFIle(String arg) {
        deleteRecursively(new File(arg));
    }

    /** Removes {@code target} and, when it is a real directory, everything below it. */
    private static void deleteRecursively(File target) {
        // Only descend into real directories; a symbolic link is removed itself, never followed.
        if (target.isDirectory() && !java.nio.file.Files.isSymbolicLink(target.toPath())) {
            File[] children = target.listFiles(); // null when the directory cannot be listed
            if (children != null) {
                for (File child : children) {
                    deleteRecursively(child);
                }
            }
        }
        if (target.exists() && !target.delete()) {
            System.err.println("Failed to delete " + target.getAbsolutePath());
        }
    }

}

1.3 求两两之间的共同好友

1.数据准备：
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J

求出哪些人两两之间有共同好友，及他俩的共同好友都是谁
比如:
a-b : c ,e
2.案例分析
这道题的思路有很多，我讲一下我这边的解题思路。1）第一个mr：先找出把当前人作为好友的所有人（这些人两两之间都以当前人为共同好友）；2）第二个mr：对这些人排序后两两组合（因为两两共同好友a-b:c 和b-a:c 是一致的，所以我们需要统一处理下顺序），之后以这两人为key、对应的共同好友为value输出，在reduce端把这两人拥有的所有共同好友汇总输出。

3.code （这个案例的代码放在mr/friend里面）
第一步：获取以key为共同好友的所有人

package com.lcy.hadoop.mr.friend;

import java.io.IOException;

import com.lcy.hadoop.mr.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step one of the common-friends computation: for every person, collect all the people who
 * list that person as a friend.
 *
 * <p>Input lines look like {@code A:B,C,D} (A has the friends B, C and D). Each output line is
 * a person as key and the comma-separated people who have that person as a friend as value.
 *
 * <p>Usage: {@code FriendOneDriver <inputPath> <outputPath>}.
 *
 * Created by luo on 2019/6/2.
 */
public class FriendOneDriver {

    /**
     * Inverts the friend list: for {@code A:B,C,D} it emits {@code (B,A)}, {@code (C,A)} and
     * {@code (D,A)}, so the reducer sees for each person everyone who has them as a friend.
     */
    static class FriendOneMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused for every record instead of allocating a new Text per emitted pair.
        private final Text friendKey = new Text();
        private final Text owner = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] parts = value.toString().split(":");
            if (parts.length < 2) {
                // Blank line or a person without any friends: nothing to emit.
                return;
            }
            owner.set(parts[0]);
            for (String friend : parts[1].split(",")) {
                if (friend.isEmpty()) {
                    continue; // skip empty entries such as "A:B,,C"
                }
                friendKey.set(friend);
                context.write(friendKey, owner);
            }
        }
    }

    /**
     * Concatenates, for one person, everybody who has that person as a friend.
     */
    static class FriendOneReducer extends Reducer<Text, Text, Text, Text> {
        /**
         * Writes {@code key} together with the comma-separated list of {@code values}.
         *
         * @param key the person that all {@code values} have as a friend
         * @param values the people who list {@code key} as a friend
         * @param context job context used to write the result
         * @throws IOException if writing the result fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder joined = new StringBuilder();
            for (Text v : values) {
                // Separator goes between entries only, so the value has no trailing comma.
                if (joined.length() > 0) {
                    joined.append(',');
                }
                joined.append(v.toString());
            }
            context.write(key, new Text(joined.toString()));
        }
    }

    /**
     * Configures and runs step one in local mode.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Local runs need no extra settings; to submit to YARN, set the framework to yarn and
        // the default file system to HDFS instead.
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        Job job = Job.getInstance(conf);
        job.setJarByClass(FriendOneDriver.class);

        job.setMapperClass(FriendOneMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(FriendOneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileUtils.deleteFile(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }
}

第二步：获取所有两两之间拥有的共同好友

package com.lcy.hadoop.mr.friend;

import java.io.IOException;
import java.util.Arrays;

import com.lcy.hadoop.mr.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step two of the common-friends computation: turns the output of step one into
 * "pair of people, their common friends".
 *
 * <p>Each input line is {@code person<TAB>p1,p2,...}, where every {@code p} has {@code person}
 * as a friend. Every pair drawn from that list therefore shares {@code person} as a common
 * friend.
 *
 * <p>Usage: {@code FriendTwoDriver <stepOneOutputPath> <outputPath>}.
 *
 * Created by luo on 2019/6/2.
 */

public class FriendTwoDriver {

    /**
     * For one common friend, emits every pair of people who share that friend; the pair is the
     * key and the common friend is the value.
     */
    static class FriendTwoMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused for every record instead of allocating a new Text per emitted pair.
        private final Text pairKey = new Text();
        private final Text commonFriend = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] parts = value.toString().split("\t");
            if (parts.length < 2) {
                // Blank or malformed line: no people to pair up.
                return;
            }
            commonFriend.set(parts[0]);
            String[] people = parts[1].split(",");
            // Sorting gives each pair one canonical spelling, so "A--B" and "B--A" never both occur.
            Arrays.sort(people);
            for (int i = 0; i < people.length - 1; i++) {
                for (int j = i + 1; j < people.length; j++) {
                    pairKey.set(people[i] + "--" + people[j]);
                    context.write(pairKey, commonFriend);
                }
            }
        }
    }

    /**
     * Concatenates all common friends of one pair into a comma-separated list.
     */
    static class FriendTwoReducer extends Reducer<Text, Text, Text, Text> {
        /**
         * Writes the pair together with the comma-separated list of its common friends.
         *
         * @param key the pair of people, formatted as {@code X--Y}
         * @param values common friends of the pair (or already joined lists of them when this
         *     class runs as a combiner)
         * @param context job context used to write the result
         * @throws IOException if writing the result fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder joined = new StringBuilder();
            for (Text v : values) {
                // Separator goes between entries only, so there is no trailing comma to strip.
                if (joined.length() > 0) {
                    joined.append(',');
                }
                joined.append(v.toString());
            }
            context.write(key, new Text(joined.toString()));
        }
    }

    /**
     * Configures and runs step two in local mode.
     *
     * @param args {@code args[0]} is the output path of step one, {@code args[1]} the output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Local runs need no extra settings; to submit to YARN, set the framework to yarn and
        // the default file system to HDFS instead.
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        Job job = Job.getInstance(conf);
        job.setJarByClass(FriendTwoDriver.class);

        job.setMapperClass(FriendTwoMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(FriendTwoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // The reducer doubles as combiner: joining already joined comma-separated lists yields
        // the same set of friends, so the final result is unaffected while less data is shuffled.
        job.setCombinerClass(FriendTwoReducer.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileUtils.deleteFile(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }
}