MapReduce Series (1) --- In-Depth Analysis of an Introductory Example and a Brief Look at the Split-Planning Source Code

This article walks through a phone-traffic (flow) statistics program built on Hadoop MapReduce, uses a custom bean to sort the results by total flow, and discusses how to optimize the processing of large numbers of small files.


1. Introductory example

A custom MapReduce program (with a custom partitioner):
FlowBean.java (implements Hadoop serialization)

package lltj;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Custom bean
 * Created by tianjun on 2017/3/14.
 */
public class FlowBean implements WritableComparable<FlowBean> {

    long upflow;
    long downflow;
    long sumflow;

    public FlowBean() {
    }

    public FlowBean(long upflow, long downflow) {
        this.upflow = upflow;
        this.downflow = downflow;
        this.sumflow = upflow + downflow;
    }

    public long getUpflow() {
        return upflow;
    }

    public void setUpflow(long upflow) {
        this.upflow = upflow;
    }

    public long getDownflow() {
        return downflow;
    }

    public void setDownflow(long downflow) {
        this.downflow = downflow;
    }

    public long getSumflow() {
        return sumflow;
    }

    public void setSumflow(long sumflow) {
        this.sumflow = sumflow;
    }

    //reset all three fields at once so a single bean instance can be reused across map() calls
    public void set(long upflow, long downflow) {
        this.upflow = upflow;
        this.downflow = downflow;
        this.sumflow = upflow + downflow;
    }

    @Override
    public String toString() {
        return  upflow +
                "\t" + downflow +
                "\t" + sumflow ;
    }

    @Override
    public int compareTo(FlowBean o) {
        //custom comparison rule: descending order by total flow;
        //equal totals return 0 (as the compareTo contract requires), and the sort job's reducer handles such groups
        return Long.compare(o.sumflow, this.sumflow);
    }

    //serialization: write the object's fields to the output stream
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upflow);
        out.writeLong(downflow);
        out.writeLong(sumflow);
    }

    //deserialization: read the fields back from the input stream, in the same order they were written
    @Override
    public void readFields(DataInput in) throws IOException {

        upflow = in.readLong();
        downflow = in.readLong();
        sumflow = in.readLong();

    }
}
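Since write() and readFields() are the heart of Hadoop serialization, a tiny standalone round-trip check makes the contract concrete. This is a sketch for illustration only, not part of the job, and the class name FlowBeanRoundTrip is made up:

package lltj;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean(1116, 954);

        //serialize: write() emits the three longs in a fixed order
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        //deserialize: readFields() must read them back in exactly the same order
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy);   //prints: 1116	954	2070
    }
}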

FlowCount.java

package lltj;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Mapper, reducer and job driver
 * Created by tianjun on 2017/3/14.
 */
public class FlowCount {

    //mapper
    static class FlowCountMapper extends Mapper<LongWritable,Text,Text,FlowBean>{

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split("\t");
            try{
                //the phone number is the 2nd field; up-flow and down-flow are the 3rd- and 2nd-to-last fields
                String phonenbr = fields[1];
                long upflow = Long.parseLong(fields[fields.length-3]);
                long dflow = Long.parseLong(fields[fields.length-2]);
                FlowBean flowBean = new FlowBean(upflow,dflow);
                context.write(new Text(phonenbr), flowBean);
            }catch (Exception e){
                e.printStackTrace();
            }
        }
    }

    //reducer
    static class FlowCountReducer extends Reducer<Text,FlowBean,Text,FlowBean>{
        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
            long sum_upFlow = 0;
            long sum_dFlow = 0;

            //iterate over all beans for this phone number and add up the up-flow and down-flow separately
            for(FlowBean bean : values){
                sum_upFlow += bean.getUpflow();
                sum_dFlow += bean.getDownflow();
            }

            FlowBean resultBean = new FlowBean(sum_upFlow,sum_dFlow);
            context.write(key,resultBean);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        String os = System.getProperty("os.name").toLowerCase();
        if(os.contains("windows")){
            System.setProperty("HADOOP_USER_NAME","root");
        }

        Configuration conf = new Configuration();

        conf.set("mapreduce.framework.name","local");
        conf.set("mapreduce.jobtracker.address","local");
        conf.set("fs.defaultFS","file:///");

        Job job = Job.getInstance(conf);

        job.setJarByClass(FlowCount.class);

        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        //set the custom partitioner and specify a matching number of reduce tasks (one per partition)
        job.setPartitionerClass(ProvincePartitioner.class);
        job.setNumReduceTasks(5);

        FileInputFormat.setInputPaths(job,new Path("hdfs://mini01:9000/input/flow.log"));
        FileOutputFormat.setOutputPath(job,new Path("hdfs://mini01:9000/wc/output/flow1"));

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);

    }
}

ProvincePartitioner.java

package lltj;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.HashMap;

/**
 * K and V correspond to the map output types
 * Defines our own rule for distributing map output to the reduce tasks
 * Records are distributed by the province the phone-number prefix belongs to (ProvincePartitioner)
 * The default partitioning component is HashPartitioner
 * Created by tianjun on 2017/3/14.
 */
public class ProvincePartitioner extends Partitioner<Text,FlowBean> {

    static HashMap<String, Integer> provinceMap = new HashMap<>();
    static {
        provinceMap.put("136",0);
        provinceMap.put("137",1);
        provinceMap.put("138",2);
        provinceMap.put("139",3);
    }

    /**
     * k and v correspond to the map output
     * @param key
     * @param bean
     * @param numPartitions
     * @return
     */
    @Override
    public int getPartition(Text key, FlowBean bean, int numPartitions) {
        Integer code = provinceMap.get(key.toString().substring(0,3));
        //prefixes not in the map fall into the catch-all partition 4
        return code == null ? 4 : code;
    }
}
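This also explains why 13502468823 later shows up in part-r-00004 (see the output in section 3 below): its prefix 135 is not in provinceMap, so it lands in the catch-all partition 4. A throwaway check, for illustration only; the class name PartitionCheck is made up:

package lltj;

import org.apache.hadoop.io.Text;

public class PartitionCheck {
    public static void main(String[] args) {
        ProvincePartitioner p = new ProvincePartitioner();
        //prefix 136 is mapped, so this phone goes to partition 0
        System.out.println(p.getPartition(new Text("13602846565"), null, 5));
        //prefix 135 is not mapped, so this phone goes to the catch-all partition 4
        System.out.println(p.getPartition(new Text("13502468823"), null, 5));
    }
}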

2. A brief look at the split-planning source code

A quick walk through the source:
waitForCompletion() ->
job.submit() ->
JobSubmitter (holds a Cluster member, whose proxy is a YARNRunner on a cluster, or a LocalJobRunner when running locally) ->
FileInputFormat.getSplits() is called to build the list of planned splits ->
the split list is serialized into the job.split file ->
the job's configuration parameters are written to the job.xml file

The overall flow is illustrated in the diagram below:

(diagram: job submission and split planning, from waitForCompletion() through JobSubmitter to job.split and job.xml)
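Inside getSplits(), the per-file split planning boils down to a few lines. The following is a simplified sketch of the Hadoop 2.x logic, not the verbatim Hadoop source; it only reproduces how the split size and split boundaries are computed:

package lltj;

import java.util.ArrayList;
import java.util.List;

/**
 * Simplified sketch of the per-file planning done by FileInputFormat.getSplits()
 * (Hadoop 2.x); it returns the (offset, length) pairs of the planned splits.
 * Illustration only, not the real Hadoop code.
 */
public class SplitPlanSketch {

    static List<long[]> planSplits(long fileLength, long blockSize, long minSize, long maxSize) {
        //split size is the block size clamped between the configured minimum and maximum
        //(mapreduce.input.fileinputformat.split.minsize / .maxsize), so by default one split per block
        long splitSize = Math.max(minSize, Math.min(maxSize, blockSize));

        List<long[]> splits = new ArrayList<>();
        long bytesRemaining = fileLength;
        //keep cutting full-size splits while the remainder is more than 10% larger than splitSize (SPLIT_SLOP = 1.1)
        while (((double) bytesRemaining) / splitSize > 1.1) {
            splits.add(new long[]{fileLength - bytesRemaining, splitSize});
            bytesRemaining -= splitSize;
        }
        //whatever is left becomes the last (possibly smaller) split
        if (bytesRemaining != 0) {
            splits.add(new long[]{fileLength - bytesRemaining, bytesRemaining});
        }
        return splits;
    }

    public static void main(String[] args) {
        //a 300 MB file with 128 MB blocks and default min/max sizes gives 3 splits: 128 MB, 128 MB and 44 MB
        long mb = 1024L * 1024L;
        for (long[] s : planSplits(300 * mb, 128 * mb, 1, Long.MAX_VALUE)) {
            System.out.println("offset=" + s[0] + " length=" + s[1]);
        }
    }
}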

3. Extending the introductory example: sorting by total flow

Open any one of the final output files:

[root@mini01 ~]# hdfs dfs -cat /wc/output/flow1/part-r-00004
13480253104     180     180     360
13502468823     7335    110349  117684
13560436666     1116    954     2070
13560439658     2034    5892    7926
15013685858     3659    3538    7197
15920133257     3156    2936    6092
15989002119     1938    180     2118
18211575961     1527    2106    3633
18320173382     9531    2412    11943
84138413        4116    1432    5548

As you can see, the records are not ordered by total flow. Let's now fix that on top of the previous job.
Approach:
Within a map task the output keys are sorted, so in the map phase we make the FlowBean the key; all we then need is for FlowBean (which implements the WritableComparable interface) to provide a compareTo() that orders by total flow, and the sorting requirement is met almost for free. Finally, the reduce phase gathers everything into a single file, so the entire output ends up in order.
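Before looking at the job itself, here is a tiny standalone illustration of the descending order that FlowBean.compareTo() defines; SortCheck is a made-up class for illustration, not part of the job:

package lltj;

import java.util.Arrays;

public class SortCheck {
    public static void main(String[] args) {
        FlowBean[] beans = {
                new FlowBean(180, 180),      //total 360
                new FlowBean(7335, 110349),  //total 117684
                new FlowBean(1116, 954)      //total 2070
        };
        //Arrays.sort() uses FlowBean.compareTo(), i.e. descending order by total flow,
        //which is exactly how the shuffle will order the map output keys
        Arrays.sort(beans);
        System.out.println(Arrays.toString(beans));   //117684 first, 360 last
    }
}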
FlowCountSort.java

package lltj;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Sort the output of the flow-statistics job by total flow
 * Created by tianjun on 2017/3/17.
 */
public class FlowCountSort {


    static class FlowCountSortMapper extends Mapper<LongWritable,Text,FlowBean,Text>{

        FlowBean flowBean = new FlowBean();
        Text t = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //the input is the per-phone totals produced by the first job: phone \t upflow \t downflow \t sumflow
            String line = value.toString();
            String[] files = line.split("\t");
            String phone = files[0];
            long upFlow = Long.parseLong(files[1]);
            long dFlow = Long.parseLong(files[2]);

            flowBean.set(upFlow,dFlow);
            t.set(phone);
            //write() serializes the bean right away, so reusing the same flowBean instance across calls is not a problem
            //the map output is sorted by key
            context.write(flowBean,t);
        }
    }

    static class FlowCountSortReducer extends Reducer<FlowBean,Text,Text,FlowBean> {
        //input: <flowBean(totals), phone number>
        @Override
        protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            //beans with equal total flow are grouped into one call, so write every phone in the group;
            //Hadoop refreshes the key instance while the values are iterated, so each phone is paired with its own bean
            for (Text phone : values) {
                context.write(phone, key);
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {

        String os = System.getProperty("os.name").toLowerCase();
        if(os.contains("windows")){
            System.setProperty("HADOOP_USER_NAME","root");
        }

        Configuration conf = new Configuration();

        conf.set("mapreduce.framework.name","local");
        conf.set("mapreduce.jobtracker.address","local");
        conf.set("fs.defaultFS","file:///");

        Job job = Job.getInstance(conf);

        job.setJarByClass(FlowCountSort.class);

        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);

        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        //delete the output directory on HDFS if it already exists, otherwise the job would fail
        FileSystem fs = FileSystem.get(new URI("hdfs://mini01:9000"),new Configuration(),"root");
        Path path = new Path("hdfs://mini01:9000/wc/output/flowSort");
        if(fs.exists(path)){
            fs.delete(path,true);
        }

        FileInputFormat.setInputPaths(job, new Path("hdfs://mini01:9000/wc/output/flow1"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://mini01:9000/wc/output/flowSort"));

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);

    }

}

The sorted result:

[root@mini01 ~]# hdfs dfs -cat /wc/output/flowSort/*
13502468823     7335    110349  117684
13925057413     11058   48243   59301
13726230503     2481    24681   27162
13726238888     2481    24681   27162
18320173382     9531    2412    11943
13560439658     2034    5892    7926
13660577991     6960    690     7650
15013685858     3659    3538    7197
13922314466     3008    3720    6728
15920133257     3156    2936    6092
84138413        4116    1432    5548
13602846565     1938    2910    4848
18211575961     1527    2106    3633
15989002119     1938    180     2118
13560436666     1116    954     2070
13926435656     132     1512    1644
13480253104     180     180     360
13826544101     264     0       264
13719199419     240     0       240
13760778710     120     120     240
13926251106     240     0       240

4. Split-planning issues to watch out for

Optimization strategy for large numbers of small files:
1) By default, TextInputFormat plans splits file by file: no matter how small a file is, it becomes its own split and is handed to its own map task. A large number of small files therefore produces a large number of map tasks, and processing efficiency becomes extremely poor.
2) Optimization strategies:
Best option: merge the small files into large files at the very front of the data pipeline (during preprocessing/collection), before uploading them to HDFS for analysis.
Remedy: if a large number of small files already sits on HDFS, use a different InputFormat for split planning: CombineFileInputFormat (in practice its concrete subclass, e.g. CombineTextInputFormat, since CombineFileInputFormat itself is abstract). Its split logic differs from FileInputFormat in that it can logically pack multiple small files into one split, so many small files end up being processed by a single map task. As a rough feel for the numbers: 10,000 files of 100 KB each (about 1 GB in total) would mean 10,000 map tasks under the default planning, but only about 1 GB / 4 MB ≈ 250 splits with a 4 MB maximum combined split size.

Usage: in the driver class

        //if no InputFormat is set, TextInputFormat.class is used by default
        wcjob.setInputFormatClass(CombineTextInputFormat.class);
        CombineTextInputFormat.setMaxInputSplitSize(wcjob, 4194304);   //4 MB
        CombineTextInputFormat.setMinInputSplitSize(wcjob, 2097152);   //2 MB