I. Serialization Overview
1. What is serialization
Serialization is the process of converting an object into a sequence of bytes so that it can be stored on disk or sent over the network.
Deserialization is the reverse process of turning a sequence of bytes back into an object.
2. Why serialize
Objects in a running program cannot be transmitted over the network or persisted directly, so serialization is needed whenever data must cross hosts or be written to durable storage.
II. MapReduce Serialization
Java's native serialization is a heavyweight mechanism: a serialized object carries a lot of extra information (checksums, headers, the whole inheritance hierarchy), which makes it a poor fit for persistence and network transfer. Hadoop therefore implements its own serialization scheme.
| Data type | Writable implementation |
|---|---|
| byte | ByteWritable |
| short | ShortWritable |
| int | IntWritable |
| long | LongWritable |
| float | FloatWritable |
| double | DoubleWritable |
| String | Text |
| null | NullWritable |
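To make the contrast with Java serialization concrete, here is a small sketch (added here, not part of the original post; the class name WritableDemo is just for illustration) that serializes two Writable values by hand. Only the raw field data ends up in the byte stream, with no headers or class metadata:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Minimal sketch, not from the original post: show how compact Writable output is.
public class WritableDemo {
    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        new IntWritable(42).write(out);  // writes exactly 4 bytes
        new Text("hadoop").write(out);   // writes a vint length plus 6 UTF-8 bytes
        System.out.println("serialized size: " + bytes.size() + " bytes"); // 11 with these values
    }
}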
Custom Hadoop serialization
If a custom bean is to be used as a key, it must implement the WritableComparable interface. The Writable interface alone provides no comparison capability, while the shuffle phase of the MapReduce framework always sorts by key, so a sort order has to be defined as well. In that case the custom bean should look like this:
public class ClassName implements WritableComparable<ClassName> {
    // serialization
    @Override
    public void write(DataOutput out) throws IOException {
    }
    // deserialization
    @Override
    public void readFields(DataInput in) throws IOException {
    }
    // sort order
    @Override
    public int compareTo(ClassName o) {
        return 0;
    }
}
If the custom bean is only used as a value, implementing the Writable interface is enough:
public class ClassName implements Writable {
    // serialization
    @Override
    public void write(DataOutput out) throws IOException {
    }
    // deserialization
    @Override
    public void readFields(DataInput in) throws IOException {
    }
}
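One detail the skeletons above do not show: when a custom bean is used as a key, it is also good practice to override hashCode() and equals(), since the default HashPartitioner chooses the target reduce task from the key's hashCode().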
III. A MapReduce Serialization Example
Requirements
- For every phone number, compute the total upstream traffic, total downstream traffic, and overall traffic, sorted by overall traffic in descending order
- Write the aggregated traffic statistics to different output files according to the province the phone number belongs to
Input data format
phone number, upstream traffic, downstream traffic
13881743089,100,34300
13655669078,34434,300
......
Expected output format
phone number, total upstream traffic, total downstream traffic, total traffic
13881743089,4540,39300,43840
......
Implementation
package mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
class FlowBean implements WritableComparable<FlowBean> {
private String phoneNumber; // phone number
private long upFlow; // upstream traffic
private long downFlow; // downstream traffic
private long sumFlow; // total traffic
public String getPhoneNumber() {
return phoneNumber;
}
public void setPhoneNumber(String phoneNumber) {
this.phoneNumber = phoneNumber;
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public FlowBean() {
}
public FlowBean(String phoneNumber, long upFlow, long downFlow) {
this.phoneNumber = phoneNumber;
this.upFlow = upFlow;
this.downFlow = downFlow;
this.sumFlow = this.upFlow + this.downFlow;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(phoneNumber);
out.writeLong(upFlow);
out.writeLong(downFlow);
out.writeLong(sumFlow);
}
@Override
public void readFields(DataInput in) throws IOException {
this.phoneNumber = in.readUTF();
this.upFlow = in.readLong();
this.downFlow = in.readLong();
this.sumFlow = in.readLong();
}
@Override
public int compareTo(FlowBean o) {
// descending order by total traffic; Long.compare avoids int overflow
return Long.compare(o.getSumFlow(), this.sumFlow);
}
@Override
public String toString() {
return phoneNumber + "," + upFlow + "," + downFlow +
"," + sumFlow;
}
}
public class FlowMR {
static class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split(",");
String phoneNumber = fields[0];
long upFlow = Long.parseLong(fields[1]);
long downFlow = Long.parseLong(fields[2]);
FlowBean flowBean = new FlowBean(phoneNumber, upFlow, downFlow);
context.write(new Text(phoneNumber), flowBean);
}
}
static class FlowReducer extends Reducer<Text, FlowBean, NullWritable, FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws InterruptedException, IOException {
long upFlowSum = 0;
long downFlowSum = 0;
for (FlowBean flowBean : values) {
upFlowSum += flowBean.getUpFlow();
downFlowSum += flowBean.getDownFlow();
}
FlowBean res = new FlowBean(key.toString(), upFlowSum, downFlowSum);
context.write(NullWritable.get(), res);
}
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
job.setJarByClass(FlowMR.class);
job.setJobName("flowMR");
// set input and output paths
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// configure the Mapper
job.setMapperClass(FlowMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
// configure the Reducer
job.setReducerClass(FlowReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(FlowBean.class);
job.setNumReduceTasks(1);
job.waitForCompletion(true);
}
}
pom.xml
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.2.4</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.2.4</version>
</dependency>
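Note: hadoop-client already pulls in hadoop-common transitively, so declaring hadoop-common separately is normally redundant; listing it explicitly does no harm.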
Input file
[root@hadoop1 ~]# hdfs dfs -text /test/flow.txt
13881743089,100,34300
13655669078,34434,300
18677563354,3443,3209
13881743089,109,3300
13655669078,3434,230
Package the jar and submit it to the cluster
[root@hadoop1 ~]# hadoop jar learn-1.0-SNAPSHOT.jar mr.FlowMR /test/flow.txt /output
Result file
[root@hadoop1 ~]# hdfs dfs -text /output/part-r-00000
13655669078,37868,530,38398
13881743089,209,37600,37809
18677563354,3443,3209,6652
IV. Custom Combiner
During the map phase, every map task may produce a large amount of local output. The combiner is a MapReduce component that sits between the mapper and the reducer: it performs a local, reduce-like aggregation on each map task's output, merging records with the same key before the map output is written out, which lightens the load on the reduce tasks and reduces the amount of data shuffled across the network.
Taking wordcount as an example, the combiner performs a first, local merge of the per-word counts produced by the map phase.

Caveats:
Not every job can use a combiner. A combiner may only be set when the operation is associative, i.e. it behaves like opt(opt(1, 2, 3), opt(4, 5, 6)). Sums and maximums qualify; medians and averages do not.
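A quick arithmetic check of why averaging fails: avg(1, 2, 3, 4, 5) = 3, but if a combiner first averages (1, 2) and (3, 4, 5) separately, the reducer then averages 1.5 and 4 and gets 2.75. To average correctly, the map side would have to emit a (sum, count) pair instead.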
How to define a combiner
Like a reducer, a combiner is written as a class that extends Reducer, with the combiner logic in its reduce method; the class is then registered on the job as the combiner component.
Implementation
package mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
class FlowBean implements WritableComparable<FlowBean> {
private String phoneNumber; // phone number
private long upFlow; // upstream traffic
private long downFlow; // downstream traffic
private long sumFlow; // total traffic
public String getPhoneNumber() {
return phoneNumber;
}
public void setPhoneNumber(String phoneNumber) {
this.phoneNumber = phoneNumber;
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public FlowBean() {
}
public FlowBean(String phoneNumber, long upFlow, long downFlow) {
this.phoneNumber = phoneNumber;
this.upFlow = upFlow;
this.downFlow = downFlow;
this.sumFlow = this.upFlow + this.downFlow;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(phoneNumber);
out.writeLong(upFlow);
out.writeLong(downFlow);
out.writeLong(sumFlow);
}
@Override
public void readFields(DataInput in) throws IOException {
this.phoneNumber = in.readUTF();
this.upFlow = in.readLong();
this.downFlow = in.readLong();
this.sumFlow = in.readLong();
}
@Override
public int compareTo(FlowBean o) {
// descending order by total traffic; Long.compare avoids int overflow
return Long.compare(o.getSumFlow(), this.sumFlow);
}
@Override
public String toString() {
return phoneNumber + "," + upFlow + "," + downFlow +
"," + sumFlow;
}
}
public class FlowMR {
static class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split(",");
String phoneNumber = fields[0];
long upFlow = Long.parseLong(fields[1]);
long downFlow = Long.parseLong(fields[2]);
FlowBean flowBean = new FlowBean(phoneNumber, upFlow, downFlow);
context.write(new Text(phoneNumber), flowBean);
}
}
static class FlowCombiner extends Reducer<Text, FlowBean, Text, FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws InterruptedException, IOException {
long upFlowSum = 0;
long downFlowSum = 0;
for (FlowBean flowBean : values) {
upFlowSum += flowBean.getUpFlow();
downFlowSum += flowBean.getDownFlow();
}
FlowBean combiner = new FlowBean(key.toString(), upFlowSum, downFlowSum);
context.write(key, combiner);
}
}
static class FlowReducer extends Reducer<Text, FlowBean, NullWritable, FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws InterruptedException, IOException {
long upFlowSum = 0;
long downFlowSum = 0;
for (FlowBean flowBean : values) {
upFlowSum += flowBean.getUpFlow();
downFlowSum += flowBean.getDownFlow();
}
FlowBean res = new FlowBean(key.toString(), upFlowSum, downFlowSum);
context.write(NullWritable.get(), res);
}
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
job.setJarByClass(FlowMR.class);
job.setJobName("flowMR");
// set input and output paths
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// configure the Mapper
job.setMapperClass(FlowMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
// configure the Combiner
job.setCombinerClass(FlowCombiner.class);
// configure the Reducer
job.setReducerClass(FlowReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(FlowBean.class);
job.setNumReduceTasks(1);
job.waitForCompletion(true);
}
}
Comparing the logs of the two runs shows that, with identical input and output data, the amount of data shuffled is smaller once the combiner is enabled.
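Concretely, the difference shows up in the job counters: with the combiner enabled, the "Combine input records" and "Combine output records" counters become non-zero and "Reduce shuffle bytes" is smaller than in the first run.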


V. Custom Partitioner
In the previous examples all the results end up in a single output file, but requirement 2 asks for the aggregated traffic statistics to be written to different files according to the province the phone number belongs to.
MapReduce groups the key-value pairs emitted by the map phase by key and dispatches them to the reduce tasks; the default partitioning component is HashPartitioner.
public class HashPartitioner<K, V> extends Partitioner<K, V> {
public int getPartition(K key, V value,
int numReduceTasks) {
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
}
Its dispatch rule: the target partition is the key's hashCode (masked to be non-negative) modulo the number of reduce tasks.
To group records by our own rules we need to implement our own Partitioner. Here the custom partitioner routes records by the first three digits of the phone number, according to the following rules:
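As a quick illustration (a sketch added here, not part of the original post; the class name HashPartitionerDemo is just for illustration), the partition that the default HashPartitioner would assign to a key can be checked directly:

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

// Minimal sketch, not from the original post.
public class HashPartitionerDemo {
    public static void main(String[] args) {
        HashPartitioner<Text, NullWritable> partitioner = new HashPartitioner<>();
        Text key = new Text("13881743089");
        // ask the partitioner which of 4 reduce tasks this key would go to
        int partition = partitioner.getPartition(key, NullWritable.get(), 4);
        System.out.println(key + " -> partition " + partition);
    }
}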
"136" -> 0号分区
"138" -> 1号分区
"186" -> 2号分区
"其他" -> 3号分区
Implementation
package mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
class FlowBean implements WritableComparable<FlowBean> {
private String phoneNumber; // phone number
private long upFlow; // upstream traffic
private long downFlow; // downstream traffic
private long sumFlow; // total traffic
public String getPhoneNumber() {
return phoneNumber;
}
public void setPhoneNumber(String phoneNumber) {
this.phoneNumber = phoneNumber;
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public FlowBean() {
}
public FlowBean(String phoneNumber, long upFlow, long downFlow) {
this.phoneNumber = phoneNumber;
this.upFlow = upFlow;
this.downFlow = downFlow;
this.sumFlow = this.upFlow + this.downFlow;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(phoneNumber);
out.writeLong(upFlow);
out.writeLong(downFlow);
out.writeLong(sumFlow);
}
@Override
public void readFields(DataInput in) throws IOException {
this.phoneNumber = in.readUTF();
this.upFlow = in.readLong();
this.downFlow = in.readLong();
this.sumFlow = in.readLong();
}
@Override
public int compareTo(FlowBean o) {
// descending order by total traffic; Long.compare avoids int overflow
return Long.compare(o.getSumFlow(), this.sumFlow);
}
@Override
public String toString() {
return phoneNumber + "," + upFlow + "," + downFlow +
"," + sumFlow;
}
}
public class FlowMR {
static class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split(",");
String phoneNumber = fields[0];
long upFlow = Long.parseLong(fields[1]);
long downFlow = Long.parseLong(fields[2]);
FlowBean flowBean = new FlowBean(phoneNumber, upFlow, downFlow);
context.write(new Text(phoneNumber), flowBean);
}
}
static class FlowReducer extends Reducer<Text, FlowBean, NullWritable, FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws InterruptedException, IOException {
long upFlowSum = 0;
long downFlowSum = 0;
for (FlowBean flowBean : values) {
upFlowSum += flowBean.getUpFlow();
downFlowSum += flowBean.getDownFlow();
}
FlowBean res = new FlowBean(key.toString(), upFlowSum, downFlowSum);
context.write(NullWritable.get(), res);
}
}
static class PhonePartitioner extends Partitioner<Text, FlowBean> {
private static Map<String, Integer> phoneProvinceMap = new HashMap<String, Integer>();
/**
 * "136" -> partition 0
 * "138" -> partition 1
 * "186" -> partition 2
 * anything else -> partition 3
 * 4 partitions in total
 */
static {
phoneProvinceMap.put("136", 0);
phoneProvinceMap.put("138", 1);
phoneProvinceMap.put("186", 2);
}
@Override
public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
String phonePrefix = flowBean.getPhoneNumber().substring(0,3);
if (phoneProvinceMap.containsKey(phonePrefix)) {
return phoneProvinceMap.get(phonePrefix);
} else {
return 3;
}
}
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
job.setJarByClass(FlowMR.class);
job.setJobName("flowMR");
// set input and output paths
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// configure the Mapper
job.setMapperClass(FlowMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
// configure the Partitioner
job.setPartitionerClass(PhonePartitioner.class);
// configure the Reducer
job.setReducerClass(FlowReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(FlowBean.class);
// 4 partitions in total, so 4 reduce tasks
job.setNumReduceTasks(4);
job.waitForCompletion(true);
}
}
When the job finishes, four result files have been produced; inspect them all:
[root@hadoop2 ~]# hdfs dfs -text /output/part-r-00000
13655669078,37868,530,38398
[root@hadoop2 ~]# hdfs dfs -text /output/part-r-00001
13881743089,209,37600,37809
[root@hadoop2 ~]# hdfs dfs -text /output/part-r-00002
18677563354,3443,3209,6652
[root@hadoop2 ~]# hdfs dfs -text /output/part-r-00003
