WritableComparator是一个类 这个类是用于mapreduce编程模型中的比较 排序
mapreduce中有两次排序 一次是 在环形缓冲区域之中进行分区 排序
还有一次是数据在reduce端获取文件之后进行分组
现在我讲的是后面那个
//Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce
job.setGroupingComparatorClass(MyComparator.class);
上面是我们在定义job时候进行的配置 配置如何进行分组
setGroupingComparatorClass内部的参数是RawComparator
而WritableComparator是实现RawComparator
所以我们直接继承WritableComparator类就可以自己定义一个MyComparator
// Grouping comparator demo: extends WritableComparator so it can be passed to
// Job#setGroupingComparatorClass (which expects a RawComparator).
public static class MyComparator extends WritableComparator {
public MyComparator() {
// The (keyClass, true) superclass constructor is essential: createInstances=true
// makes WritableComparator allocate its internal key instances and DataInputBuffer.
// Calling the no-arg super() leaves 'buffer' null and causes an NPE at runtime.
super(Text.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
Text a1 = (Text) a;
Text b1 = (Text) b;
// Deliberately inverted demo logic: two "hello" keys compare as DIFFERENT (-1),
// so each "hello" gets its own reduce() call, while every other pair compares
// as EQUAL (0), so all remaining words are grouped into one reduce() call.
if (a1.toString().equals("hello") && b1.toString().equals("hello")) {
return -1;
} else {
return 0;
}
}
}
上面这段代码我必须说一个坑 坑了我好几个小时 最后在stackoverflow网站上才找到提示
就是那个无参构造子 必须调用父类的构造子 不然会报空指针 未初始化 buffer
通过查找源码 也确实发现了这个问题
// Quoted from the Hadoop WritableComparator source (fields keyClass/conf/key1/key2/buffer
// are declared elsewhere in that class, outside this excerpt).
protected WritableComparator(Class<? extends WritableComparable> keyClass,
Configuration conf,
boolean createInstances) {
this.keyClass = keyClass;
// Fall back to a fresh Configuration when the caller passes none.
this.conf = (conf != null) ? conf : new Configuration();
if (createInstances) {
// Only this branch initializes 'buffer' — which is why a subclass whose
// no-arg constructor skips super(keyClass, true) later hits an NPE when
// the deserializing compare path touches the null buffer.
key1 = newKey();
key2 = newKey();
buffer = new DataInputBuffer();
} else {
key1 = key2 = null;
buffer = null;
}
}
因为从报错的空指针来说 是buffer为空 整个类也就只有这里对buffer进行了初始化
最后来看一下结果
hdfs@yksp005206:/home/jumpserver$ hadoop fs -cat /test/wc/output/part-r-00000
hello value[] hello,
hello value[] hello,
world value[] hello,hellp,hive,kylin,spark,world,
从结果上来看 确实是两个hello没有被分到同一次reduce调用中 而其他的所有单词都被分到了同一次reduce调用中
package com.hit.ee;
/**
* Created by zh on 2017/9/28.
*/
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.StringTokenizer;
/**
 * Word-grouping demo: shows how a custom grouping comparator controls which map
 * output keys are folded into a single {@code Reducer#reduce} call. With
 * {@link MyComparator} installed, the two "hello" keys are deliberately kept in
 * separate reduce calls while every other word is grouped into one call.
 */
public class WorldCount2 {

    /** Emits every whitespace-delimited token as both key and value. */
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, Text> {

        // Reused across map() invocations to avoid per-token allocation.
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, word);
            }
        }
    }

    /**
     * Concatenates all values that were grouped under one key, making the
     * effect of the grouping comparator visible in the output.
     */
    public static class IntSumReducer
            extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder("value[] ");
            for (Text value : values) {
                sb.append(value).append(",");
            }
            Text text = new Text();
            text.set(sb.toString());
            context.write(key, text);
        }
    }

    /**
     * Alternative grouping comparator implemented directly against
     * {@link RawComparator}: deserializes both raw keys, then delegates to the
     * object-level compare. Same demo semantics as {@link MyComparator}.
     */
    public static class MyComparator2 implements RawComparator<Text> {

        // Scratch objects reused across calls to avoid per-record allocation.
        final DataInputBuffer buffer = new DataInputBuffer();
        private final Text left = new Text();
        private final Text right = new Text();

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            try {
                buffer.reset(b1, s1, l1);
                left.readFields(buffer);
                buffer.reset(b2, s2, l2);
                right.readFields(buffer);
                return compare(left, right);
            } catch (IOException e) {
                // FIX: the original swallowed the exception (printStackTrace)
                // and returned -1, silently producing an arbitrary grouping
                // order on deserialization failure. Fail loudly instead.
                throw new RuntimeException("Failed to deserialize keys for grouping comparison", e);
            }
        }

        @Override
        public int compare(Text a1, Text b1) {
            // Demo logic: two "hello" keys are treated as DIFFERENT (-1) so each
            // gets its own reduce() call; all other pairs are EQUAL (0).
            if (a1.toString().equals("hello") && b1.toString().equals("hello")) {
                return -1;
            } else {
                return 0;
            }
        }
    }

    /**
     * Grouping comparator built on {@link WritableComparator}. The superclass
     * call with createInstances=true is mandatory — it initializes the internal
     * DataInputBuffer; the no-arg super() would leave it null and NPE at runtime.
     */
    public static class MyComparator extends WritableComparator {

        public MyComparator() {
            super(Text.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            Text a1 = (Text) a;
            Text b1 = (Text) b;
            // Same deliberate demo logic as MyComparator2#compare(Text, Text).
            if (a1.toString().equals("hello") && b1.toString().equals("hello")) {
                return -1;
            } else {
                return 0;
            }
        }
    }

    /** Routes "hello" to partition 0 and everything else to partition 1. */
    public static class MyPartitioner extends Partitioner<Text, Text> {

        @Override
        public int getPartition(Text key, Text value, int numPartitions) {
            if (key.toString().equals("hello")) {
                return 0;
            } else {
                return 1;
            }
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: WorldCount2 <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        // FIX: the original called FileSystem#deleteOnExit, which only schedules
        // deletion at JVM shutdown — the job would still fail if the output dir
        // already existed, and the freshly written output would be removed on
        // exit. Delete any stale output directory up front instead.
        FileSystem fs = FileSystem.get(conf);
        Path output = new Path(args[1]);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WorldCount2.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        // Alternative configurations discussed in the write-up above:
        // job.setNumReduceTasks(2);
        // job.setPartitionerClass(MyPartitioner.class);
        // Define the comparator that controls how the keys are sorted before they are passed to the reducer
        // job.setSortComparatorClass(MyComparator.class);
        // Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce
        job.setGroupingComparatorClass(MyComparator.class);
        job.setMapOutputValueClass(Text.class);
        job.setMapOutputKeyClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, output);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}