WritableComparator是一个类 这个类是用于mapreduce编程模型中的比较 排序
mapreduce中有两次排序 一次是 在环形缓冲区域之中进行分区 排序
还有一次是数据在reduce端获取文件之后进行分组
现在我讲的是后面那个
//Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce
job.setGroupingComparatorClass(MyComparator.class);
上面是我们在定义job时候进行的配置 配置如何进行分组
setGroupingComparatorClass内部的参数是RawComparator
而WritableComparator是实现RawComparator
所以我们直接继承WritableComparator类就可以自己定义一个MyComparator
// Grouping comparator demo: extends WritableComparator so it can be passed to
// Job#setGroupingComparatorClass (which expects a RawComparator).
public static class MyComparator extends WritableComparator {
public MyComparator() {
// The (keyClass, true) superclass constructor is essential: createInstances=true
// makes WritableComparator allocate its internal key instances and DataInputBuffer.
// Calling the no-arg super() leaves 'buffer' null and causes an NPE at runtime.
super(Text.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
Text a1 = (Text) a;
Text b1 = (Text) b;
// Deliberately inverted demo logic: two "hello" keys compare as DIFFERENT (-1),
// so each "hello" gets its own reduce() call, while every other pair compares
// as EQUAL (0), so all remaining words are grouped into one reduce() call.
if (a1.toString().equals("hello") && b1.toString().equals("hello")) {
return -1;
} else {
return 0;
}
}
}
上面这段代码我必须说一个坑 坑了我好几个小时 最后在stackoverflow网站上才找到提示
就是那个无参构造子 必须调用父类的构造子 不然会报空指针 未初始化 buffer
通过查找源码 也确实发现了这个问题
// Quoted from the Hadoop WritableComparator source (fields keyClass/conf/key1/key2/buffer
// are declared elsewhere in that class, outside this excerpt).
protected WritableComparator(Class<? extends WritableComparable> keyClass,
Configuration conf,
boolean createInstances) {
this.keyClass = keyClass;
// Fall back to a fresh Configuration when the caller passes none.
this.conf = (conf != null) ? conf : new Configuration();
if (createInstances) {
// Only this branch initializes 'buffer' — which is why a subclass whose
// no-arg constructor skips super(keyClass, true) later hits an NPE when
// the deserializing compare path touches the null buffer.
key1 = newKey();
key2 = newKey();
buffer = new DataInputBuffer();
} else {
key1 = key2 = null;
buffer = null;
}
}
因为从报错的空指针来说 是buffer为空 整个类也就只有这里对buffer进行了初始化
最后来看一下结果
hdfs@yksp005206:/home/jumpserver$ hadoop fs -cat /test/wc/output/part-r-00000
hello value[] hello,
hello value[] hello,
world value[] hello,hellp,hive,kylin,spark,world,
从结果上来看 确实是两个hello没有被分到同一次reduce调用中 而其他的所有单词都被分到了同一次reduce调用中
package com.hit.ee;
/**
* Created by zh on 2017/9/28.
*/
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.StringTokenizer;
/**
 * Word-grouping demo: shows how a custom grouping comparator controls which map
 * output keys are folded into a single {@code Reducer#reduce} call. With
 * {@link MyComparator} installed, the two "hello" keys are deliberately kept in
 * separate reduce calls while every other word is grouped into one call.
 */
public class WorldCount2 {

    /** Emits every whitespace-delimited token as both key and value. */
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, Text> {

        // Reused across map() invocations to avoid per-token allocation.
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, word);
            }
        }
    }

    /**
     * Concatenates all values that were grouped under one key, making the
     * effect of the grouping comparator visible in the output.
     */
    public static class IntSumReducer
            extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder("value[] ");
            for (Text value : values) {
                sb.append(value).append(",");
            }
            Text text = new Text();
            text.set(sb.toString());
            context.write(key, text);
        }
    }

    /**
     * Alternative grouping comparator implemented directly against
     * {@link RawComparator}: deserializes both raw keys, then delegates to the
     * object-level compare. Same demo semantics as {@link MyComparator}.
     */
    public static class MyComparator2 implements RawComparator<Text> {

        // Scratch objects reused across calls to avoid per-record allocation.
        final DataInputBuffer buffer = new DataInputBuffer();
        private final Text left = new Text();
        private final Text right = new Text();

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            try {
                buffer.reset(b1, s1, l1);
                left.readFields(buffer);
                buffer.reset(b2, s2, l2);
                right.readFields(buffer);
                return compare(left, right);
            } catch (IOException e) {
                // FIX: the original swallowed the exception (printStackTrace)
                // and returned -1, silently producing an arbitrary grouping
                // order on deserialization failure. Fail loudly instead.
                throw new RuntimeException("Failed to deserialize keys for grouping comparison", e);
            }
        }

        @Override
        public int compare(Text a1, Text b1) {
            // Demo logic: two "hello" keys are treated as DIFFERENT (-1) so each
            // gets its own reduce() call; all other pairs are EQUAL (0).
            if (a1.toString().equals("hello") && b1.toString().equals("hello")) {
                return -1;
            } else {
                return 0;
            }
        }
    }

    /**
     * Grouping comparator built on {@link WritableComparator}. The superclass
     * call with createInstances=true is mandatory — it initializes the internal
     * DataInputBuffer; the no-arg super() would leave it null and NPE at runtime.
     */
    public static class MyComparator extends WritableComparator {

        public MyComparator() {
            super(Text.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            Text a1 = (Text) a;
            Text b1 = (Text) b;
            // Same deliberate demo logic as MyComparator2#compare(Text, Text).
            if (a1.toString().equals("hello") && b1.toString().equals("hello")) {
                return -1;
            } else {
                return 0;
            }
        }
    }

    /** Routes "hello" to partition 0 and everything else to partition 1. */
    public static class MyPartitioner extends Partitioner<Text, Text> {

        @Override
        public int getPartition(Text key, Text value, int numPartitions) {
            if (key.toString().equals("hello")) {
                return 0;
            } else {
                return 1;
            }
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: WorldCount2 <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        // FIX: the original called FileSystem#deleteOnExit, which only schedules
        // deletion at JVM shutdown — the job would still fail if the output dir
        // already existed, and the freshly written output would be removed on
        // exit. Delete any stale output directory up front instead.
        FileSystem fs = FileSystem.get(conf);
        Path output = new Path(args[1]);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WorldCount2.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        // Alternative configurations discussed in the write-up above:
        // job.setNumReduceTasks(2);
        // job.setPartitionerClass(MyPartitioner.class);
        // Define the comparator that controls how the keys are sorted before they are passed to the reducer
        // job.setSortComparatorClass(MyComparator.class);
        // Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce
        job.setGroupingComparatorClass(MyComparator.class);
        job.setMapOutputValueClass(Text.class);
        job.setMapOutputKeyClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, output);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}