Group the following numbers by the first column, then find the minimum value in each group (one output row per group):
3 2
2 2
3 1
1 1
3 3
3 1
2 1
--------------------------------------------
1 1
2 1
3 1
Why define a custom grouping class:
1. When you first write a MapReduce job, the grouping step (step 1.4) is usually left unspecified, so Hadoop falls back on its default grouping rule. In that case the reducer receives input in the form <key2, {value2, value2, ...}>.
2. When key2 is a custom Hadoop type, the default rule groups by the whole key, so every distinct key2 forms its own group and the reducer receives the data as separate <key2, value2>, <key2, value2> pairs. If you still want the reducer to see the data in the <key2, {value2, value2, ...}> form, you need a custom grouping class, as the sketch below illustrates.
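Conceptually, on the reduce side Hadoop walks the sorted <key2, value2> stream and starts a new reduce() call whenever the grouping comparator reports two consecutive keys as unequal. The following is a minimal, self-contained sketch of that behaviour in plain Java (the GroupingSketch class, the pair array, and the comparator are illustrative only, not Hadoop code):
import java.util.Comparator;

public class GroupingSketch {
    public static void main(String[] args) {
        // (first, second) pairs as they arrive at the reduce side,
        // already sorted by first and then by second.
        long[][] sorted = { {1, 1}, {2, 1}, {2, 2}, {3, 1}, {3, 2}, {3, 3} };

        // Grouping rule: two keys belong to the same group if their first fields match.
        Comparator<long[]> grouping = Comparator.comparingLong(p -> p[0]);

        for (int i = 0; i < sorted.length; i++) {
            // A new reduce() call starts whenever the grouping comparator
            // reports the current key as different from the previous one.
            if (i == 0 || grouping.compare(sorted[i - 1], sorted[i]) != 0) {
                System.out.println("new group, first = " + sorted[i][0]);
            }
            System.out.println("  value = " + sorted[i][1]);
        }
    }
}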
Mapper
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<LongWritable, Text, SortWritable, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each input line holds two tab-separated numbers.
        String[] splits = value.toString().split("\t");
        long first = Long.parseLong(splits[0]);
        long second = Long.parseLong(splits[1]);
        // key2 is the composite (first, second) pair; value2 repeats the second column.
        context.write(new SortWritable(first, second), new LongWritable(second));
    }
}
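For the sample input the mapper therefore emits composite keys such as (1,1), (2,1), (3,2), with the second column repeated as value2. Repeating the second column in the value matters: once the grouping comparator merges keys that share the same first field, the reducer sees only one representative key per group and has to read the second column from the value iterator.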
Reducer
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReduce extends Reducer<SortWritable, LongWritable, LongWritable, LongWritable> {
    @Override
    protected void reduce(SortWritable key2, Iterable<LongWritable> values2, Context context)
            throws IOException, InterruptedException {
        // One reduce() call per group, i.e. per distinct value of key2.first.
        // Because the map output is sorted by (first, second), the first value in the
        // iterator is already the minimum; the loop simply makes that explicit.
        long min = Long.MAX_VALUE;
        for (LongWritable v2 : values2) {
            if (v2.get() < min) {
                min = v2.get();
            }
        }
        context.write(new LongWritable(key2.first), new LongWritable(min));
    }
}
The custom Hadoop type used as key2
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class SortWritable implements WritableComparable<SortWritable> {
    long first = 1L;
    long second = 1L;

    public SortWritable() {
    }

    public SortWritable(long first, long second) {
        this.first = first;
        this.second = second;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.first = in.readLong();
        this.second = in.readLong();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(first);
        out.writeLong(second);
    }

    @Override
    public int compareTo(SortWritable o) {
        // Sort by first, then by second. Long.compare avoids the overflow that
        // subtracting two longs and casting to int can cause.
        if (this.first == o.first) {
            return Long.compare(this.second, o.second);
        }
        return Long.compare(this.first, o.first);
    }

    @Override
    public int hashCode() {
        return Long.hashCode(first) + Long.hashCode(second);
    }

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof SortWritable) {
            SortWritable sw = (SortWritable) obj;
            return this.first == sw.first && this.second == sw.second;
        }
        return false;
    }
}
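With this compareTo, the map output sorts by first and then by second. A quick, illustrative check of the resulting order (plain Java, outside the job; the SortOrderCheck class and the hard-coded pairs, which mirror the sample input, are only for demonstration):
import java.util.Arrays;

public class SortOrderCheck {
    public static void main(String[] args) {
        SortWritable[] keys = {
            new SortWritable(3, 2), new SortWritable(2, 2), new SortWritable(3, 1),
            new SortWritable(1, 1), new SortWritable(3, 3), new SortWritable(3, 1),
            new SortWritable(2, 1)
        };
        // Arrays.sort uses SortWritable.compareTo: ascending by first, then by second.
        Arrays.sort(keys);
        for (SortWritable k : keys) {
            System.out.println(k.first + "\t" + k.second);
        }
        // Prints, one tab-separated pair per line:
        // 1 1, 2 1, 2 2, 3 1, 3 1, 3 2, 3 3
    }
}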
The custom grouping class implements RawComparator, with the custom Hadoop type to group by as its type parameter
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

public class MyGroupingComparator implements RawComparator<SortWritable> {
    // Group only by the first field: keys with the same first value
    // end up in the same reduce() call.
    @Override
    public int compare(SortWritable o1, SortWritable o2) {
        return Long.compare(o1.first, o2.first);
    }

    /**
     * @param b1 the first serialized key (byte array)
     * @param s1 the start position of the first key inside b1
     * @param l1 the length of the first key in bytes
     * @param b2 the second serialized key (byte array)
     * @param s2 the start position of the second key inside b2
     * @param l2 the length of the second key in bytes
     *
     * A long takes 8 bytes, so comparing only the first 8 bytes of each
     * serialized SortWritable compares just the "first" field.
     */
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return WritableComparator.compareBytes(b1, s1, 8, b2, s2, 8);
    }
}
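A commonly used alternative, shown here only as a sketch (the class name GroupingComparatorAlternative is made up for illustration), is to extend WritableComparator instead of implementing RawComparator; the superclass then deserializes the raw bytes into SortWritable objects and delegates to the object-level compare:
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class GroupingComparatorAlternative extends WritableComparator {
    protected GroupingComparatorAlternative() {
        // true asks the superclass to create SortWritable instances,
        // so the byte-level compare is delegated to the method below.
        super(SortWritable.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Same grouping rule as above: compare only the first field.
        return Long.compare(((SortWritable) a).first, ((SortWritable) b).first);
    }
}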
Driver (test) class
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class SortTest {
    private static final String INPUT_PATH = "hdfs://xxc:9000/input";
    private static final String OUT_PATH = "hdfs://xxc:9000/out";

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
        Configuration conf = new Configuration();

        // Delete the output directory if it already exists so the job can be rerun.
        FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        Job job = new Job(conf, SortTest.class.getSimpleName());
        // Ship the jar containing these classes when the job runs on a cluster.
        job.setJarByClass(SortTest.class);

        // Input
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        job.setInputFormatClass(TextInputFormat.class);

        // Map
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(SortWritable.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 1.3 partition
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1);

        // 1.4 grouping
        job.setGroupingComparatorClass(MyGroupingComparator.class);

        // Reduce
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(LongWritable.class);

        // Output
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);
    }
}
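Note that the driver never calls job.setSortComparatorClass, so the map output is sorted with SortWritable.compareTo (by first, then second), while MyGroupingComparator only looks at first when forming reduce groups. With a single reducer and TextOutputFormat, the result should appear in /out/part-r-00000 and match the three output lines shown at the top.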