Reposted from: http://blog.youkuaiyun.com/heyutao007/article/details/5890165
The example from the book finds the highest temperature recorded in each year; to do this, the year and the temperature are combined into a composite key.
1. A custom partitioner handles partitioning. Because partitioning is done by year, all records for the same year end up in the same reducer.
2. A custom key (sort) comparator orders keys by year ascending and by temperature descending, so the key-value pairs emitted by the map phase are sorted by year in ascending order and, within each year, by temperature in descending order.
3. A custom grouping comparator puts all records of the same year into one group, so at reduce time it is enough to take the first key of each group to output that year's highest temperature. A small worked example follows.
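To make the flow concrete, here is a small worked example (the numbers are made up for illustration and are not from the original post). Suppose the input file contains these year/temperature pairs:

1990 31
1990 20
1991 25
1990 35
1991 33

The partitioner sends all 1990 records to one reducer and all 1991 records to one reducer (possibly the same one). Within a partition, the sort comparator orders the composite keys as (1990, 35), (1990, 31), (1990, 20), (1991, 33), (1991, 25). The grouping comparator then treats every key with the same year as belonging to one group, so the reducer sees one group per year and, by emitting only the first key of each group, outputs:

1990 35
1991 33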
Code:
package temperature;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class Temperature {

    // A custom key class must implement the WritableComparable interface.
    public static class IntPair implements WritableComparable<IntPair> {
        int first;
        int second;

        /**
         * Set the left and right values.
         */
        public void set(int left, int right) {
            first = left;
            second = right;
        }

        public int getFirst() {
            return first;
        }

        public int getSecond() {
            return second;
        }

        // Deserialization
        @Override
        public void readFields(DataInput in) throws IOException {
            first = in.readInt();
            second = in.readInt();
        }

        // Serialization
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeInt(first);
            out.writeInt(second);
        }

        // Key comparison: "first" ascending, then "second" ascending.
        @Override
        public int compareTo(IntPair o) {
            if (first != o.first) {
                return first < o.first ? -1 : 1;
            } else if (second != o.second) {
                return second < o.second ? -1 : 1;
            } else {
                return 0;
            }
        }

        // A new key class should also override hashCode() and equals().
        @Override
        public int hashCode() {
            return first * 157 + second;
        }

        @Override
        public boolean equals(Object right) {
            if (right == null)
                return false;
            if (this == right)
                return true;
            if (right instanceof IntPair) {
                IntPair r = (IntPair) right;
                return r.first == first && r.second == second;
            } else {
                return false;
            }
        }
    }
    /**
     * Partitioner class: the partition is determined by "first" (the year),
     * so all records for the same year go to the same reducer.
     */
    public static class FirstPartitioner extends
            Partitioner<IntPair, NullWritable> {
        @Override
        public int getPartition(IntPair key, NullWritable value,
                int numPartitions) {
            return Math.abs(key.getFirst() * 127) % numPartitions;
        }
    }
    /**
     * Sort comparator class: orders keys by "first" ascending, then by
     * "second" descending.
     */
    public static class KeyComparator extends WritableComparator {
        protected KeyComparator() {
            super(IntPair.class, true);
        }

        @Override
        public int compare(WritableComparable w1, WritableComparable w2) {
            IntPair ip1 = (IntPair) w1;
            IntPair ip2 = (IntPair) w2;
            int l = ip1.getFirst();
            int r = ip2.getFirst();
            int cmp = (l == r ? 0 : (l < r ? -1 : 1));
            if (cmp != 0) {
                return cmp;
            }
            l = ip1.getSecond();
            r = ip2.getSecond();
            return l == r ? 0 : (l < r ? 1 : -1); // reversed order on "second"
        }
    }
    /**
     * Grouping comparator class: values whose keys compare as equal here are
     * placed in the same iterator at reduce time, so all records of the same
     * year form one group. Whether two keys belong to the same group is
     * decided by this GroupingComparator.
     */
    // Second approach: extend WritableComparator.
    public static class GroupingComparator extends WritableComparator {
        protected GroupingComparator() {
            super(IntPair.class, true);
        }

        // Compare two WritableComparables by "first" only.
        @Override
        public int compare(WritableComparable w1, WritableComparable w2) {
            IntPair ip1 = (IntPair) w1;
            IntPair ip2 = (IntPair) w2;
            int l = ip1.getFirst();
            int r = ip2.getFirst();
            return l == r ? 0 : (l < r ? -1 : 1);
        }
    }
    // Custom mapper: parses "<year> <temperature>" from each line and emits
    // the pair as the key, with a NullWritable value.
    public static class Map extends
            Mapper<LongWritable, Text, IntPair, NullWritable> {
        private final IntPair intkey = new IntPair();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            int left = 0;
            int right = 0;
            if (tokenizer.hasMoreTokens()) {
                left = Integer.parseInt(tokenizer.nextToken());
                if (tokenizer.hasMoreTokens())
                    right = Integer.parseInt(tokenizer.nextToken());
                intkey.set(left, right);
                context.write(intkey, NullWritable.get());
            }
        }
    }
    // Custom reducer: thanks to the grouping comparator, each call receives one
    // whole year, and the incoming key already carries that year's highest
    // temperature, so writing the key is enough.
    public static class Reduce extends
            Reducer<IntPair, NullWritable, IntWritable, IntWritable> {
        private final IntWritable left = new IntWritable();
        private final IntWritable right = new IntWritable();

        @Override
        public void reduce(IntPair key, Iterable<NullWritable> values,
                Context context) throws IOException, InterruptedException {
            left.set(key.getFirst());
            right.set(key.getSecond());
            context.write(left, right);
        }
    }
    /**
     * @param args input path and output path
     */
    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        // Read the Hadoop configuration.
        Configuration conf = new Configuration();
        // Create a job.
        Job job = new Job(conf, "temperature");
        job.setJarByClass(Temperature.class);
        // Mapper class
        job.setMapperClass(Map.class);
        // No Combiner is set: the Reduce class emits <IntWritable, IntWritable>,
        // which does not match the map output types <IntPair, NullWritable>,
        // so it cannot be reused as a Combiner.
        // job.setCombinerClass(Reduce.class);
        // Reducer class
        job.setReducerClass(Reduce.class);
        // Partitioner class
        job.setPartitionerClass(FirstPartitioner.class);
        // Sort comparator for keys
        job.setSortComparatorClass(KeyComparator.class);
        // Grouping comparator
        job.setGroupingComparatorClass(GroupingComparator.class);
        // Map output key type
        job.setMapOutputKeyClass(IntPair.class);
        // Map output value type
        job.setMapOutputValueClass(NullWritable.class);
        // Reduce output key type; TextOutputFormat writes it as plain text.
        job.setOutputKeyClass(IntWritable.class);
        // Reduce output value type
        job.setOutputValueClass(IntWritable.class);
        // Splits the input into InputSplits and provides a RecordReader implementation.
        job.setInputFormatClass(TextInputFormat.class);
        // Provides a RecordWriter implementation that writes the output.
        job.setOutputFormatClass(TextOutputFormat.class);
        // Input path on HDFS
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Output path on HDFS
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job and wait for completion.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
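To run the job, the class is packaged into a jar and submitted with the standard hadoop jar command (the jar name and HDFS paths below are placeholders, not from the original post):

hadoop jar temperature.jar temperature.Temperature /path/to/input /path/to/output

The output directory must not already exist; TextOutputFormat then writes one "year<TAB>highest temperature" line per group.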