MapReduce排序

最新推荐文章于 2023-12-26 20:19:34 发布

蓝星花

最新推荐文章于 2023-12-26 20:19:34 发布

阅读量531

点赞数

CC 4.0 BY-SA版权

分类专栏：大数据文章标签： hadoop排序

本文链接：https://blog.youkuaiyun.com/m0_37499059/article/details/79934300

大数据专栏收录该内容

25 篇文章

订阅专栏

排序是MapReduce的核心技术，排序分为部分排序，全排序和二次排序。

部分排序：使用默认的HashPartitioner，无需额外设置，每个reduce内部聚合的key都是有序的（各reduce之间不保证整体有序）。

全排序：对reduce输出的所有的key实现排序

方法1：只设置一个reduce

方法2：自定义分区类实现全排序

方法3 ：使用采样

二次排序：key排完序后，在此基础上再进行二次排序

以统计每年的最高气温为例进行示例：

注意：源文件是一个sequenceFile序列文件<IntWritable, IntWritable>

1、MaxTempMapper

[java]view plain copy
package hadoop.mr.sort.total.totalorder;  
  
import org.apache.hadoop.io.IntWritable;  
import org.apache.hadoop.mapreduce.Mapper;  
  
import java.io.IOException;  
  
/** 
 * MaxTempMapper 
 */  
public class MaxTempMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {  
  
    protected void map(IntWritable key, IntWritable value, Context context) throws IOException, InterruptedException {  
        context.write(key,value);  
    }  
}  

2、MaxTempReducer

[java]view plain copy
package hadoop.mr.sort.total.totalorder;  
  
import org.apache.hadoop.io.IntWritable;  
import org.apache.hadoop.mapreduce.Reducer;  
  
import java.io.IOException;  
  
/** 
 */  
public class MaxTempReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{  
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {  
        int max = Integer.MIN_VALUE ;  
        for(IntWritable iw : values){  
            max = max > iw.get() ? max : iw.get() ;  
        }  
        context.write(key,new IntWritable(max));  
    }  
}  

3、App

[java]view plain copy
package hadoop.mr.sort.total.totalorder;  
  
import org.apache.hadoop.conf.Configuration;  
import org.apache.hadoop.fs.FileSystem;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.IntWritable;  
import org.apache.hadoop.mapreduce.Job;  
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;  
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;  
  
/** 
 */  
public class App {  
    public static void main(String[] args) throws Exception {  
        args = new String[]{"d:/java/mr/data/temp.seq", "d:/java/mr/out"};  
        Configuration conf = new Configuration();  
        FileSystem fs = FileSystem.get(conf);  
        if(fs.exists(new Path(args[1]))){  
            fs.delete(new Path(args[1]),true);  
        }  
  
        Job job = Job.getInstance(conf);  
  
        job.setJobName("maxTemp");  
        job.setJarByClass(App.class);  
  
        job.setMapperClass(MaxTempMapper.class);  
        job.setReducerClass(MaxTempReducer.class);  
  
        FileInputFormat.addInputPath(job,new Path(args[0]));  
        FileOutputFormat.setOutputPath(job,new Path(args[1]));  
        //设置combine输入格式  
        job.setInputFormatClass(SequenceFileInputFormat.class);  
        job.setPartitionerClass(TotalOrderPartitioner.class);  
  
        job.setNumReduceTasks(3);  
  
        job.setMapOutputKeyClass(IntWritable.class);  
        job.setMapOutputValueClass(IntWritable.class);  
  
        job.setOutputKeyClass(IntWritable.class);  
        job.setOutputValueClass(IntWritable.class);  
  
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),new Path("file:///d:/java/mr/par.seq"));  
        //随机采样器  
        InputSampler.RandomSampler<IntWritable,IntWritable> r = new InputSampler.RandomSampler<IntWritable, IntWritable>(1f,5,3);  
        //创建分区文件  
        InputSampler.writePartitionFile(job,r);  
  
        job.waitForCompletion(true);  
    }  
}