排序是MapReduce的核心技术,排序分为部分排序,全排序和二次排序。
部分排序:使用默认的HashPartitioner,无需额外操作,每个reduce内部聚合的key都是有序的,但不同reduce的输出之间不保证整体有序。
全排序:对reduce输出的所有的key实现排序
方法1:只设置一个reduce
方法2:自定义分区类实现全排序
方法3:使用采样器(InputSampler)生成分区文件,配合TotalOrderPartitioner实现全排序
二次排序:key排完序后,在此基础上再进行二次排序(通常是对value排序)
下面以统计每年的最高气温为例进行演示:
注意:源文件是一个SequenceFile序列文件,键值类型为<IntWritable, IntWritable>
1、MaxTempMapper
- package hadoop.mr.sort.total.totalorder;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.mapreduce.Mapper;
- import java.io.IOException;
- /**
- * MaxTempMapper
- */
- public class MaxTempMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {
- protected void map(IntWritable key, IntWritable value, Context context) throws IOException, InterruptedException {
- context.write(key,value);
- }
- }
2、MaxTempReducer
- package hadoop.mr.sort.total.totalorder;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.mapreduce.Reducer;
- import java.io.IOException;
- /**
- */
- public class MaxTempReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{
- protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
- int max = Integer.MIN_VALUE ;
- for(IntWritable iw : values){
- max = max > iw.get() ? max : iw.get() ;
- }
- context.write(key,new IntWritable(max));
- }
- }
3、App
- package hadoop.mr.sort.total.totalorder;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
- import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
- /**
-  * Driver for the "maxTemp" job using sampling-based total ordering.
-  *
-  * <p>The job reads a sequence file of (IntWritable, IntWritable) records, runs
-  * {@link MaxTempMapper} and {@link MaxTempReducer} with three reduce tasks, and partitions
-  * map output with {@link TotalOrderPartitioner}. The partition boundaries are produced by
-  * sampling the input with {@link InputSampler} before the job is submitted.
-  */
- public class App {
-
-     /**
-      * Configures and runs the job, then terminates the JVM with exit status 0 when the job
-      * succeeds and 1 when it fails.
-      *
-      * @param args ignored; the input and output paths are hard-coded below for a local demo
-      * @throws Exception if file-system access, sampling, or job submission fails
-      */
-     public static void main(String[] args) throws Exception {
-         // Demo paths: args[0] is the input sequence file, args[1] the output directory.
-         // Any arguments passed on the command line are overwritten here.
-         args = new String[]{"d:/java/mr/data/temp.seq", "d:/java/mr/out"};
-
-         Configuration conf = new Configuration();
-
-         // Remove output left over from a previous run (Hadoop refuses to write into an
-         // existing output directory).
-         FileSystem fs = FileSystem.get(conf);
-         if (fs.exists(new Path(args[1]))) {
-             fs.delete(new Path(args[1]), true);
-         }
-
-         Job job = Job.getInstance(conf);
-         job.setJobName("maxTemp");
-         job.setJarByClass(App.class);
-
-         job.setMapperClass(MaxTempMapper.class);
-         job.setReducerClass(MaxTempReducer.class);
-
-         FileInputFormat.addInputPath(job, new Path(args[0]));
-         FileOutputFormat.setOutputPath(job, new Path(args[1]));
-
-         // The input is a sequence file, so use the sequence-file input format.
-         job.setInputFormatClass(SequenceFileInputFormat.class);
-
-         // Total-order partitioning across three reducers.
-         job.setPartitionerClass(TotalOrderPartitioner.class);
-         job.setNumReduceTasks(3);
-
-         job.setMapOutputKeyClass(IntWritable.class);
-         job.setMapOutputValueClass(IntWritable.class);
-         job.setOutputKeyClass(IntWritable.class);
-         job.setOutputValueClass(IntWritable.class);
-
-         // Register the partition file on the job's own configuration (Job.getInstance was
-         // handed conf, but the setting is applied via job.getConfiguration()).
-         TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("file:///d:/java/mr/par.seq"));
-
-         // Random sampler; constructor arguments are (freq, numSamples, maxSplitsSampled).
-         InputSampler.Sampler<IntWritable, IntWritable> sampler =
-                 new InputSampler.RandomSampler<IntWritable, IntWritable>(1f, 5, 3);
-
-         // Create the partition file. This is done last, after the input path, input format,
-         // reducer count and key class have been set on the job.
-         InputSampler.writePartitionFile(job, sampler);
-
-         // Propagate the job outcome to the caller instead of always exiting with status 0.
-         System.exit(job.waitForCompletion(true) ? 0 : 1);
-     }
- }