MapReduce Sorting

1. Sorting by Key

Original data: the two columns are the car brand and its sales amount

Hino	3153
Toyota	177649
Buick	296183
Cadillac	20116
Audi	121804
Skoda	33554
VW	237156
Nissan	259545
Hyundai	240702
Kia	135666
Ford	403640
Fiat	48375
CIIMO	15087
Everus	13913
Honda	119542
Citroen	158735
Peugeot	33242
Suzuki	33244

Sorted result (what the job produces for the data above, descending by sales amount):

Ford	403640
Buick	296183
Nissan	259545
Hyundai	240702
VW	237156
Toyota	177649
Citroen	158735
Kia	135666
Audi	121804
Honda	119542
Fiat	48375
Skoda	33554
Suzuki	33244
Peugeot	33242
Cadillac	20116
CIIMO	15087
Everus	13913
Hino	3153

During the shuffle phase, the map output is sorted by key, so all we need to do is use the Bean as the map output key, provided that Bean implements the Comparable interface. In Hadoop, implementing both the Writable and Comparable interfaces can be expressed more compactly by implementing the WritableComparable interface.


Bean.java

import java.io.DataInput;  
import java.io.DataOutput;  
import java.io.IOException;  

import org.apache.hadoop.io.WritableComparable;  

public class Bean implements WritableComparable<Bean> {  
  
    private String carName;  
    private long sum;  
  
    public Bean() {		
    }  
    public Bean(String carName, long sum) { 
        this.carName = carName;  
        this.sum = sum;  
    }  
    @Override  
    public void write(DataOutput out) throws IOException {  
        out.writeUTF(carName);  
        out.writeLong(sum);  
    }  
    @Override  
    public void readFields(DataInput in) throws IOException {  
        this.carName = in.readUTF();  
        this.sum = in.readLong();  
    }  
	
	public String getCarName() {
		return carName;
	}
	public void setCarName(String carName) {
		this.carName = carName;
	}
	public long getSum() {
		return sum;
	}
	public void setSum(long sum) {
		this.sum = sum;
	}  
  
    @Override  
    public String toString() {  
        return "" + sum;  
    }  
    @Override  
    public int compareTo(Bean o) {  
        // Sort descending by sum; break ties on carName so the comparison
        // satisfies the compareTo contract and distinct brands with equal
        // sums are not merged into one key during the shuffle.
        int cmp = Long.compare(o.sum, this.sum);
        return cmp != 0 ? cmp : this.carName.compareTo(o.carName);
    }  
}  

SortMapReduce.java

import java.io.IOException;  
  
import org.apache.commons.lang.StringUtils;  
import org.apache.hadoop.conf.Configuration;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.LongWritable;  
import org.apache.hadoop.io.NullWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.mapreduce.Job;  
import org.apache.hadoop.mapreduce.Mapper;  
import org.apache.hadoop.mapreduce.Reducer;  
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
  
public class SortMapReduce {  
  
    public static class SortMapper extends  
            Mapper<LongWritable, Text, Bean, NullWritable> {  
        @Override  
        protected void map(  
                LongWritable k1,  
                Text v1,  
                Mapper<LongWritable, Text, Bean, NullWritable>.Context context)  
                throws IOException, InterruptedException {  
              
            String line = v1.toString();  
            String[] fields = StringUtils.split(line, "\t");  
            String carName = fields[0];
            long sum = Long.parseLong(fields[1]);  
  
            context.write(new Bean(carName,sum),NullWritable.get());  
        }  
    }  
  
    public static class SortReducer extends  
            Reducer<Bean, NullWritable, Text, Bean> {  
        @Override  
        protected void reduce(Bean k2, Iterable<NullWritable> v2s,  
                Reducer<Bean, NullWritable, Text, Bean>.Context context)  
                throws IOException, InterruptedException {  
				
            String carName = k2.getCarName();  
            context.write(new Text(carName), k2);  
			
        }  
    }  
  
    public static void main(String[] args) throws IOException,  
            ClassNotFoundException, InterruptedException {  
  
        Configuration conf = new Configuration();  
        Job job = Job.getInstance(conf);  
  
        job.setJarByClass(SortMapReduce.class);  
  
        job.setMapperClass(SortMapper.class);  
        job.setReducerClass(SortReducer.class);  
  
        job.setMapOutputKeyClass(Bean.class);  
        job.setMapOutputValueClass(NullWritable.class);  
  
        job.setOutputKeyClass(Text.class);  
        job.setOutputValueClass(Bean.class);  
  
        FileInputFormat.setInputPaths(job, new Path(args[0]));  
        FileOutputFormat.setOutputPath(job, new Path(args[1]));  
  
        System.exit(job.waitForCompletion(true) ? 0 : 1);  
    }  
}  
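Note that the job's output is globally sorted only when it runs with a single reduce task (the MapReduce default); with several reducers each output file is sorted on its own, but the files are not ordered relative to one another. To make this explicit in the driver, one extra line would do (my addition, not part of the original code):

        job.setNumReduceTasks(1);  // one reducer => one globally sorted output file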

2. Secondary Sort

A secondary sort orders records on two fields at once: the composite key MyPairWritable carries both fields, and a custom comparator (PairKeyComparator) sorts keys first by the text field and then by the numeric field. A small hypothetical input/output example follows the code.

SecondSort.java

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class SecondSort  extends Configured implements Tool{

	public static void main(String[] args) throws Exception {
		// Exit with the status code returned by the job.
		System.exit(ToolRunner.run(new SecondSort(), args));
	}

	@Override
	public int run(String[] args) throws Exception {
		Configuration conf = getConf();
		Job job = Job.getInstance(conf);
		job.setJarByClass(getClass());
		
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		job.setMapperClass(SortMapper.class);
		job.setReducerClass(SortReducer.class);
		
		job.setOutputKeyClass(MyPairWritable.class);
		job.setOutputValueClass(NullWritable.class);
		
		job.setSortComparatorClass(PairKeyComparator.class);
		
		return job.waitForCompletion(true) ? 0 : 1;
	}
}

class SortMapper extends Mapper<LongWritable, Text, MyPairWritable, NullWritable>{
	MyPairWritable pair= new MyPairWritable();
	protected void map(LongWritable key, Text value, Context context) throws java.io.IOException ,InterruptedException {		
		String[] strs = value.toString().split(" ");
		Text keyy = new Text(strs[0]);
		IntWritable valuee = new IntWritable(Integer.parseInt(strs[1]));
		pair.set(keyy, valuee);
		context.write(pair, NullWritable.get());
	};
}

class SortReducer extends Reducer<MyPairWritable, NullWritable,MyPairWritable, NullWritable>{
	protected void reduce(MyPairWritable key, java.lang.Iterable<NullWritable> values, Context context) throws IOException ,InterruptedException {
		context.write(key, NullWritable.get());
		
	};
}

class PairKeyComparator extends WritableComparator{

	public  PairKeyComparator() {
		super(MyPairWritable.class,true);
	}
	@SuppressWarnings("rawtypes")
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		
		MyPairWritable p1 = (MyPairWritable)a;
		MyPairWritable p2 = (MyPairWritable)b;
		if(!p1.getFirst().toString().equals(p2.getFirst().toString())){
			return p1.first.toString().compareTo(p2.first.toString());
		}else {
			return p1.getSecond().get() - p2.getSecond().get();
		}
	}
}

class MyPairWritable implements WritableComparable<MyPairWritable>{
	Text first;
	IntWritable second;
	
    public void set(Text first, IntWritable second){
        this.first = first;
        this.second = second;
    }
    public Text getFirst(){
        return first;
    }
    public IntWritable getSecond(){
        return second;
    }
	
	@Override
	public void readFields(DataInput in) throws IOException {
		first = new Text(in.readUTF());
		second = new IntWritable(in.readInt());
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(first.toString());
		out.writeInt(second.get());
	}

	@Override
	public int compareTo(MyPairWritable o) {
		// Compare contents, not object references: first field ascending,
		// then second field ascending.
		int cmp = this.first.compareTo(o.first);
		if (cmp != 0) {
			return cmp;
		}
		return Integer.compare(this.second.get(), o.second.get());
	}
	
	@Override
	public String toString() {
		return first.toString() + " " + second.get();
	}
	
	@Override
	public boolean equals(Object obj) {
		if (!(obj instanceof MyPairWritable)) {
			return false;
		}
		MyPairWritable temp = (MyPairWritable) obj;
		return first.equals(temp.first) && second.equals(temp.second);
	}
	
	@Override
	public int hashCode() {
		return first.hashCode() * 163 + second.hashCode();
	}
}
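As a quick illustration with a hypothetical, space-separated input (made up here, not from the original post):

hadoop 3
spark 1
hadoop 1
spark 5
hadoop 2

the job emits one line per pair, ordered first by the word and then by the count:

hadoop 1
hadoop 2
hadoop 3
spark 1
spark 5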


Reposted from: https://www.cnblogs.com/baalhuo/p/5762100.html

Sorting is central to MapReduce: it happens in both the map and the reduce stages. A global sort means that across the output of a job all records are ordered by some policy, for example ascending or descending. Only a single reduce task can guarantee a globally ordered result, but that cannot fully exploit the parallelism of a Hadoop cluster.

During the shuffle, sorting usually happens several times. First, when map output is spilled, the in-memory buffer is quick-sorted by partition and key. Next, when the map side merges its spill files, the spills belonging to the same partition are merge-sorted into one larger file. Finally, on the reduce input side, the sorted files coming from different map tasks for the same partition are merge-sorted again; this last merge uses a heap.

MapReduce therefore relies on two sorting algorithms. Quicksort splits the data into two independent parts in one pass and then sorts each part recursively until the whole sequence is ordered. Merge sort is built on the merge operation: already-sorted subsequences are combined step by step, in divide-and-conquer fashion, until the entire sequence is sorted.

In short, sorting in MapReduce is achieved through repeated sorting and merging, which guarantees that the data delivered to each reducer arrives in order.
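To make the final heap-based merge concrete, here is a minimal, self-contained sketch of merging several already-sorted runs with a priority queue. It only illustrates the idea behind the reduce-side merge and is not Hadoop's actual implementation; the class and method names are made up.

KWayMergeSketch.java

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;

public class KWayMergeSketch {

    // One heap entry: the current value of a run plus the iterator
    // that can supply that run's next value.
    private static class RunHead implements Comparable<RunHead> {
        final long value;
        final Iterator<Long> rest;

        RunHead(long value, Iterator<Long> rest) {
            this.value = value;
            this.rest = rest;
        }

        @Override
        public int compareTo(RunHead o) {
            return Long.compare(this.value, o.value); // smallest value first
        }
    }

    // Merge k sorted runs into one sorted list using a min-heap.
    static List<Long> merge(List<List<Long>> sortedRuns) {
        PriorityQueue<RunHead> heap = new PriorityQueue<>();
        for (List<Long> run : sortedRuns) {
            Iterator<Long> it = run.iterator();
            if (it.hasNext()) {
                heap.add(new RunHead(it.next(), it));
            }
        }
        List<Long> merged = new ArrayList<>();
        while (!heap.isEmpty()) {
            RunHead head = heap.poll();          // take the globally smallest head
            merged.add(head.value);
            if (head.rest.hasNext()) {           // refill from the same run
                heap.add(new RunHead(head.rest.next(), head.rest));
            }
        }
        return merged;
    }

    public static void main(String[] args) {
        List<List<Long>> runs = Arrays.asList(
                Arrays.asList(3L, 8L, 20L),
                Arrays.asList(1L, 9L),
                Arrays.asList(5L, 6L, 7L));
        System.out.println(merge(runs)); // [1, 3, 5, 6, 7, 8, 9, 20]
    }
}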