1) Requirement
Given the following order data:
Order id | Product id | Amount |
0000001 | Pdt_01 | 222.8 |
0000001 | Pdt_06 | 25.8 |
0000002 | Pdt_03 | 522.8 |
0000002 | Pdt_04 | 122.4 |
0000002 | Pdt_05 | 722.4 |
0000003 | Pdt_01 | 222.8 |
0000003 | Pdt_02 | 33.8 |
We need to find the most expensive product in each order.
2) Input data: the tab-separated table above. Expected output (the maximum amount per order; order ids are parsed as longs, so leading zeros drop):
1	222.8
2	722.4
3	222.8
3) Analysis
(1) Use the order id and the amount together as the key (the OrderBean defined below). All order records read in the map phase can then be partitioned by order id, sorted by amount in descending order within each order, and sent to reduce.
(2) On the reduce side, a GroupingComparator gathers the key-value pairs with the same order id into one group; the first record of each group is then the maximum, as the sketch below illustrates.
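For the sample input, the reduce side sees the keys sorted and grouped like this:
1	222.8   <- group (orderId=1): first record = maximum
1	25.8
2	722.4   <- group (orderId=2): first record = maximum
2	522.8
2	122.4
3	222.8   <- group (orderId=3): first record = maximum
3	33.8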
4) Implementation
(1) Define the order bean OrderBean
package com.lzz.twoOrder;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class OrderBean implements WritableComparable<OrderBean> {

    private long orderId;      // order id
    private double orderPrice; // transaction amount

    public OrderBean() {
        super();
    }

    public OrderBean(long orderId, double orderPrice) {
        super();
        this.orderId = orderId;
        this.orderPrice = orderPrice;
    }

    public long getOrderId() {
        return orderId;
    }

    public void setOrderId(long orderId) {
        this.orderId = orderId;
    }

    public double getOrderPrice() {
        return orderPrice;
    }

    public void setOrderPrice(double orderPrice) {
        this.orderPrice = orderPrice;
    }

    @Override
    public String toString() {
        return orderId + "\t" + orderPrice;
    }

    // serialization: field order must match readFields()
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(orderId);
        out.writeDouble(orderPrice);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readLong();
        this.orderPrice = in.readDouble();
    }

    // secondary sort: order id ascending, then price descending, so the
    // most expensive product of each order comes first in its group
    @Override
    public int compareTo(OrderBean orderBean) {
        int res = Long.compare(orderId, orderBean.getOrderId());
        if (res == 0) {
            // Double.compare returns 0 for equal prices, which keeps
            // the compareTo contract; negate it for descending order
            res = -Double.compare(orderPrice, orderBean.getOrderPrice());
        }
        return res;
    }
}
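To see the secondary sort in isolation, the sample records can be sorted with plain Arrays.sort, which uses OrderBean.compareTo (OrderBeanSortDemo is a hypothetical helper class, not part of the job):

package com.lzz.twoOrder;

import java.util.Arrays;

public class OrderBeanSortDemo {
    public static void main(String[] args) {
        OrderBean[] beans = {
            new OrderBean(1, 222.8), new OrderBean(1, 25.8),
            new OrderBean(2, 522.8), new OrderBean(2, 122.4), new OrderBean(2, 722.4),
            new OrderBean(3, 222.8), new OrderBean(3, 33.8)
        };
        Arrays.sort(beans); // id ascending, price descending within an id
        for (OrderBean bean : beans) {
            System.out.println(bean); // 1 222.8 comes before 1 25.8, and so on
        }
    }
}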
(2) Write TwoOrderMapper
package com.lzz.twoOrder;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TwoOrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {

    // reused across map() calls; safe because context.write() serializes the key immediately
    OrderBean k = new OrderBean();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // input line: orderId \t productId \t amount
        String line = value.toString();
        String[] words = line.split("\t");
        k.setOrderId(Long.parseLong(words[0]));
        k.setOrderPrice(Double.parseDouble(words[2]));
        // the bean is the whole key; no separate value is needed
        context.write(k, NullWritable.get());
    }
}
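As a quick check of the parsing, the first sample line becomes the composite key (1, 222.8) (MapParseDemo is a hypothetical demo class):

package com.lzz.twoOrder;

public class MapParseDemo {
    public static void main(String[] args) {
        String line = "0000001\tPdt_01\t222.8";
        String[] words = line.split("\t");
        OrderBean k = new OrderBean(Long.parseLong(words[0]), Double.parseDouble(words[2]));
        System.out.println(k); // prints: 1	222.8
    }
}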
(3) Write TwoOrderPartitioner
The partition formula follows the hashCode-based pattern of Hadoop's default HashPartitioner (Ctrl+T in Eclipse shows the Partitioner type hierarchy), but hashes on the order id alone so that all records of one order land in the same partition.
package com.lzz.twoOrder;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class TwoOrderPartitioner extends Partitioner<OrderBean, NullWritable> {

    @Override
    public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
        // partition on the order id alone, so every record of an order goes to
        // the same reduce task; "& Integer.MAX_VALUE" keeps the result non-negative
        return (int) (key.getOrderId() & Integer.MAX_VALUE) % numPartitions;
    }
    // At this point the data is sorted and partitioned. With the 3 reduce
    // tasks set in the driver, orderId % 3 gives:
    //   partition 0: 3  222.8 / 3  33.8
    //   partition 1: 1  222.8 / 1  25.8
    //   partition 2: 2  722.4 / 2  522.8 / 2  122.4
}
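A quick sanity check of that assignment (PartitionDemo is a hypothetical demo class; 3 matches the reduce-task count set in the driver):

package com.lzz.twoOrder;

import org.apache.hadoop.io.NullWritable;

public class PartitionDemo {
    public static void main(String[] args) {
        TwoOrderPartitioner partitioner = new TwoOrderPartitioner();
        for (long id : new long[] {1, 2, 3}) {
            OrderBean bean = new OrderBean(id, 0.0);
            System.out.println("order " + id + " -> partition "
                    + partitioner.getPartition(bean, NullWritable.get(), 3));
        }
        // order 1 -> partition 1, order 2 -> partition 2, order 3 -> partition 0
    }
}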
(4) Write OrderGroupingComparator
package com.lzz.twoOrder;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class OrderGroupingComparator extends WritableComparator {

    public OrderGroupingComparator() {
        // register the key class; "true" creates instances for deserialization
        super(OrderBean.class, true);
    }

    // group solely by order id: keys with equal ids compare as 0, so all
    // records of one order are fed to a single reduce() call
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderBean aBean = (OrderBean) a;
        OrderBean bBean = (OrderBean) b;
        return Long.compare(aBean.getOrderId(), bBean.getOrderId());
    }
}
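A small check that the comparator ignores the price, which is exactly what lets all records of one order fall into a single group (GroupingDemo is a hypothetical demo class):

package com.lzz.twoOrder;

public class GroupingDemo {
    public static void main(String[] args) {
        OrderGroupingComparator comparator = new OrderGroupingComparator();
        OrderBean max = new OrderBean(2, 722.4);
        OrderBean other = new OrderBean(2, 122.4);
        // 0 means "same group", even though the prices differ
        System.out.println(comparator.compare(max, other)); // prints 0
    }
}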
(5) Write TwoOrderReducer
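The driver below expects a TwoOrderReducer; a minimal sketch of one follows (the class name comes from the driver; the body is the standard take-the-first-key-per-group pattern):

package com.lzz.twoOrder;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class TwoOrderReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {

    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // the grouping comparator puts all records of one order into this group,
        // sorted by price descending, so the current key is the most expensive item
        context.write(key, NullWritable.get());
    }
}

(6) Write TwoOrderDriver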
package com.lzz.twoOrder;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TwoOrderDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(TwoOrderDriver.class);
        job.setMapperClass(TwoOrderMapper.class);
        job.setReducerClass(TwoOrderReducer.class);

        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        // group keys on the reduce side by order id only
        job.setGroupingComparatorClass(OrderGroupingComparator.class);
        // partition by order id
        job.setPartitionerClass(TwoOrderPartitioner.class);
        // number of reduce tasks
        job.setNumReduceTasks(3);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
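With the sample input and three reduce tasks, each order lands in its own output file; the expected result (following the partition assignment sketched above) would be:

part-r-00000: 3	222.8
part-r-00001: 1	222.8
part-r-00002: 2	722.4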