3.3 Finding the most expensive item in each order (GroupingComparator)
1) Requirement
Given the order data below, find the most expensive item in each order.
2) Input data
Order_0000001 Pdt_01 222.8
Order_0000002 Pdt_05 722.4
Order_0000001 Pdt_05 25.8
Order_0000003 Pdt_01 222.8
Order_0000003 Pdt_01 33.8
Order_0000002 Pdt_03 522.8
Order_0000002 Pdt_04 122.4
Expected output (one order per output file):
part-r-00000:
Order_0000001 222.8
part-r-00001:
Order_0000002 722.4
part-r-00002:
Order_0000003 222.8
3) Analysis
(1) Use a composite key of order id and price. Map output can then be partitioned by order id and sorted by order id (ascending) and price (descending), so that all records of the same order reach the same reducer with the most expensive item first.
(2) On the reduce side, use a GroupingComparator to gather keys with the same order id into one group; the first key of each group is then the maximum.
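For example, based on the sample input above, the three Order_0000002 records produce the composite keys sketched below. After the shuffle they reach one reducer already sorted, and the grouping comparator merges them into a single reduce() call:

(Order_0000002, 722.4)   <- first key of the group = the maximum
(Order_0000002, 522.8)   same group (equal order ids)
(Order_0000002, 122.4)   same group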
4) Implementation
Define the order bean, OrderBean:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class OrderBean implements WritableComparable<OrderBean> {

    private String orderId; // order id
    private Double price;   // item price

    public OrderBean() {
        super();
    }

    public OrderBean(String orderId, Double price) {
        super();
        this.orderId = orderId;
        this.price = price;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public Double getPrice() {
        return price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeDouble(price);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.price = in.readDouble();
    }

    @Override
    public int compareTo(OrderBean o) {
        // Two-level sort:
        // 1 by order id, ascending
        int comResult = this.orderId.compareTo(o.getOrderId());
        if (comResult == 0) {
            // 2 by price, descending; Double.compare also handles
            // equal prices consistently, unlike a bare > ternary
            comResult = Double.compare(o.getPrice(), this.price);
        }
        return comResult;
    }

    @Override
    public String toString() {
        return orderId + "\t" + price;
    }
}
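As a quick sanity check of the sort order, a hypothetical standalone snippet (not part of the job) can exercise compareTo directly; within one order the higher price should sort first:

public class OrderBeanSortCheck {
    public static void main(String[] args) {
        OrderBean cheap = new OrderBean("Order_0000002", 522.8);
        OrderBean dear = new OrderBean("Order_0000002", 722.4);
        // positive: within the same order, the cheaper record sorts after the dearer one
        System.out.println(cheap.compareTo(dear) > 0); // true
        // different order ids compare lexicographically, regardless of price
        System.out.println(new OrderBean("Order_0000001", 25.8).compareTo(dear) < 0); // true
    }
}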
Write the OrderMapper:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {

    // reused across map() calls; safe because context.write() serializes the key immediately
    OrderBean bean = new OrderBean();

    @Override
    protected void map(LongWritable key, Text value,
            Context context) throws IOException, InterruptedException {

        // 1 read one line
        String line = value.toString();

        // 2 split the tab-separated fields
        // e.g. Order_0000002  Pdt_03  522.8
        String[] fields = line.split("\t");

        // 3 populate the bean (the product id, fields[1], is not needed)
        bean.setOrderId(fields[0]);
        bean.setPrice(Double.parseDouble(fields[2]));

        // 4 emit
        context.write(bean, NullWritable.get());
    }
}
Write the OrderReducer:

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class OrderReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {

    @Override
    protected void reduce(OrderBean bean, Iterable<NullWritable> values,
            Context context) throws IOException, InterruptedException {

        // Keys arrive price-descending, and the grouping comparator merges all
        // records of one order into this single call, so the key as seen here
        // is the order's maximum. Write it out.
        context.write(bean, NullWritable.get());
    }
}
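One caveat worth noting: if the reducer were to iterate over values, the fields of bean would change underneath it, because Hadoop deserializes each successive record of the group into the same key object. Writing the key before touching the iterator is what guarantees the maximum is emitted.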
Write the OrderDriver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OrderDriver {

    public static void main(String[] args) throws Exception {

        // 1 get the job configuration
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2 set the jar to ship with the job
        job.setJarByClass(OrderDriver.class);

        // 3 register the mapper and reducer
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReducer.class);

        // 4 map output key/value types
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        // 5 final output key/value types
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7 register the grouping comparator
        job.setGroupingComparatorClass(OrderGroupingComparator.class);

        // 8 register the partitioner
        job.setPartitionerClass(OrderPartitioner.class);

        // 9 number of reduce tasks (one per order id in the sample data)
        job.setNumReduceTasks(3);

        // 10 submit and wait
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
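After packaging, the job can be submitted in the usual way, e.g. hadoop jar order.jar OrderDriver /order/input /order/output (the jar name and HDFS paths here are placeholders).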
Write the OrderPartitioner:

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class OrderPartitioner extends Partitioner<OrderBean, NullWritable> {

    @Override
    public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
        // Partition by the hashCode of the order id only, so every record of an
        // order goes to the same reducer. Masking with Integer.MAX_VALUE clears
        // the sign bit, so a negative hashCode cannot yield a negative partition.
        return (key.getOrderId().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
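A minimal standalone illustration (hypothetical, not part of the job) of why the mask matters: Java's % follows the sign of the left operand, so an unmasked negative hashCode would produce a negative, and therefore illegal, partition number.

public class PartitionSignDemo {
    public static void main(String[] args) {
        int negativeHash = -7; // stand-in for a String hashCode that happens to be negative
        System.out.println(negativeHash % 3); // -1: illegal as a partition number
        System.out.println((negativeHash & Integer.MAX_VALUE) % 3); // 1: clearing the sign bit keeps it valid
    }
}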
Write the OrderGroupingComparator:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class OrderGroupingComparator extends WritableComparator {

    // no-arg constructor; the 'true' flag tells WritableComparator to create
    // OrderBean instances so compare() receives deserialized objects
    public OrderGroupingComparator() {
        super(OrderBean.class, true);
    }

    // override the comparison used for grouping on the reduce side
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderBean aBean = (OrderBean) a;
        OrderBean bBean = (OrderBean) b;
        // compare by order id only: keys with the same order id form one group
        return aBean.getOrderId().compareTo(bBean.getOrderId());
    }
}
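Without this comparator, grouping would fall back to OrderBean.compareTo(), which also compares prices; every distinct (orderId, price) key would then form its own group, and the reducer would emit every record rather than only the maximum per order.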