二次排序针对归约阶段对与某个键关联的值排序。MapReduce
框架会自动对映射器生成的键完成排序,在启动归约器之前,映射器生成的中间键-值对必然是按键有序的,但值不是有序的。
如下例子:考虑一个科学实验得到的温度数据,包括 year、month、day 和当天温度 temperature:
2012,01,01,5
2000,12,04,10
2000,11,01,20
2000,12,02,-20
...
2012,12,21,30
2013,01,22,80
希望输出每一个"年-月"的温度,而且按值升序
2012-01:5,10,35...
2005-08:34,50,52...
package cn.weida.hadoop.SecondarySortDriver;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.sun.swing.internal.plaf.metal.resources.metal_zh_TW;
/**
* 二次排序针对归约阶段对与某个键关联的值排序 MapReduce
* 框架会自动对映射器生成的键完成排序,在启动归约器之前,映射器生成的中间键-值对必然时按键有序的,值不是有序的。
* 如下例子:考虑一个可惜实验得到的温度数据。包括year,month,day 和当天温度temperature 2012,01,01,5
* 2000,12,04,10
* 2000,11,01,20
* 2000,12,02,-20
* ...
* 2012,12,21,30
* 2013,01,22,80
* 希望输出每一个"年-月"的温度,而且按值升序
* 2012-01:5,10,35...
* 2005-08:34,50,52...
* @author
*
*/
public class SecondarySortDriver extends Configured implements Tool {
@Override
public int run(String[] arg0) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = getConf();
Job job = new Job(conf);
job.setJarByClass(SecondarySortDriver.class);
job.setJobName("SecondarySortDriver");
Path inputPath = new Path(arg0[0]); //输入文件
//System.out.println(arg0[0]);
Path outputPath = new Path(arg0[1]); //输出文件
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
job.setOutputKeyClass(DateTemperaturePair.class); //设置输出键类型
job.setOutputValueClass(NullWritable.class); //设置输出值类型
job.setMapperClass(SecondarySortMapper.class); //设置mapper
job.setReducerClass(SecondaryReducer.class); //设置Reducer
job.setPartitionerClass(DateTemperaturePartutuoner.class); //设置分区器程序调用此类中的方法确定同一year-month发往同一个Reducer
job.setGroupingComparatorClass(YearMonthGropingComparator.class); //设置分组比较器 ,确定同一个year-month 内的顺序
boolean status = job.waitForCompletion(true);
return status ? 0 : 1;
}
public static void main(String[] args) {
if (args.length != 2) {
System.out.println(args.length);
throw new IllegalArgumentException("Usage : SecondaySortDriver <input-paht> <outpit-path>");
}
int returnStatus;
try {
returnStatus = ToolRunner.run(new SecondarySortDriver(), args);
System.exit(returnStatus);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.exit(-1);
}
}
}
DateTemperaturePair类
package cn.weida.hadoop.SecondarySortDriver;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
/**
 * Composite key for the secondary sort: the year-month is the natural key and the
 * temperature is carried inside the key so the framework's sort orders it too.
 *
 * <p>Ordering is by year-month first and by temperature second, both ascending. The day is
 * stored and serialized but does not take part in the comparison.
 */
public class DateTemperaturePair implements Writable, WritableComparable<DateTemperaturePair> {
    private Text yearMonth = new Text();
    private Text day = new Text();
    private IntWritable temperature = new IntWritable();

    /**
     * Compares by year-month, then by temperature, in ascending order.
     *
     * @param pair the key to compare against
     * @return negative, zero or positive as this key sorts before, equal to or after
     *         {@code pair}
     */
    @Override
    public int compareTo(DateTemperaturePair pair) {
        int compareValue = this.yearMonth.compareTo(pair.getYearMonth());
        if (compareValue == 0) {
            compareValue = temperature.compareTo(pair.getTemperature());
        }
        // Returning the natural comparison yields ascending order; negating it (as the
        // previous version did) sorts descending.
        return compareValue;
    }

    public Text getYearMonth() {
        return yearMonth;
    }

    public void setYearMonth(Text yearMonth) {
        this.yearMonth = yearMonth;
    }

    public Text getDay() {
        return day;
    }

    public void setDay(Text day) {
        this.day = day;
    }

    public IntWritable getTemperature() {
        return temperature;
    }

    public void setTemperature(IntWritable temperature) {
        this.temperature = temperature;
    }

    /**
     * Restores the fields in the same order {@link #write(DataOutput)} emits them.
     *
     * @param arg0 stream to read the serialized key from
     * @throws IOException if the stream cannot be read
     */
    @Override
    public void readFields(DataInput arg0) throws IOException {
        yearMonth.readFields(arg0);
        day.readFields(arg0);
        temperature.readFields(arg0);
    }

    /**
     * Serializes year-month, day and temperature, in that order. Without this the key
     * would reach the sort, partition and reduce phases with empty fields.
     *
     * @param arg0 stream to write the key to
     * @throws IOException if the stream cannot be written
     */
    @Override
    public void write(DataOutput arg0) throws IOException {
        yearMonth.write(arg0);
        day.write(arg0);
        temperature.write(arg0);
    }

    /**
     * Renders only the year-month, the part of the key that identifies a group, so the
     * key prints as a readable label instead of an object identity.
     */
    @Override
    public String toString() {
        return yearMonth.toString();
    }
}
DateTemperaturePartutuoner类
package cn.weida.hadoop.SecondarySortDriver;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
* 分区器
* @author
*
*/
public class DateTemperaturePartutuoner extends Partitioner<DateTemperaturePair, Text>{
@Override
public int getPartition(DateTemperaturePair pair, Text text, int numberOfPartition) {
//确定分区数非负
return Math.abs(pair.getYearMonth().hashCode()%numberOfPartition);
}
}
SecondaryReducer类
package cn.weida.hadoop.SecondarySortDriver;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.yarn.webapp.ForbiddenException;
/**
* 将传进来的温度值用","隔开
* @author
*
*/
public class SecondaryReducer extends Reducer<Writable, IntWritable, Writable, Text>{
protected void reduce(Writable key, Iterable<IntWritable> value,OutputCollector<Writable, Text> output) throws IOException, InterruptedException {
// TODO Auto-generated method stub
StringBuilder sortedTemperatureList = new StringBuilder();
for (IntWritable temperature : value) {
sortedTemperatureList.append(temperature.toString());
sortedTemperatureList.append(",");
}
output.collect(key, new Text(sortedTemperatureList.toString()));
}
}
SecondarySortMapper类
package cn.weida.hadoop.SecondarySortDriver;
import java.io.IOException;
import org.apache.commons.collections.iterators.CollatingIterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Mapper;
/**
* 将输入进行切分
* 使用DateTemperaturePair对象封装键和值
* @author
*
*/
public class SecondarySortMapper extends Mapper<IntWritable, Text, Writable, IntWritable>{
protected void map(IntWritable key, Text value,OutputCollector<Writable, IntWritable> output)
throws IOException, InterruptedException {
String[] tokens =value.toString().split(",");
String yearMonth = tokens[0]+tokens[1];
String day = tokens[2];
int temperature = Integer.parseInt(tokens[3]);
DateTemperaturePair reducerkey = new DateTemperaturePair();
reducerkey.setYearMonth(new Text(yearMonth));
reducerkey.setDay(new Text(day));
reducerkey.setTemperature(new IntWritable(temperature));
output.collect(reducerkey, new IntWritable(temperature));
}
}
YearMonthGropingComparator类
package cn.weida.hadoop.SecondarySortDriver;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* 分组比较器
* 调用pair.getYearMonth().compareTo
* @author
*
*/
public class YearMonthGropingComparator extends WritableComparator{
public YearMonthGropingComparator() {
// TODO Auto-generated constructor stub
super(DateTemperaturePair.class,true);
}
/**
* 这个比较器控制哪些键要分组到一个reduce()
*/
public int compare(WritableComparable wc1, WritableComparable wc2) {
// TODO Auto-generated method stub
DateTemperaturePair pair = (DateTemperaturePair)wc1;
DateTemperaturePair pair2= (DateTemperaturePair)wc2;
return pair.getYearMonth().compareTo(pair2.getYearMonth());
}
}