数据算法(二次排序):对温度数据排序

二次排序是指在归约阶段对与某个键关联的值进行排序。MapReduce
 框架会自动对映射器生成的键完成排序:在启动归约器之前,映射器生成的中间键-值对必然是按键有序的,但值不是有序的。
 如下例子:考虑一个科学实验得到的温度数据,每条记录包括 year、month、day 和当天温度 temperature。

 

2012,01,01,5

 2000,12,04,10

 2000,11,01,20 

 2000,12,02,-20
 ... 
  2012,12,21,30 
  2013,01,22,80 
  希望输出每一个"年-月"的温度,而且按值升序
  2012-01:5,10,35...
  2005-08:34,50,52...

package cn.weida.hadoop.SecondarySortDriver;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.sun.swing.internal.plaf.metal.resources.metal_zh_TW;

/**
 * Driver that configures and submits the secondary-sort MapReduce job.
 *
 * <p>The MapReduce framework sorts intermediate records by key only; values that
 * reach a reducer are not ordered. This job sorts the temperatures belonging to
 * each year-month by placing the temperature into the composite map-output key
 * ({@link DateTemperaturePair}), partitioning and grouping on the year-month part
 * only, and letting the framework's key sort order the temperatures.
 *
 * <p>Input lines have the form {@code year,month,day,temperature}, for example
 * {@code 2012,01,01,5}. The job writes one line per year-month containing its
 * temperatures.
 */
public class SecondarySortDriver extends Configured implements Tool {

	private static final String USAGE = "Usage : SecondarySortDriver <input-path> <output-path>";

	/**
	 * Builds the job, runs it and waits for completion.
	 *
	 * @param arg0 the arguments left after generic Hadoop options were removed:
	 *             {@code arg0[0]} is the input path, {@code arg0[1]} the output path
	 * @return 0 if the job succeeded, 1 otherwise
	 * @throws IllegalArgumentException if exactly two arguments are not supplied
	 */
	@Override
	public int run(String[] arg0) throws IOException, ClassNotFoundException, InterruptedException {
		// Validate here rather than in main(): ToolRunner has already consumed
		// generic options (-D, -conf, ...) by the time run() is invoked.
		if (arg0 == null || arg0.length != 2) {
			throw new IllegalArgumentException(USAGE);
		}

		// Job.getInstance replaces the deprecated Job(Configuration) constructor.
		Job job = Job.getInstance(getConf(), "SecondarySortDriver");
		job.setJarByClass(SecondarySortDriver.class);

		FileInputFormat.setInputPaths(job, new Path(arg0[0]));
		FileOutputFormat.setOutputPath(job, new Path(arg0[1]));

		// The intermediate (map output) types differ from the final output types,
		// so both pairs must be declared explicitly. If the map output classes are
		// left unset they default to the final output classes.
		job.setMapOutputKeyClass(DateTemperaturePair.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		job.setMapperClass(SecondarySortMapper.class);
		job.setReducerClass(SecondaryReducer.class);
		// Partitioner: routes every record of one year-month to the same reducer.
		job.setPartitionerClass(DateTemperaturePartutuoner.class);
		// Grouping comparator: decides which keys share a single reduce() call.
		job.setGroupingComparatorClass(YearMonthGropingComparator.class);

		return job.waitForCompletion(true) ? 0 : 1;
	}

	/**
	 * Command-line entry point; exits with the job status, or -1 if an exception
	 * (including an argument error raised by {@link #run}) occurs.
	 */
	public static void main(String[] args) {
		try {
			int returnStatus = ToolRunner.run(new SecondarySortDriver(), args);
			System.exit(returnStatus);
		} catch (Exception e) {
			e.printStackTrace();
			System.exit(-1);
		}
	}

}

DateTemperaturePair类

package cn.weida.hadoop.SecondarySortDriver;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

/**
 * Composite map-output key holding a year-month, a day and a temperature.
 *
 * <p>The natural order compares the year-month first and the temperature second,
 * both ascending, so records of one year-month arrive at the reducer with their
 * temperatures in ascending order. The day does not take part in the ordering.
 */
public class DateTemperaturePair implements Writable,WritableComparable<DateTemperaturePair>{

	private Text yearMonth = new Text();
	private Text day = new Text();
	private IntWritable temperature = new IntWritable();

	/**
	 * Orders by year-month, then by temperature, both ascending.
	 *
	 * @param pair the key to compare against
	 * @return a negative, zero or positive value when this key sorts before,
	 *         equal to, or after {@code pair}
	 */
	@Override
	public int compareTo(DateTemperaturePair pair) {
		int byYearMonth = this.yearMonth.compareTo(pair.getYearMonth());
		if (byYearMonth != 0) {
			return byYearMonth;
		}
		// Returning the comparison as-is yields ascending order; negate it
		// (e.g. compare in the opposite direction) for descending order.
		return this.temperature.compareTo(pair.getTemperature());
	}

	public Text getYearMonth() {
		return yearMonth;
	}

	public void setYearMonth(Text yearMonth) {
		this.yearMonth = yearMonth;
	}

	public Text getDay() {
		return day;
	}

	public void setDay(Text day) {
		this.day = day;
	}

	public IntWritable getTemperature() {
		return temperature;
	}

	public void setTemperature(IntWritable temperature) {
		this.temperature = temperature;
	}

	/**
	 * Restores the three fields from {@code arg0}, in the order used by
	 * {@link #write(DataOutput)}.
	 */
	@Override
	public void readFields(DataInput arg0) throws IOException {
		yearMonth.readFields(arg0);
		day.readFields(arg0);
		temperature.readFields(arg0);
	}

	/**
	 * Serializes year-month, day and temperature, in that order. Without this the
	 * key would carry no data once it has been written out and read back.
	 */
	@Override
	public void write(DataOutput arg0) throws IOException {
		yearMonth.write(arg0);
		day.write(arg0);
		temperature.write(arg0);
	}

	/** Returns {@code yearMonth,day,temperature}, intended for logs and debugging. */
	@Override
	public String toString() {
		return yearMonth + "," + day + "," + temperature;
	}

}

DateTemperaturePartutuoner类

package cn.weida.hadoop.SecondarySortDriver;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitioner that chooses the partition from the year-month part of the key
 * only, so all records of one year-month land in the same partition whatever
 * their temperature.
 *
 * <p>The value is never inspected, so its type parameter is {@code Object}.
 * Declaring a specific value type that differs from what the mapper emits would
 * fail with a {@code ClassCastException} in the compiler-generated bridge method.
 */
public class DateTemperaturePartutuoner extends Partitioner<DateTemperaturePair, Object>{

	/**
	 * @param pair              the composite key; only its year-month is used
	 * @param value             the map output value, ignored
	 * @param numberOfPartition the number of partitions
	 * @return a partition index in {@code [0, numberOfPartition)}
	 */
	@Override
	public int getPartition(DateTemperaturePair pair, Object value, int numberOfPartition) {
		// The remainder lies strictly between -numberOfPartition and
		// numberOfPartition, so taking abs of it cannot overflow and the result
		// is never negative.
		int remainder = pair.getYearMonth().hashCode() % numberOfPartition;
		return Math.abs(remainder);
	}

}

SecondaryReducer类

package cn.weida.hadoop.SecondarySortDriver;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.yarn.webapp.ForbiddenException;

/**
 * 将传进来的温度值用","隔开
 * @author 
 *
 */
public class SecondaryReducer extends Reducer<Writable, IntWritable, Writable, Text>{
	
 protected void reduce(Writable key, Iterable<IntWritable> value,OutputCollector<Writable, Text> output) throws IOException, InterruptedException {
		// TODO Auto-generated method stub
	 StringBuilder sortedTemperatureList = new StringBuilder();
		for (IntWritable temperature : value) {
			sortedTemperatureList.append(temperature.toString());
			sortedTemperatureList.append(",");
		}
		output.collect(key, new Text(sortedTemperatureList.toString()));
			
 }

	
}

SecondarySortMapper类

package cn.weida.hadoop.SecondarySortDriver;

import java.io.IOException;

import org.apache.commons.collections.iterators.CollatingIterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * 将输入进行切分
 * 使用DateTemperaturePair对象封装键和值
 * @author 
 *
 */
public class SecondarySortMapper extends Mapper<IntWritable, Text, Writable, IntWritable>{

	protected void map(IntWritable key, Text value,OutputCollector<Writable, IntWritable> output)
			throws IOException, InterruptedException {
		String[] tokens =value.toString().split(",");
		String yearMonth = tokens[0]+tokens[1];
		String day = tokens[2];
		int temperature = Integer.parseInt(tokens[3]);
		DateTemperaturePair reducerkey = new DateTemperaturePair();
		reducerkey.setYearMonth(new Text(yearMonth));
		reducerkey.setDay(new Text(day));
		reducerkey.setTemperature(new IntWritable(temperature));
		output.collect(reducerkey, new IntWritable(temperature));
		
	}

	
}

YearMonthGropingComparator类

package cn.weida.hadoop.SecondarySortDriver;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * 分组比较器
 * 调用pair.getYearMonth().compareTo
 * @author 
 *
 */
public class YearMonthGropingComparator extends WritableComparator{

	public YearMonthGropingComparator() {
		// TODO Auto-generated constructor stub
		super(DateTemperaturePair.class,true);
	}
	/**
	 * 这个比较器控制哪些键要分组到一个reduce()
	 */
	public int compare(WritableComparable wc1, WritableComparable wc2) {
		// TODO Auto-generated method stub
		DateTemperaturePair pair = (DateTemperaturePair)wc1;
		DateTemperaturePair pair2= (DateTemperaturePair)wc2;
		return pair.getYearMonth().compareTo(pair2.getYearMonth());
	}
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值