mapreduce笔记

最新推荐文章于 2024-06-20 08:04:37 发布

转载最新推荐文章于 2024-06-20 08:04:37 发布 · 283 阅读

一、InputSplit

InputSplit是指分片，在MapReduce作业中，它是map task的最小输入单位。分片是基于文件而来的概念，通俗地理解，就是一个文件可以切分为多少个片段，每个片段包括了<文件名，开始位置，长度，位于哪些主机>等信息。MapTask拿到这些分片后，就知道从哪里开始读取数据。

二、处理阶段

input -> map -> partition -> sort -> combine（到这里属于map task）-> shuffle -> reduce -> output（这部分属于reduce task）

三、排序 [first,second]

package mr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that sorts comma-separated two-column numeric records by the
 * first column and then by the second column.
 *
 * <p>The mapper wraps both columns in a composite key ({@link MyNewKey}); the
 * ordering of the reducer input is therefore driven by
 * {@link MyNewKey#compareTo(MyNewKey)}. The reducer writes each distinct
 * (first, second) pair once.
 */
public class SortTest {
	/**
	 * Composite map-output key holding both columns of a record. It is ordered
	 * by {@code firstNum}, with ties broken by {@code secondNum}.
	 */
	private static class MyNewKey implements WritableComparable<MyNewKey> {
		long firstNum;
		long secondNum;

		/**
		 * No-argument constructor; the fields are populated afterwards through
		 * {@link #readFields(DataInput)}.
		 */
		public MyNewKey() {
		}

		/**
		 * @param firstNum  value of the first column (primary sort field)
		 * @param secondNum value of the second column (secondary sort field)
		 */
		public MyNewKey(long firstNum, long secondNum) {
			this.firstNum = firstNum;
			this.secondNum = secondNum;
		}

		/** Serializes the key as two longs: {@code firstNum} then {@code secondNum}. */
		@Override
		public void write(DataOutput out) throws IOException {
			out.writeLong(firstNum);
			out.writeLong(secondNum);
		}

		/** Reads the two longs back in the same order {@link #write(DataOutput)} emitted them. */
		@Override
		public void readFields(DataInput in) throws IOException {
			firstNum = in.readLong();
			secondNum = in.readLong();
		}

		/**
		 * Called whenever keys are sorted. Orders by {@code firstNum} and, when
		 * those are equal, by {@code secondNum}.
		 *
		 * @param anotherKey the key to compare against
		 * @return a negative value, zero, or a positive value when this key is
		 *         less than, equal to, or greater than {@code anotherKey}
		 */
		@Override
		public int compareTo(MyNewKey anotherKey) {
			// Demonstration output showing every comparison made while sorting.
			System.out.println();
			System.out.println("sorting!!!");
			System.out.println("thisone:"+firstNum+","+secondNum+"anotherone:"+anotherKey.firstNum+","+anotherKey.secondNum);
			// Long.compare instead of casting a long difference to int: the cast
			// truncates, so large differences could flip sign or collapse to 0.
			int byFirst = Long.compare(firstNum, anotherKey.firstNum);
			if (byFirst != 0) {
				// First columns differ, so they alone decide the order.
				return byFirst;
			}
			return Long.compare(secondNum, anotherKey.secondNum);
		}

		/** Two keys are equal exactly when both columns match (consistent with {@code compareTo}). */
		@Override
		public boolean equals(Object obj) {
			if (this == obj) {
				return true;
			}
			if (!(obj instanceof MyNewKey)) {
				return false;
			}
			MyNewKey other = (MyNewKey) obj;
			return firstNum == other.firstNum && secondNum == other.secondNum;
		}

		/**
		 * Value-based hash derived only from the two fields, so equal keys hash
		 * identically in every JVM (hash-based partitioning relies on this).
		 */
		@Override
		public int hashCode() {
			int result = (int) (firstNum ^ (firstNum >>> 32));
			result = 31 * result + (int) (secondNum ^ (secondNum >>> 32));
			return result;
		}
	}

	/**
	 * Parses each input line of the form {@code first,second} and emits a
	 * {@link MyNewKey} built from both numbers, with the second number as value.
	 */
	public static class MyMapper extends Mapper<LongWritable, Text, MyNewKey, LongWritable> {

		/**
		 * @param key     input key supplied by the input format (not used here)
		 * @param value   one line of input text, expected as {@code first,second}
		 * @param context sink for the emitted (composite key, second number) pair
		 * @throws NumberFormatException          if a column is not a valid long
		 * @throws ArrayIndexOutOfBoundsException if the line has fewer than two columns
		 */
		@Override
		protected void map(LongWritable key, Text value,
				Context context)
				throws java.io.IOException, InterruptedException {

			String[] spilted = value.toString().split(",");
			long firstNum = Long.parseLong(spilted[0]);
			long secondNum = Long.parseLong(spilted[1]);
			// Use the composite type as the key so both columns take part in sorting.
			MyNewKey newKey = new MyNewKey(firstNum, secondNum);
			context.write(newKey, new LongWritable(secondNum));
			System.out.println("mapping~");
		}
	}

	/**
	 * Unpacks each composite key back into its two columns and writes them as
	 * the output key and value.
	 */
	public static class MyReducer extends Reducer<MyNewKey, LongWritable, LongWritable, LongWritable> {

		/**
		 * Writes one output record per call; the grouped {@code values} are not
		 * read, so repeated identical input pairs produce a single output line.
		 *
		 * @param key     composite key carrying both columns
		 * @param values  values grouped under {@code key} (ignored)
		 * @param context sink for the (first, second) output pair
		 */
		@Override
		protected void reduce(MyNewKey key, java.lang.Iterable<LongWritable> values,
				Context context)
				throws java.io.IOException, InterruptedException {

			context.write(new LongWritable(key.firstNum), new LongWritable(key.secondNum));
			System.out.println("reducing~");
		}
	}

	private static final String INPUT_PATH = "hdfs://master:9000/input/cp.txt";
	private static final String OUTPUT_PATH = "hdfs://master:9000/output/c";

	/**
	 * Configures and runs the sort job: removes a pre-existing output
	 * directory, wires up mapper/reducer and key/value types, then waits for
	 * the job to finish. Exits with status 1 if the job reports failure.
	 *
	 * @param args command-line arguments (unused; paths are fixed constants)
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();

		// Remove the output directory left by a previous run before submitting
		// the job again.
		Path outputPath = new Path(OUTPUT_PATH);
		FileSystem fs = outputPath.getFileSystem(conf);
		if (fs.exists(outputPath)) {
			// Recursive delete: the output location is a directory of part files.
			fs.delete(outputPath, true);
		}

		Job job = new Job(conf, "myjob");
		job.setJarByClass(SortTest.class);
		job.setMapperClass(MyMapper.class);
		job.setReducerClass(MyReducer.class);
		//job.setCombinerClass(MyReducer.class);

		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(LongWritable.class);

		// Map output types differ from the final output types, so they must be
		// declared separately.
		job.setMapOutputKeyClass(MyNewKey.class);
		job.setMapOutputValueClass(LongWritable.class);

		FileInputFormat.addInputPath(job, new Path(INPUT_PATH));
		FileOutputFormat.setOutputPath(job, outputPath);

		// Surface job failure to the caller instead of always exiting with 0.
		if (!job.waitForCompletion(true)) {
			System.exit(1);
		}
	}
}