Partitions


  1. The number of partitions matches the number of reducers; the business requirements determine how many partitions (and therefore reducers) are needed.

  2. By default a job's reduceTasks is set to 1, and each reduce task produces one output file.

  3. The number of reduce tasks falls into three cases:

    • ReduceTasks = 0: no reduce phase (map-only job)
    • ReduceTasks = 1: the default
    • ReduceTasks = n: output is spread across n files
  4. The partition function

    • Built on the abstract class org.apache.hadoop.mapreduce.Partitioner

    • The partitioner assigns records to partitions by key

    • The default HashPartitioner implements getPartition(), which returns the partition number for the current key, typically via a hash function:

      public int getPartition(K key, V value, int numReduceTasks) {
          return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
      }

  5. Custom partitioning: write your own implementation class of Partitioner

    The generic type parameters match the Mapper's output key/value types, and the class extends Partitioner.

    The return value selects the numeric suffix of the output file name: suffixes start at 0 and are zero-padded to 5 digits, so up to 100,000 output files are possible. A small demo of the default hash formula and this file-name mapping follows this list.
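
As a quick, hypothetical demo (not part of the original example code), the snippet below runs a few string keys through Hadoop's default HashPartitioner with three reduce tasks and prints which part-r-0000N file each key would end up in; the class name HashPartitionDemo and the sample keys are made up for illustration.

package num_Partitioner;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class HashPartitionDemo {
	public static void main(String[] args) {
		// The default partitioner applies (hashCode & Integer.MAX_VALUE) % numReduceTasks.
		HashPartitioner<Text, Text> partitioner = new HashPartitioner<>();
		int numReduceTasks = 3;
		for (String k : new String[] {"apple", "banana", "cherry"}) {
			int p = partitioner.getPartition(new Text(k), new Text(), numReduceTasks);
			// Partition p is handled by reducer p and written to the file part-r-0000p.
			System.out.println(k + " -> partition " + p + " -> part-r-0000" + p);
		}
	}
}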

1. Writing a custom Partitioner

1.1 Test data

9
32
65
24
52
84
83
38
58
4
65
45
38
20
24
38
69
67
12
65
69
77
71
60
95
43
31
28
36
74

1.2 Custom InputFormat

package num_Partitioner;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class LineNumInputFormat extends FileInputFormat<LongWritable, Text> {
	@Override
	public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
			throws IOException, InterruptedException {
		return new LineNumRecordReader();
	}
	// Disable splitting so each file is read by a single map task and the
	// line numbers produced by LineNumRecordReader stay consecutive.
	@Override
	protected boolean isSplitable(JobContext context, Path filename) {
		return false;
	}
}

1.3 Custom RecordReader

package num_Partitioner;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

public class LineNumRecordReader extends RecordReader<LongWritable, Text> {
	private long start;
	private long pos;
	private long end;
	private LineReader in;
	private FSDataInputStream fileIn;
	private LongWritable key;
	private Text value;
	
	@Override
	public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
		FileSplit _split = (FileSplit) split;
		Path file = _split.getPath();
		FileSystem fs = file.getFileSystem(context.getConfiguration());
		fileIn = fs.open(file);
		// Determine the split boundaries before seeking.
		start = _split.getStart();
		end = start + _split.getLength();
		fileIn.seek(start);
		in = new LineReader(fileIn);
		// Line numbers are 1-based.
		pos = 1;
	}
	
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		if (key == null) {
			key = new LongWritable();
		}
		key.set(pos);
		if (value == null) {
			value = new Text();
		}
		// readLine() returns 0 at end of stream.
		if (in.readLine(value) == 0) {
			return false;
		}
		pos++;
		return true;
	}
	
	@Override
	public LongWritable getCurrentKey() throws IOException, InterruptedException {
		return key;
	}
	
	@Override
	public Text getCurrentValue() throws IOException, InterruptedException {
		return value;
	}
	
	@Override
	public float getProgress() throws IOException, InterruptedException {
		// Progress reporting is not implemented for this simple reader.
		return 0;
	}
	
	@Override
	public void close() throws IOException {
		in.close();
	}
}

1.4 Custom Partitioner

package num_Partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class NumPartitioner extends Partitioner<IntWritable, IntWritable> {
	@Override
	public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
		// Route keys to three partitions: [0, 10) -> 0, [10, 20) -> 1, [20, ...) -> 2.
		if (key.get() >= 20) {
			return 2;
		} else if (key.get() >= 10) {
			return 1;
		} else {
			return 0;
		}
	}
}

1.5 Mapper

package num_Reducers;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class NumMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
	IntWritable _value = new IntWritable();
	@Override
	protected void map(LongWritable key, Text value,
			Mapper<LongWritable, Text, IntWritable, IntWritable>.Context context)
			throws IOException, InterruptedException {
		// Parse each line into an int and emit it as both key and value,
		// so NumPartitioner can bucket records by the numeric value.
		_value.set(Integer.parseInt(value.toString()));
		context.write(_value, _value);
	}
}

1.6 Reducer

package num_Reducers;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class NumReducer extends Reducer<IntWritable, IntWritable, IntWritable, NullWritable> {
	@Override
	protected void reduce(IntWritable key, Iterable<IntWritable> values,
			Reducer<IntWritable, IntWritable, IntWritable, NullWritable>.Context context)
			throws IOException, InterruptedException {
		// Emit the number once per occurrence (duplicates are preserved); the key itself is dropped.
		for (IntWritable value : values) {
			context.write(value, NullWritable.get());
		}
	}
}

1.7 Driver

package num_Reducers;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import num_Partitioner.LineNumInputFormat;
import num_Partitioner.NumPartitioner;

public class NumDriver {
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		conf.set("mapreduce.framework.name", "local");
		Path outPut = new Path("file:///D:/out");
		FileSystem fs = outPut.getFileSystem(conf);
		if(fs.exists(outPut)) {
			fs.delete(outPut, true);
		}
		Job job = Job.getInstance(conf);
		job.setJobName("age");
		job.setJarByClass(NumDriver.class);
		job.setMapperClass(NumMapper.class);
		job.setReducerClass(NumReducer.class);
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(NullWritable.class);
		job.setInputFormatClass(LineNumInputFormat.class);
		// Three reduce tasks, one per partition number returned by NumPartitioner.
		job.setNumReduceTasks(3);
		job.setPartitionerClass(NumPartitioner.class);
		FileInputFormat.addInputPath(job, new Path("file:///D:/age"));
		FileOutputFormat.setOutputPath(job, outPut);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

1.8 Result data

  1. part-r-00000

    4

    9

  2. part-r-00001

    12

  3. part-r-00002

    20
    24
    24
    28
    31
    32
    36
    38
    38
    38
    43
    45
    52
    58
    60
    65
    65
    65
    67
    69
    69
    71
    74
    77
    83
    84
    95

2. Running with no Reducer
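
A minimal sketch of a map-only variant of the driver above, reusing the same NumMapper and LineNumInputFormat; the class name MapOnlyNumDriver and the output path file:///D:/out_maponly are placeholders. With setNumReduceTasks(0) there is no shuffle, sort, or partitioning, and each map task writes its output directly to a part-m-xxxxx file.

package num_Reducers;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import num_Partitioner.LineNumInputFormat;

public class MapOnlyNumDriver {
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		conf.set("mapreduce.framework.name", "local");
		Path outPut = new Path("file:///D:/out_maponly");
		FileSystem fs = outPut.getFileSystem(conf);
		if (fs.exists(outPut)) {
			fs.delete(outPut, true);
		}
		Job job = Job.getInstance(conf);
		job.setJobName("age-map-only");
		job.setJarByClass(MapOnlyNumDriver.class);
		job.setMapperClass(NumMapper.class);
		// With no reduce phase, the map output types are the job's final output types.
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(IntWritable.class);
		job.setInputFormatClass(LineNumInputFormat.class);
		// Zero reduce tasks: no shuffle, no sort, no Partitioner; output goes to part-m-xxxxx files.
		job.setNumReduceTasks(0);
		FileInputFormat.addInputPath(job, new Path("file:///D:/age"));
		FileOutputFormat.setOutputPath(job, outPut);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}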
