Partitioning
- The number of partitions determines the number of reducers; business requirements determine how many partitions are needed.
- By default a job's reduceTasks is 1, and each reduce task produces one result file.
- The number of reduce tasks falls into several cases (a minimal sketch of setting it follows this list):
  - ReduceTasks: 0 [no reduce phase]
  - ReduceTasks: 1 [default]
  - ReduceTasks: n [output split across multiple files]
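As a minimal sketch (assuming a configured Job instance named job, like the one created in the Driver in section 1.7), the reduce-task count is chosen with setNumReduceTasks():

// Assumed: an org.apache.hadoop.mapreduce.Job instance named job already exists.
job.setNumReduceTasks(0); // no reduce phase: mapper output is written directly as part-m-* files
job.setNumReduceTasks(1); // default: one reducer, one result file part-r-00000
job.setNumReduceTasks(3); // n reducers: records are split into n partitions / n output files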
The partition function
- Based on the abstract class org.apache.hadoop.mapreduce.Partitioner.
- The partitioner assigns records to partitions by key.
- The default HashPartitioner provides the getPartition() method, which returns the partition number for the current key, usually computed with a hash function:
public int getPartition(K key, V value, int numReduceTasks) {
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
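For illustration only (this demo class is not part of the original notes), the built-in HashPartitioner can be called directly to see which of, say, 3 partitions a given key would be routed to:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class HashPartitionerDemo {
    public static void main(String[] args) {
        HashPartitioner<Text, IntWritable> partitioner = new HashPartitioner<>();
        // With 3 reduce tasks, each key maps deterministically to partition 0, 1, or 2.
        for (String k : new String[] {"apple", "banana", "cherry"}) {
            int p = partitioner.getPartition(new Text(k), new IntWritable(1), 3);
            System.out.println(k + " -> partition " + p);
        }
    }
}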
Custom partitioning
- Write an implementation class of Partitioner, i.e. extend Partitioner; its generic types are the same as the Mapper's output types.
- The return value determines the sequence number in the output file name; numbering starts at 0 and uses a 5-digit suffix, so up to 100,000 output files are possible.
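As a minimal sketch of those two rules (the class name SkeletonPartitioner and the chosen key/value types are only placeholders; the full worked example begins in 1.1 below):

package num_Partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// If the Mapper is declared as Mapper<LongWritable, Text, Text, IntWritable>,
// the Partitioner's generics must match the Mapper's OUTPUT types: <Text, IntWritable>.
public class SkeletonPartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // The returned number is the partition (and reducer) index: records sent to
        // partition 1 end up in part-r-00001, partition 2 in part-r-00002, and so on.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}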
1. Custom Partitioner
1.1 Test data
9
32
65
24
52
84
83
38
58
4
65
45
38
20
24
38
69
67
12
65
69
77
71
60
95
43
31
28
36
74
1.2 Custom InputFormat
package num_Partitioner;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// InputFormat whose key is the line number (starting at 1) and whose value is the line text.
public class LineNumInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new LineNumRecordReader();
    }

    // Keep each file in a single split so line numbers stay consecutive.
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}
1.3 Custom RecordReader
package num_Partitioner;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

// RecordReader that emits (line number, line text) pairs.
public class LineNumRecordReader extends RecordReader<LongWritable, Text> {

    private long start;
    private long pos;
    private long end;
    private LineReader in;
    private FSDataInputStream fileIn;
    private LongWritable key;
    private Text value;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit _split = (FileSplit) split;
        Path file = _split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        // Determine the split boundaries first, then open the file and seek to the split start.
        start = _split.getStart();
        end = start + _split.getLength();
        fileIn = fs.open(file);
        fileIn.seek(start);
        in = new LineReader(fileIn);
        pos = 1; // line numbers start at 1
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new LongWritable();
        }
        key.set(pos);
        if (value == null) {
            value = new Text();
        }
        // readLine() returns the number of bytes read; 0 means end of input.
        if (in.readLine(value) == 0) {
            return false;
        }
        pos++;
        return true;
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // Progress reporting is not implemented for this simple reader.
        return 0;
    }

    @Override
    public void close() throws IOException {
        in.close();
    }
}
1.4 Custom Partitioner
package num_Partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Routes keys < 10 to partition 0, keys 10-19 to partition 1, and keys >= 20 to partition 2.
public class NumPartitioner extends Partitioner<IntWritable, IntWritable> {

    @Override
    public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
        if (key.get() >= 20) {
            return 2;
        } else if (key.get() >= 10) {
            return 1;
        } else {
            return 0;
        }
    }
}
1.5 Mapper
package num_Reducers;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Emits each input number as both key and value so the Partitioner can route it by its numeric value.
public class NumMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

    IntWritable _value = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value,
            Mapper<LongWritable, Text, IntWritable, IntWritable>.Context context)
            throws IOException, InterruptedException {
        _value.set(Integer.valueOf(value.toString().trim()));
        context.write(_value, _value);
    }
}
1.6 Reducer
package num_Reducers;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

// Writes every value it receives; keys arrive sorted within each partition,
// so every output file is sorted.
public class NumReducer extends Reducer<IntWritable, IntWritable, IntWritable, NullWritable> {

    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values,
            Reducer<IntWritable, IntWritable, IntWritable, NullWritable>.Context context)
            throws IOException, InterruptedException {
        for (IntWritable value : values) {
            context.write(value, NullWritable.get());
        }
    }
}
1.7 Driver
package num_Reducers;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import num_Partitioner.LineNumInputFormat;
import num_Partitioner.NumPartitioner;

public class NumDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Run in local mode against the local file system.
        conf.set("mapreduce.framework.name", "local");

        // Delete the output directory if it already exists, otherwise the job fails.
        Path outPut = new Path("file:///D:/out");
        FileSystem fs = outPut.getFileSystem(conf);
        if (fs.exists(outPut)) {
            fs.delete(outPut, true);
        }

        Job job = Job.getInstance(conf);
        job.setJobName("age");
        job.setJarByClass(NumDriver.class);
        job.setMapperClass(NumMapper.class);
        job.setReducerClass(NumReducer.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setInputFormatClass(LineNumInputFormat.class);

        // Three reduce tasks, one per partition defined in NumPartitioner.
        job.setNumReduceTasks(3);
        job.setPartitionerClass(NumPartitioner.class);

        FileInputFormat.addInputPath(job, new Path("file:///D:/age"));
        FileOutputFormat.setOutputPath(job, outPut);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
1.8 Result data
- part-r-00000
4
9
- part-r-00001
12
- part-r-00002
20
24
24
28
31
32
36
38
38
38
43
45
52
58
60
65
65
65
67
69
69
71
74
77
83
84
95