I. The Concept of an Input Split
Before running a job, MapReduce logically divides the raw input data into equal-length chunks called input splits (InputSplit), or "splits" for short.
MapReduce constructs one MapTask per split, and that task invokes the user-defined map method to process every record in the split.
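For orientation (a minimal illustrative sketch, not part of the original; the LineCountMapper class is hypothetical): the framework calls map once per record of the split assigned to a MapTask, so a mapper only ever sees the records of its own split.

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: map() is invoked once per record of this task's split
// (for text input, once per line).
public class LineCountMapper
        extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text("lines");

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // key = byte offset of the line within the file, value = the line itself
        context.write(word, ONE);
    }
}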
II. Choosing the Split Size
1. Having many splits means the time needed to process each split is much shorter than the time needed to process the whole input (the advantage of divide and conquer).
2. Splits are processed in parallel, and each split is fairly small, so the load balances well: faster machines finish their splits sooner and are free to take on other work.
3. If the splits are too small, however, the total time spent managing splits and constructing map tasks starts to dominate the job's overall execution time.
4. If a split spans two data blocks, part of the split's data has to be shipped over the network to the node running the map task, which consumes bandwidth and lowers efficiency.
5. The best split size is therefore the same as the HDFS block size, which defaults to 128 MB in Hadoop 2.x.
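A rough feel for the trade-off (a toy calculation, not from the original; the 10 GB figure and the split sizes are made up): for a fixed amount of input, shrinking the split size multiplies the number of map tasks, and with it the per-task scheduling and startup overhead.

// Toy calculation: number of map tasks for a 10 GB input at different split sizes.
public class SplitCountDemo {
    public static void main(String[] args) {
        long inputSize = 10L * 1024 * 1024 * 1024;              // 10 GB of input
        long[] splitSizes = {1L << 20, 64L << 20, 128L << 20};  // 1 MB, 64 MB, 128 MB
        for (long splitSize : splitSizes) {
            long tasks = (inputSize + splitSize - 1) / splitSize; // ceiling division
            System.out.println((splitSize >> 20) + " MB splits -> " + tasks + " map tasks");
        }
    }
}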
III. Source Code Walkthrough
1) FileSplit source code
public class FileSplit extends InputSplit implements Writable {
    private Path file;                     // the file to be processed
    private long start;                    // byte offset of this logical split within the file
    private long length;                   // length of this logical split in bytes
    private String[] hosts;                // hosts holding the block data that backs this split
    private SplitLocationInfo[] hostInfos; // extra location info for those hosts

    public FileSplit() {}

    // constructor called when a logical split object is created
    public FileSplit(Path file, long start, long length, String[] hosts) {
        this.file = file;
        this.start = start;
        this.length = length;
        this.hosts = hosts;
    }
    //.....
}
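Inside a running map task, the split backing it can be inspected through the task context. A minimal sketch (the SplitAwareMapper class and its log line are illustrative, not part of the quoted source):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SplitAwareMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Each MapTask is bound to exactly one InputSplit; for file-based
        // input formats it is a FileSplit carrying path, offset, and length.
        FileSplit split = (FileSplit) context.getInputSplit();
        System.out.println("Processing " + split.getPath()
                + " from offset " + split.getStart()
                + " for " + split.getLength() + " bytes");
    }
}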
2) FileInputFormat source code
public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
    public static final String NUM_INPUT_FILES;     // constant values omitted in this excerpt
    public static final String INPUT_DIR_RECURSIVE;
    private static final double SPLIT_SLOP = 1.1;    // 10% slack allowed for the last split
    private long minSplitSize = 1;
    //........
    protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
        return new FileSplit(file, start, length, hosts);
    }
    // (an overload that also takes a String[] of in-memory hosts is omitted here)
    //.....
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        // get status information for the input files
        FileStatus[] files = listStatus(job);
        // Save the number of input files for metrics/loadgen
        job.setLong(NUM_INPUT_FILES, files.length);
        long totalSize = 0;                       // compute total size
        for (FileStatus file: files) {            // check we have valid files
            //.....
            totalSize += file.getLen();
        }
        long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
        long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.
            FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);
        // generate splits
        ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
        NetworkTopology clusterMap = new NetworkTopology();
        for (FileStatus file: files) {
            Path path = file.getPath();
            long length = file.getLen();
            if (length != 0) {
                FileSystem fs = path.getFileSystem(job);
                BlockLocation[] blkLocations;
                //....... (blkLocations is filled in from the file's block locations here)
                if (isSplitable(fs, path)) {
                    long blockSize = file.getBlockSize();
                    long splitSize = computeSplitSize(goalSize, minSize, blockSize);
                    long bytesRemaining = length;
                    // keep cutting full-size splits while more than SPLIT_SLOP (1.1x) of a split remains
                    while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                        String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations,
                            length - bytesRemaining, splitSize, clusterMap);
                        splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            splitHosts[0], splitHosts[1]));
                        bytesRemaining -= splitSize;
                    }
                    // whatever is left (at most 1.1x the split size) becomes the final split
                    if (bytesRemaining != 0) {
                        String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations,
                            length - bytesRemaining, bytesRemaining, clusterMap);
                        splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            splitHosts[0], splitHosts[1]));
                    }
                } else {
                    //...... (not splittable: the whole file becomes a single split)
                }
            } else {
                //...... (zero-length file: an empty split is created)
            }
        }
        //......
        return splits.toArray(new FileSplit[splits.size()]);
    }
    /**
     * Compute the split size. (This method is quoted from the new-API
     * org.apache.hadoop.mapreduce.lib.input.FileInputFormat, where the split size is
     * clamped between minSize and maxSize; the old mapred API's getSplits above passes
     * goalSize = totalSize / numSplits in place of maxSize.)
     */
    protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }
}
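With the default configuration this clamp always resolves to the block size. A small standalone check that mirrors the formula above (the values are made up):

public class ComputeSplitSizeDemo {
    // same clamp as FileInputFormat: max(minSize, min(maxSize, blockSize))
    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    public static void main(String[] args) {
        long blockSize = 128L << 20;                                         // 128 MB HDFS block
        System.out.println(computeSplitSize(blockSize, 1, Long.MAX_VALUE));  // defaults -> 134217728 (128 MB)
        System.out.println(computeSplitSize(blockSize, 1, 64L << 20));       // maxSize 64 MB -> 67108864 (splits shrink)
        System.out.println(computeSplitSize(blockSize, 256L << 20, Long.MAX_VALUE)); // minSize 256 MB -> 268435456 (splits grow)
    }
}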
3) TextInputFormat source code
public class TextInputFormat extends FileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        // the record delimiter defaults to '\n' but can be overridden in the job configuration
        String delimiter = context.getConfiguration().get("textinputformat.record.delimiter");
        byte[] recordDelimiterBytes = null;
        if (null != delimiter)
            recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
        return new LineRecordReader(recordDelimiterBytes);
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // uncompressed files are always splittable; compressed files only if the codec supports splitting
        final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
        if (null == codec) {
            return true;
        }
        return codec instanceof SplittableCompressionCodec;
    }
}
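Because the delimiter is read from the job configuration, a custom record separator can be set in the driver (a sketch; the blank-line delimiter and job name are just examples):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class DelimiterDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Records now end at "\n\n" instead of the default newline,
        // so each blank-line-separated paragraph becomes one value passed to map().
        conf.set("textinputformat.record.delimiter", "\n\n");
        Job job = Job.getInstance(conf, "custom-delimiter-example");
        job.setInputFormatClass(TextInputFormat.class);
        // ... set mapper, reducer, input/output paths as usual ...
    }
}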
4) LineRecordReader source code
public class LineRecordReader extends RecordReader<LongWritable, Text> {
    private static final Log LOG = LogFactory.getLog(LineRecordReader.class);
    public static final String MAX_LINE_LENGTH = "mapreduce.input.linerecordreader.line.maxlength";
    private long start;                 // first byte of the split
    private long pos;                   // current read position
    private long end;                   // first byte past the split
    private SplitLineReader in;
    private FSDataInputStream fileIn;
    private Seekable filePosition;
    private int maxLineLength;
    private LongWritable key;
    private Text value;
    private boolean isCompressedInput;
    private Decompressor decompressor;
    private byte[] recordDelimiterBytes;

    public LineRecordReader() {
    }

    public LineRecordReader(byte[] recordDelimiter) {
        this.recordDelimiterBytes = recordDelimiter;
    }

    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
        start = split.getStart();
        end = start + split.getLength();
        final Path file = split.getPath();
        // open the file and seek to the start of the split
        final FileSystem fs = file.getFileSystem(job);
        fileIn = fs.open(file);
        CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
        if (null != codec) {
            //................... (compressed input is handled here)
        } else {
            fileIn.seek(start);
            in = new UncompressedSplitLineReader(
                fileIn, job, this.recordDelimiterBytes,
                split.getLength());
            filePosition = fileIn;
        }
        // If this is not the first split, we always throw away first record
        // because we always (except the last split) read one extra line in
        // next() method.
        if (start != 0) {
            start += in.readLine(new Text(), 0, maxBytesToConsume(start));
        }
        this.pos = start;
    }
    //.....
    public boolean nextKeyValue() throws IOException {
        if (key == null) {
            key = new LongWritable();
        }
        key.set(pos);                   // key = byte offset of the record in the file
        if (value == null) {
            value = new Text();
        }
        int newSize = 0;
        // We always read one extra line, which lies outside the upper
        // split limit i.e. (end - 1)
        while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
            if (pos == 0) {
                newSize = skipUtfByteOrderMark();
            } else {
                newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
                pos += newSize;
            }
            if ((newSize == 0) || (newSize < maxLineLength)) {
                break;
            }
            // line too long. try again
            LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
        }
        if (newSize == 0) {
            key = null;
            value = null;
            return false;
        } else {
            return true;
        }
    }

    @Override
    public LongWritable getCurrentKey() {
        return key;
    }

    @Override
    public Text getCurrentValue() {
        return value;
    }
}
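The framework drives this reader once per map task; the same calls can be made by hand to see the (offset, line) pairs a mapper would receive. A standalone sketch, assuming a text file path is passed as the first argument and the whole file is treated as one split:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class ReadOneSplit {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path(args[0]);               // path to any text file
        long len = FileSystem.get(conf).getFileStatus(file).getLen();

        // One split covering the whole file; in a real job getSplits() produces these.
        FileSplit split = new FileSplit(file, 0, len, new String[0]);
        TaskAttemptContext ctx = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        LineRecordReader reader = new LineRecordReader();
        reader.initialize(split, ctx);
        while (reader.nextKeyValue()) {              // the same calls the framework makes per record
            System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
        }
        reader.close();
    }
}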
IV. Split Summary
1) Split size parameters
From the source code, the logic FileInputFormat uses to compute the split size is:
Math.max(minSize, Math.min(maxSize, blockSize));
(this is the new mapreduce API; the older mapred API substitutes goalSize = totalSize / numSplits for maxSize). The split size is therefore determined by the following values:
Parameter | Default | Property
minsize | 1 | mapreduce.input.fileinputformat.split.minsize
maxsize | Long.MAX_VALUE | mapreduce.input.fileinputformat.split.maxsize
blocksize | HDFS block size (128 MB by default) | dfs.blocksize
As you can see, the split size is simply whichever of minsize, maxsize, and blocksize is the middle value.
1. Setting maxsize smaller than blocksize makes the splits smaller, equal to the configured maxsize.
2. Setting minsize larger than blockSize makes the splits larger than blocksize.
3. However, no matter how these parameters are tuned, multiple small files can never be "merged" into a single split: each file is split on its own.
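Both knobs can also be set from a driver through helper methods on the new-API FileInputFormat (a sketch; the 64 MB and 256 MB values are arbitrary):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitTuningDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "split-tuning-example");
        // Cap the split size at 64 MB (mapreduce.input.fileinputformat.split.maxsize),
        // which yields smaller splits and therefore more map tasks:
        FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);
        // Or raise the minimum to 256 MB (mapreduce.input.fileinputformat.split.minsize)
        // to get splits larger than the block size (uncomment to use):
        // FileInputFormat.setMinInputSplitSize(job, 256L * 1024 * 1024);
        // ... the rest of the job setup as usual ...
    }
}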
2) How splits are created
1. Get the size and location of each input file.
2. Determine whether the file can be split (some compression formats support splitting, others do not).
3. Compute the split size.
4. While (remaining file size / split size) > 1.1, repeatedly build and record a split description containing: the file path, the split's starting offset, the number of bytes to process, the blocks backing the split, and the hosts those blocks are stored on.
5. When (remaining file size / split size) <= 1.1 and the remainder is not 0, build one final split description with the same fields for whatever is left.
Note the 1.1x slack: the last split may be up to 10% larger than the split size, which avoids producing a tiny trailing split. The sketch below walks through the arithmetic.
3) Details of how splits are read
When there are multiple splits:
1. The first split reads from the beginning and reads one extra line past its end.
2. A split that is neither the first nor the last discards its first line and reads one extra line past its end.
3. The last split discards its first line and reads to the end of the file (there is no extra line left to read).
4. Why: a block boundary rarely falls exactly at the end of a line, so the record straddling the boundary is read in full by the split that owns its beginning, and the next split skips that partial first line.
4) The difference between a split and a block
This follows directly from the source code:
1. A split is logical: it merely records which part of the physical data a map task should process.
2. A block is physical: it is the raw data actually stored in the file system.
For example, suppose the input consists of two files:
file1.txt  260 MB
file2.txt  10 MB
After FileInputFormat's split computation, the resulting splits are:
file1.txt.split1 -- 0 ~ 128 MB
file1.txt.split2 -- 128 MB ~ 260 MB
file2.txt.split1 -- 0 ~ 10 MB