LineRecordReader由一个FileSplit构造出来:start是这个FileSplit的起始位置,pos是当前读取到的位置,end是分片的结束位置,in是为读取这个分片而打开的输入流,它是使用这个FileSplit对应的文件名来打开的。key和value则分别是每次读取到的K-V对。
另外,可以利用getProgress()来跟踪读取分片的进度:这个函数根据已读取的字节偏移量占整个分片长度的比例(即 (pos - start) / (end - start))来显示进度,而不是根据K-V对的个数。
-
public
class LineRecordReader extends RecordReader<</span>LongWritable, Text> { -
private static final Log LOG = LogFactory.getLog(LineRecordReader.class); -
-
private CompressionCodecFactory compressionCodecs = null; -
private long start; -
private long pos; -
private long end; -
private LineReader in; -
private int maxLineLength; -
private LongWritable key = null; -
private Text value = null; -
-
//我们知道LineRecordReader是读取一个InputSplit的,它从InputSplit中不断以其定义的格式读取K-V对 -
//initialize函数主要是计算分片的始末位置,以及打开想要的输入流以供读取K-V对,输入流另外处理分片经过压缩的情况 -
public void initialize(InputSplit genericSplit, -
TaskAttemptContext context) throws IOException { -
FileSplit split = (FileSplit) genericSplit; -
Configuration job = context.getConfiguration(); -
this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", -
Integer.MAX_VALUE); -
start = split.getStart(); -
end = start + split.getLength(); -
final Path file = split.getPath(); -
compressionCodecs = new CompressionCodecFactory(job); -
final CompressionCodec codec = compressionCodecs.getCodec(file); -
-
// open the file and seek to the start of the split -
FileSystem fs = file.getFileSystem(job); -
FSDataInputStream fileIn = fs.open(split.getPath()); -
boolean skipFirstLine = false; -
if (codec != null) { -
in = new LineReader(codec.createInputStream(fileIn), job); -
end = Long.MAX_VALUE; -
} else { -
if (start != 0) { -
skipFirstLine = true; -
--start; -
fileIn.seek(start); -
} -
in = new LineReader(fileIn, job); -
} -
if (skipFirstLine) { // skip first line and re-establish "start". -
start += in.readLine(new Text(), 0, -
(int)Math.min((long)Integer.MAX_VALUE, end - start)); -
} -
this.pos = start; -
} -
-
public boolean nextKeyValue() throws IOException { -
if (key == null) { -
key = new LongWritable(); -
} -
key.set(pos); //对于LineRecordReader来说,它以偏移值为key,以一行为value -
if (value == null) { -
value = new Text(); -
} -
int newSize = 0; -
while (pos <</span> end) { -
newSize = in.readLine(value, maxLineLength, -
Math.max((int)Math.min(Integer.MAX_VALUE, end-pos), -
maxLineLength)); -
if (newSize == 0) { -
break; -
} -
pos += newSize; -
if (newSize <</span> maxLineLength) { -
break; -
} -
-
// line too long. try again -
LOG.info("Skipped line of size " + newSize + " at pos " + -
(pos - newSize)); -
} -
if (newSize == 0) { -
key = null; -
value = null; -
return false; -
} else { -
return true; -
} -
} -
-
@Override -
public LongWritable getCurrentKey() { -
return key; -
} -
-
@Override -
public Text getCurrentValue() { -
return value; -
} -
-
-
public float getProgress() { -
if (start == end) { -
return 0.0f; -
} else { -
return Math.min(1.0f, (pos - start) / (float)(end - start));//读取进度由已读取InputSplit大小比总InputSplit大小 -
} -
} -
-
public synchronized void close() throws IOException { -
if (in != null) { -
in.close(); -
} -
} - }