RCFile is a specialized form of SequenceFile, so for a Map/Reduce job to read RCFile input you need a RecordReader analogous to the one SequenceFileInputFormat supplies, but one that understands RCFile's layout.
My implementation of such a RecordReader against the new Hadoop API (org.apache.hadoop.mapreduce) follows:
package com.unimas.hyl.mr;
import java.io.IOException;
import java.util.Map;
import java.util.Collections;
import java.util.WeakHashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.hive.conf.HiveConf;
public class RCFileRecordReader extends
RecordReader<LongWritable, BytesRefArrayWritable> {
private RCFile.Reader in;
private long start;
private long end;
private boolean more = true;
protected Configuration conf;
private FileSplit split;
private boolean useCache;
private LongWritable key;
private BytesRefArrayWritable value;
private static RCFileSyncCache syncCache = new RCFileSyncCache();
  /** A cached (split end, sync position) pair for one file. */
  private static final class RCFileSyncEntry {
long end;
long endSync;
}
  /**
   * Maps "path+offset" keys to the first sync position at or past that
   * offset. Because split N's end offset equals split N+1's start offset,
   * a task that finishes split N can record where it stopped, letting the
   * task for split N+1 (in the same JVM) seek straight to the sync marker
   * instead of scanning for it. WeakHashMap keys let entries be reclaimed
   * under memory pressure.
   */
  private static final class RCFileSyncCache {
private final Map<String, RCFileSyncEntry> cache;
public RCFileSyncCache() {
cache = Collections
.synchronizedMap(new WeakHashMap<String, RCFileSyncEntry>());
}
public void put(FileSplit split, long endSync) {
Path path = split.getPath();
long end = split.getStart() + split.getLength();
String key = path.toString() + "+" + String.format("%d", end);
RCFileSyncEntry entry = new RCFileSyncEntry();
entry.end = end;
entry.endSync = endSync;
if (entry.endSync >= entry.end) {
cache.put(key, entry);
}
}
public long get(FileSplit split) {
Path path = split.getPath();
long start = split.getStart();
String key = path.toString() + "+" + String.format("%d", start);
RCFileSyncEntry entry = cache.get(key);
if (entry != null) {
return entry.endSync;
}
return -1;
}
}
public RCFileRecordReader() {
}
  protected boolean next() throws IOException {
    if (!more) {
      return false;
    }
    more = in.next(key);
    // lastSeenSyncPos() reports the most recent sync marker the reader
    // passed; once it is at or beyond the split end, this split's rows
    // are done. Cache that position for the reader of the adjacent split.
    long lastSeenSyncPos = in.lastSeenSyncPos();
    if (lastSeenSyncPos >= end) {
      if (useCache) {
        syncCache.put(split, lastSeenSyncPos);
      }
      more = false;
      return more;
    }
    if (more) { // guard: don't fetch a row once the reader is exhausted
      in.getCurrentRow(value);
    }
    return more;
  }
/**
* Return the progress within the input split.
*
* @return 0.0 to 1.0 of the input byte range
*/
@Override
public float getProgress() throws IOException {
if (end == start) {
return 0.0f;
} else {
return Math.min(1.0f, (in.getPosition() - start)
/ (float) (end - start));
}
}
@Override
public void close() throws IOException {
in.close();
}
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit split = (FileSplit) genericSplit;
this.conf = context.getConfiguration();
Path path = split.getPath();
FileSystem fs = path.getFileSystem(conf);
this.in = new RCFile.Reader(fs, path, conf);
this.end = split.getStart() + split.getLength();
this.split = split;
key = new LongWritable();
value = new BytesRefArrayWritable();
    // Honor Hive's switch for the shared sync-position cache.
    useCache = HiveConf.getBoolVar(conf,
        HiveConf.ConfVars.HIVEUSERCFILESYNCCACHE);
    if (split.getStart() > in.getPosition()) {
      // Prefer the sync position cached by whichever task read the
      // previous split; otherwise scan forward for the next sync marker.
      long oldSync = useCache ? syncCache.get(split) : -1;
      if (oldSync == -1) {
        in.sync(split.getStart()); // sync to start
      } else {
        in.seek(oldSync);
      }
    }
this.start = in.getPosition();
more = start < end;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
return next();
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public BytesRefArrayWritable getCurrentValue() throws IOException,
InterruptedException {
return value;
}
}
RCFile splits the same way as SequenceFiles and plain files, so the InputFormat can simply inherit the standard file-splitting logic and hand back the reader above; with that, the InputFormat is complete. A minimal sketch follows.
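For completeness, here is a minimal sketch of that InputFormat, pairing the stock FileInputFormat split computation with the RCFileRecordReader above. The class name RCFileInputFormat is my own illustrative choice, not part of the original code:

package com.unimas.hyl.mr;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class RCFileInputFormat extends
    FileInputFormat<LongWritable, BytesRefArrayWritable> {
  @Override
  public RecordReader<LongWritable, BytesRefArrayWritable> createRecordReader(
      InputSplit split, TaskAttemptContext context) {
    // Split computation is inherited from FileInputFormat unchanged;
    // RCFileRecordReader.initialize() aligns each split to a sync marker.
    return new RCFileRecordReader();
  }
}

A job then selects it the usual way, e.g. job.setInputFormatClass(RCFileInputFormat.class) in the driver; the framework calls initialize(), then drives the reader through nextKeyValue()/getCurrentKey()/getCurrentValue().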