RCFile is a specialized form of SequenceFile, so for a Map/Reduce job to read RCFile input you need a RecordReader analogous to the one SequenceFileInputFormat supplies, but one that understands RCFile's layout.
My implementation of such a RecordReader against the new Hadoop API (org.apache.hadoop.mapreduce) follows:
package com.unimas.hyl.mr;
import java.io.IOException;
import java.util.Map;
import java.util.Collections;
import java.util.WeakHashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.hive.conf.HiveConf;
public class RCFileRecordReader extends
RecordReader<LongWritable, BytesRefArrayWritable> {
private RCFile.Reader in;
private long start;
private long end;
private boolean more = true;
protected Configuration conf;
private FileSplit split;
private boolean useCache;
private LongWritable key;
private BytesRefArrayWritable value;
private static RCFileSyncCache syncCache = new RCFileSyncCache();
  /** A cached (split end, sync position) pair for one file. */
  private static final class RCFileSyncEntry {
long end;
long endSync;
}
  /**
   * Maps "path+offset" keys to the first sync position at or past that
   * offset. Because split N's end offset equals split N+1's start offset,
   * a task that finishes split N can record where it stopped, letting the
   * task for split N+1 (in the same JVM) seek straight to the sync marker
   * instead of scanning for it. WeakHashMap keys let entries be reclaimed
   * under memory pressure.
   */
  private static final class RCFileSyncCache {
private final Map<String, RCFileSyncEntry> cache;
public RCFileSyncCache() {
cache = Collections
.synchronizedMap(new WeakHashMap<String, RCFileSyncEntry>());
}
public void put(FileSplit split, long endSync) {
Path path = split.getPath();
long end = split.getStart() + split.getLength();
String key = path.toString() + "+" + String.format("%d", end);
RCFileSyncEntry entry = new RCFileSyncEntry();
entry.end = end;
entry.endSync = endSync;
if (entry.endSync >= entry.end) {
cache.put(key, entry);
}
}
public long get(FileSplit split) {
Path path = split.getPath();
long start = split.getStart();
String key = path.toString() + "+" + String.format("%d", start);
RCFileSyncEntry entry = cache.get(key);
if (entry != null) {
return entry.endSync;
}
return -1;
}
}
public RCFileRecordReader() {
}
  protected boolean next() throws IOException {
    if (!more) {
      return false;
    }
    more = in.next(key);
    // lastSeenSyncPos() reports the most recent sync marker the reader
    // passed; once it is at or beyond the split end, this split's rows
    // are done. Cache that position for the reader of the adjacent split.
    long lastSeenSyncPos = in.lastSeenSyncPos();
    if (lastSeenSyncPos >= end) {
      if (useCache) {
        syncCache.put(split, lastSeenSyncPos);
      }
      more = false;
      return more;
    }
    if (more) { // guard: don't fetch a row once the reader is exhausted
      in.getCurrentRow(value);
    }
    return more;
  }
/**
* Return the progress within the input split.
*
* @return 0.0 to 1.0 of the input byte range
*/
@Override
public float getProgress() throws IOException {
if (end == start) {
return 0.0f;
} else {
return Math.min(1.0f, (in.getPosition() - start)
/ (float) (end - start));
}
}
@Override
public void close() throws IOException {
in.close();
}
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit split = (FileSplit) genericSplit;
this.conf = context.getConfiguration();
Path path = split.getPath();
FileSystem fs = path.getFileSystem(conf);
this.in = new RCFile.Reader(fs, path, conf);
this.end = split.getStart() + split.getLength();
this.split = split;
key = new LongWritable();
value = new BytesRefArrayWritable();
    // Honor Hive's switch for the shared sync-position cache.
    useCache = HiveConf.getBoolVar(conf,
        HiveConf.ConfVars.HIVEUSERCFILESYNCCACHE);
    if (split.getStart() > in.getPosition()) {
      // Prefer the sync position cached by whichever task read the
      // previous split; otherwise scan forward for the next sync marker.
      long oldSync = useCache ? syncCache.get(split) : -1;
      if (oldSync == -1) {
        in.sync(split.getStart()); // sync to start
      } else {
        in.seek(oldSync);
      }
    }
this.start = in.getPosition();
more = start < end;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
return next();
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public BytesRefArrayWritable getCurrentValue() throws IOException,
InterruptedException {
return value;
}
}
RCFile splits the same way as SequenceFiles and plain files, so the InputFormat can simply inherit the standard file-splitting logic and hand back the reader above; with that, the InputFormat is complete. A minimal sketch follows.
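For completeness, here is a minimal sketch of that InputFormat, pairing the stock FileInputFormat split computation with the RCFileRecordReader above. The class name RCFileInputFormat is my own illustrative choice, not part of the original code:

package com.unimas.hyl.mr;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class RCFileInputFormat extends
    FileInputFormat<LongWritable, BytesRefArrayWritable> {
  @Override
  public RecordReader<LongWritable, BytesRefArrayWritable> createRecordReader(
      InputSplit split, TaskAttemptContext context) {
    // Split computation is inherited from FileInputFormat unchanged;
    // RCFileRecordReader.initialize() aligns each split to a sync marker.
    return new RCFileRecordReader();
  }
}

A job then selects it the usual way, e.g. job.setInputFormatClass(RCFileInputFormat.class) in the driver; the framework calls initialize(), then drives the reader through nextKeyValue()/getCurrentKey()/getCurrentValue().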