RCFile is a row-columnar storage format developed by Facebook that combines a high compression ratio with efficient reads. In Hive, a Text table can usually be converted with a plain insert-select, but sometimes you want to read and write RCFile data directly from MapReduce.
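For reference, the Hive-side conversion mentioned above is just an insert into a table stored as RCFile. A minimal sketch, with hypothetical table and column names:

```sql
-- src_text and dst_rcfile are placeholder names; adjust columns to your schema.
CREATE TABLE dst_rcfile (id STRING, name STRING) STORED AS RCFILE;
INSERT OVERWRITE TABLE dst_rcfile SELECT id, name FROM src_text;
```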
Maven dependencies
```xml
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.5.0-cdh5.2.1</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-serde</artifactId>
    <version>0.13.1-cdh5.2.1</version>
</dependency>
<dependency>
    <groupId>org.apache.hive.hcatalog</groupId>
    <artifactId>hive-hcatalog-core</artifactId>
    <version>0.13.1-cdh5.2.1</version>
</dependency>
```
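The versions above target CDH 5.2.1; they should match the Hadoop and Hive versions on your cluster. The hive-hcatalog-core artifact is what provides the RCFileMapReduceInputFormat and RCFileMapReduceOutputFormat classes used below.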
Reading an RCFile with MapReduce and writing it out as a text file
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hive.hcatalog.rcfile.RCFileMapReduceInputFormat;

import java.io.IOException;

/**
 * Reads an RCFile and writes each row back out as a tab-delimited text line.
 */
public class RcFileReaderJob {

    static class RcFileMapper extends Mapper<Object, BytesRefArrayWritable, Text, NullWritable> {
        @Override
        protected void map(Object key, BytesRefArrayWritable value, Context context)
                throws IOException, InterruptedException {
            // Each value is one row; each element is one column as a byte range.
            Text txt = new Text();
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < value.size(); i++) {
                BytesRefWritable v = value.get(i);
                txt.set(v.getData(), v.getStart(), v.getLength());
                sb.append(txt.toString());
                if (i != value.size() - 1) {
                    sb.append('\t');
                }
            }
            context.write(new Text(sb.toString()), NullWritable.get());
        }
    }

    static class RcFileReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static boolean runLoadMapReduce(Configuration conf, Path input, Path output)
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(conf);
        job.setJarByClass(RcFileReaderJob.class);
        job.setJobName("RcFileReaderJob");
        job.setNumReduceTasks(1);
        job.setMapperClass(RcFileMapper.class);
        job.setReducerClass(RcFileReduce.class);
        // RCFileMapReduceInputFormat delivers each RCFile row as a BytesRefArrayWritable.
        job.setInputFormatClass(RCFileMapReduceInputFormat.class);
        RCFileMapReduceInputFormat.addInputPath(job, input);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, output);
        return job.waitForCompletion(true);
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        if (args.length != 2) {
            System.err.println("Usage: rcfile <in> <out>");
            System.exit(2);
        }
        RcFileReaderJob.runLoadMapReduce(conf, new Path(args[0]), new Path(args[1]));
    }
}
```
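To run the reader, pass an RCFile directory (for example, the HDFS warehouse directory of an RCFile-backed Hive table) and an output directory as the two arguments shown in the usage message. The result is plain text with the columns of each row joined by tabs, one line per row.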
Reading a text file with MapReduce and writing it out as an RCFile
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hive.hcatalog.rcfile.RCFileMapReduceOutputFormat;

import java.io.IOException;

/**
 * Reads tab-delimited text and writes it out as RCFile in a map-only job.
 */
public class RcFileWriterJob extends Configured implements Tool {

    public static class Map extends Mapper<Object, Text, NullWritable, BytesRefArrayWritable> {
        private byte[] fieldData;
        private int numCols;
        private BytesRefArrayWritable bytes;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Column count set via RCFileMapReduceOutputFormat.setColumnNumber() in run().
            numCols = context.getConfiguration().getInt("hive.io.rcfile.column.number.conf", 0);
            bytes = new BytesRefArrayWritable(numCols);
        }

        @Override
        public void map(Object key, Text line, Context context)
                throws IOException, InterruptedException {
            bytes.clear();
            // Each input line is expected to contain at least numCols tab-separated fields.
            String[] cols = line.toString().split("\t", -1);
            for (int i = 0; i < numCols; i++) {
                fieldData = cols[i].getBytes("UTF-8");
                BytesRefWritable cu = new BytesRefWritable(fieldData, 0, fieldData.length);
                bytes.set(i, cu);
            }
            context.write(NullWritable.get(), bytes);
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.out.println("Usage: " +
                    "hadoop jar RCFileLoader.jar <main class> " +
                    "-tableName <tableName> -numCols <numberOfColumns> -input <input path> " +
                    "-output <output path> -rowGroupSize <rowGroupSize> -ioBufferSize <ioBufferSize>");
            System.out.println("For test");
            System.out.println("$HADOOP jar RCFileLoader.jar edu.osu.cse.rsam.rcfile.mapreduce.LoadTable " +
                    "-tableName test1 -numCols 10 -input RCFileLoaderTest/test1 " +
                    "-output RCFileLoaderTest/RCFile_test1");
            System.out.println("$HADOOP jar RCFileLoader.jar edu.osu.cse.rsam.rcfile.mapreduce.LoadTable " +
                    "-tableName test2 -numCols 5 -input RCFileLoaderTest/test2 " +
                    "-output RCFileLoaderTest/RCFile_test2");
            return 2;
        }

        String tableName = "";
        int numCols = 0;
        String inputPath = "";
        String outputPath = "";
        int rowGroupSize = 16 * 1024 * 1024;
        int ioBufferSize = 128 * 1024;
        for (int i = 0; i < otherArgs.length - 1; i++) {
            if ("-tableName".equals(otherArgs[i])) {
                tableName = otherArgs[i + 1];
            } else if ("-numCols".equals(otherArgs[i])) {
                numCols = Integer.parseInt(otherArgs[i + 1]);
            } else if ("-input".equals(otherArgs[i])) {
                inputPath = otherArgs[i + 1];
            } else if ("-output".equals(otherArgs[i])) {
                outputPath = otherArgs[i + 1];
            } else if ("-rowGroupSize".equals(otherArgs[i])) {
                rowGroupSize = Integer.parseInt(otherArgs[i + 1]);
            } else if ("-ioBufferSize".equals(otherArgs[i])) {
                ioBufferSize = Integer.parseInt(otherArgs[i + 1]);
            }
        }

        conf.setInt("hive.io.rcfile.record.buffer.size", rowGroupSize);
        conf.setInt("io.file.buffer.size", ioBufferSize);

        Job job = Job.getInstance(conf);
        job.setJobName("RcFileWriterJob");
        job.setJarByClass(RcFileWriterJob.class);
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(BytesRefArrayWritable.class);
        // Map-only job: the mappers write RCFile output directly.
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path(inputPath));
        job.setOutputFormatClass(RCFileMapReduceOutputFormat.class);
        RCFileMapReduceOutputFormat.setColumnNumber(job.getConfiguration(), numCols);
        RCFileMapReduceOutputFormat.setOutputPath(job, new Path(outputPath));
        RCFileMapReduceOutputFormat.setCompressOutput(job, false);

        System.out.println("Loading table " + tableName + " from " + inputPath
                + " to RCFile located at " + outputPath);
        System.out.println("number of columns: " + job.getConfiguration().get("hive.io.rcfile.column.number.conf"));
        System.out.println("RCFile row group size: " + job.getConfiguration().get("hive.io.rcfile.record.buffer.size"));
        System.out.println("io buffer size: " + job.getConfiguration().get("io.file.buffer.size"));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new RcFileWriterJob(), args);
        System.exit(res);
    }
}
```
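The writer is launched through ToolRunner with the flags shown in its usage message. The -numCols value must match the number of tab-separated fields in the input (and the column count of the Hive table that will read the result), since it is what RCFileMapReduceOutputFormat.setColumnNumber passes to the mappers. Because the job is map-only, each mapper writes its own RCFile directly into the output directory, which can then be pointed at by an RCFile-backed Hive table.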