MapReduce RCFile Write and Read API Examples

This post shows how to process RCFile data with MapReduce, covering both directions: generating RCFile files from text files and converting RCFile files back to text. Complete code examples are provided, including configuration and input/output format setup.


RCFile is a storage format developed by Facebook that offers high compression ratios and efficient reads; strictly speaking it is a hybrid row-columnar layout, where rows are grouped and each group is stored column-wise. In Hive you can usually convert a Text table directly with an insert-select (e.g. INSERT OVERWRITE TABLE rc_table SELECT * FROM text_table, where rc_table is declared STORED AS RCFILE), but sometimes you want to read and write RCFile from MapReduce itself.


Maven dependencies

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.5.0-cdh5.2.1</version>
</dependency>

<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-serde</artifactId>
    <version>0.13.1-cdh5.2.1</version>
</dependency>

<dependency>
    <groupId>org.apache.hive.hcatalog</groupId>
    <artifactId>hive-hcatalog-core</artifactId>
    <version>0.13.1-cdh5.2.1</version>
</dependency>
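
These are CDH builds of the Hadoop and Hive artifacts, so they resolve from Cloudera's Maven repository rather than Maven Central. If your build cannot find them, a repository entry along these lines should help (a sketch; the id is arbitrary):

<repository>
    <id>cloudera</id>
    <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>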



Reading RCFile files and generating text output with MapReduce

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hive.hcatalog.rcfile.RCFileMapReduceInputFormat;

import java.io.IOException;

public class RcFileReaderJob {

    // Each input record is one RCFile row: a BytesRefArrayWritable whose
    // entries are the raw byte slices of the row's columns.
    static class RcFileMapper extends Mapper<Object, BytesRefArrayWritable, Text, NullWritable> {
        @Override
        protected void map(Object key, BytesRefArrayWritable value, Context context)
                throws IOException, InterruptedException {
            Text txt = new Text();
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < value.size(); i++) {
                BytesRefWritable v = value.get(i);
                // Decode the column's byte slice without copying the buffer
                txt.set(v.getData(), v.getStart(), v.getLength());
                if (i > 0) {
                    sb.append('\t');
                }
                sb.append(txt.toString());
            }
            context.write(new Text(sb.toString()), NullWritable.get());
        }
    }

    // Reduce is keyed by the reconstructed line, so identical rows
    // collapse to a single output line.
    static class RcFileReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static boolean runLoadMapReduce(Configuration conf, Path input, Path output)
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(conf);
        job.setJarByClass(RcFileReaderJob.class);
        job.setJobName("RcFileReaderJob");
        job.setNumReduceTasks(1);
        job.setMapperClass(RcFileMapper.class);
        job.setReducerClass(RcFileReduce.class);
        job.setInputFormatClass(RCFileMapReduceInputFormat.class);
        RCFileMapReduceInputFormat.addInputPath(job, input);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, output);
        return job.waitForCompletion(true);
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        if (args.length != 2) {
            System.err.println("Usage: rcfile <in> <out>");
            System.exit(2);
        }
        RcFileReaderJob.runLoadMapReduce(conf, new Path(args[0]), new Path(args[1]));
    }
}
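
The heart of RcFileMapper is the column-decoding loop: RCFileMapReduceInputFormat hands map() one row at a time as a BytesRefArrayWritable, where each entry is a (buffer, start, length) slice for one column, and Text.set(byte[], int, int) decodes a slice without copying the underlying buffer. A minimal standalone sketch of just that step, outside MapReduce (the class name and sample values are invented for illustration):

import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.Text;

public class BytesRefArrayDemo {
    public static void main(String[] args) throws Exception {
        // Build a fake two-column row, shaped like what RCFileMapReduceInputFormat
        // passes to map() for each record
        byte[] c0 = "hello".getBytes("UTF-8");
        byte[] c1 = "world".getBytes("UTF-8");
        BytesRefArrayWritable row = new BytesRefArrayWritable(2);
        row.set(0, new BytesRefWritable(c0, 0, c0.length));
        row.set(1, new BytesRefWritable(c1, 0, c1.length));

        // Same decoding loop as RcFileMapper.map(): each column slice is
        // decoded in place and the fields are joined with tabs into one line
        Text txt = new Text();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < row.size(); i++) {
            BytesRefWritable v = row.get(i);
            txt.set(v.getData(), v.getStart(), v.getLength());
            if (i > 0) {
                sb.append('\t');
            }
            sb.append(txt.toString());
        }
        System.out.println(sb);  // prints: hello<TAB>world
    }
}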


Reading text files and generating RCFile output with MapReduce

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hive.hcatalog.rcfile.RCFileMapReduceOutputFormat;

import java.io.IOException;

public class RcFileWriterJob extends Configured implements Tool {

    public static class Map extends Mapper<Object, Text, NullWritable, BytesRefArrayWritable> {
        private byte[] fieldData;
        private int numCols;
        private BytesRefArrayWritable bytes;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Column count is published by RCFileMapReduceOutputFormat.setColumnNumber()
            numCols = context.getConfiguration().getInt("hive.io.rcfile.column.number.conf", 0);
            bytes = new BytesRefArrayWritable(numCols);
        }

        @Override
        public void map(Object key, Text line, Context context)
                throws IOException, InterruptedException {
            bytes.clear();
            String[] cols = line.toString().split("\t", -1);
            if (cols.length < numCols) {
                // Guard against short lines, which would otherwise throw
                // ArrayIndexOutOfBoundsException below
                System.err.println("SIZE : " + cols.length + " (expected " + numCols + "), skipping line");
                return;
            }
            for (int i = 0; i < numCols; i++) {
                fieldData = cols[i].getBytes("UTF-8");
                BytesRefWritable cu = new BytesRefWritable(fieldData, 0, fieldData.length);
                bytes.set(i, cu);
            }
            context.write(NullWritable.get(), bytes);
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.out.println("Usage: " +
                    "hadoop jar RCFileLoader.jar <main class> " +
                    "-tableName <tableName> -numCols <numberOfColumns> -input <input path> " +
                    "-output <output path> -rowGroupSize <rowGroupSize> -ioBufferSize <ioBufferSize>");
            System.out.println("For test");
            System.out.println("$HADOOP jar RCFileLoader.jar edu.osu.cse.rsam.rcfile.mapreduce.LoadTable " +
                    "-tableName test1 -numCols 10 -input RCFileLoaderTest/test1 " +
                    "-output RCFileLoaderTest/RCFile_test1");
            System.out.println("$HADOOP jar RCFileLoader.jar edu.osu.cse.rsam.rcfile.mapreduce.LoadTable " +
                    "-tableName test2 -numCols 5 -input RCFileLoaderTest/test2 " +
                    "-output RCFileLoaderTest/RCFile_test2");
            return 2;
        }

        String tableName = "";
        int numCols = 0;
        String inputPath = "";
        String outputPath = "";
        int rowGroupSize = 16 * 1024 * 1024;
        int ioBufferSize = 128 * 1024;
        for (int i = 0; i < otherArgs.length - 1; i++) {
            if ("-tableName".equals(otherArgs[i])) {
                tableName = otherArgs[i + 1];
            } else if ("-numCols".equals(otherArgs[i])) {
                numCols = Integer.parseInt(otherArgs[i + 1]);
            } else if ("-input".equals(otherArgs[i])) {
                inputPath = otherArgs[i + 1];
            } else if ("-output".equals(otherArgs[i])) {
                outputPath = otherArgs[i + 1];
            } else if ("-rowGroupSize".equals(otherArgs[i])) {
                rowGroupSize = Integer.parseInt(otherArgs[i + 1]);
            } else if ("-ioBufferSize".equals(otherArgs[i])) {
                ioBufferSize = Integer.parseInt(otherArgs[i + 1]);
            }
        }

        conf.setInt("hive.io.rcfile.record.buffer.size", rowGroupSize);
        conf.setInt("io.file.buffer.size", ioBufferSize);

        Job job = Job.getInstance(conf);
        job.setJobName("RcFileWriterJob");
        job.setJarByClass(RcFileWriterJob.class);
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(BytesRefArrayWritable.class);
        // Map-only job: the mappers write the RCFile output directly
        job.setNumReduceTasks(0);

        FileInputFormat.addInputPath(job, new Path(inputPath));

        job.setOutputFormatClass(RCFileMapReduceOutputFormat.class);
        RCFileMapReduceOutputFormat.setColumnNumber(job.getConfiguration(), numCols);
        RCFileMapReduceOutputFormat.setOutputPath(job, new Path(outputPath));
        RCFileMapReduceOutputFormat.setCompressOutput(job, false);

        System.out.println("Loading table " + tableName + " from " + inputPath + " to RCFile located at " + outputPath);
        System.out.println("number of columns: " + job.getConfiguration().get("hive.io.rcfile.column.number.conf"));
        System.out.println("RCFile row group size: " + job.getConfiguration().get("hive.io.rcfile.record.buffer.size"));
        System.out.println("io buffer size: " + job.getConfiguration().get("io.file.buffer.size"));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new RcFileWriterJob(), args);
        System.exit(res);
    }
}
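
To try the round trip, package both classes into a jar and submit them with hadoop jar, following the usage strings printed above (jar and path names are placeholders): first run RcFileWriterJob with something like -numCols 10 -input RCFileLoaderTest/test1 -output RCFileLoaderTest/RCFile_test1 to produce the RCFile, then run RcFileReaderJob with that output directory as <in> to turn it back into tab-separated text.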