RCFile is a row-columnar storage format developed by Facebook that combines a high compression ratio with efficient reads. In Hive, a Text table can usually be converted with a plain insert-select, but sometimes you want to read and write RCFile data directly from MapReduce.
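For reference, the Hive-side conversion mentioned above is just an insert into a table stored as RCFile. A minimal sketch, with hypothetical table and column names:

```sql
-- src_text and dst_rcfile are placeholder names; adjust columns to your schema.
CREATE TABLE dst_rcfile (id STRING, name STRING) STORED AS RCFILE;
INSERT OVERWRITE TABLE dst_rcfile SELECT id, name FROM src_text;
```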
Maven dependencies
```xml
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.5.0-cdh5.2.1</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-serde</artifactId>
    <version>0.13.1-cdh5.2.1</version>
</dependency>
<dependency>
    <groupId>org.apache.hive.hcatalog</groupId>
    <artifactId>hive-hcatalog-core</artifactId>
    <version>0.13.1-cdh5.2.1</version>
</dependency>
```
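The versions above target CDH 5.2.1; they should match the Hadoop and Hive versions on your cluster. The hive-hcatalog-core artifact is what provides the RCFileMapReduceInputFormat and RCFileMapReduceOutputFormat classes used below.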
Reading an RCFile with MapReduce and writing it out as a text file
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hive.hcatalog.rcfile.RCFileMapReduceInputFormat;

import java.io.IOException;

/**
 * Reads an RCFile and writes each row back out as a tab-delimited text line.
 */
public class RcFileReaderJob {

    static class RcFileMapper extends Mapper<Object, BytesRefArrayWritable, Text, NullWritable> {
        @Override
        protected void map(Object key, BytesRefArrayWritable value, Context context)
                throws IOException, InterruptedException {
            // Each value is one row; each element is one column as a byte range.
            Text txt = new Text();
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < value.size(); i++) {
                BytesRefWritable v = value.get(i);
                txt.set(v.getData(), v.getStart(), v.getLength());
                sb.append(txt.toString());
                if (i != value.size() - 1) {
                    sb.append('\t');
                }
            }
            context.write(new Text(sb.toString()), NullWritable.get());
        }
    }

    static class RcFileReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static boolean runLoadMapReduce(Configuration conf, Path input, Path output)
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(conf);
        job.setJarByClass(RcFileReaderJob.class);
        job.setJobName("RcFileReaderJob");
        job.setNumReduceTasks(1);
        job.setMapperClass(RcFileMapper.class);
        job.setReducerClass(RcFileReduce.class);
        // RCFileMapReduceInputFormat delivers each RCFile row as a BytesRefArrayWritable.
        job.setInputFormatClass(RCFileMapReduceInputFormat.class);
        RCFileMapReduceInputFormat.addInputPath(job, input);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, output);
        return job.waitForCompletion(true);
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        if (args.length != 2) {
            System.err.println("Usage: rcfile <in> <out>");
            System.exit(2);
        }
        RcFileReaderJob.runLoadMapReduce(conf, new Path(args[0]), new Path(args[1]));
    }
}
```
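To run the reader, pass an RCFile directory (for example, the HDFS warehouse directory of an RCFile-backed Hive table) and an output directory as the two arguments shown in the usage message. The result is plain text with the columns of each row joined by tabs, one line per row.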
Reading a text file with MapReduce and writing it out as an RCFile
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hive.hcatalog.rcfile.RCFileMapReduceOutputFormat;

import java.io.IOException;

/**
 * Reads tab-delimited text and writes it out as RCFile in a map-only job.
 */
public class RcFileWriterJob extends Configured implements Tool {

    public static class Map extends Mapper<Object, Text, NullWritable, BytesRefArrayWritable> {
        private byte[] fieldData;
        private int numCols;
        private BytesRefArrayWritable bytes;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Column count set via RCFileMapReduceOutputFormat.setColumnNumber() in run().
            numCols = context.getConfiguration().getInt("hive.io.rcfile.column.number.conf", 0);
            bytes = new BytesRefArrayWritable(numCols);
        }

        @Override
        public void map(Object key, Text line, Context context)
                throws IOException, InterruptedException {
            bytes.clear();
            // Each input line is expected to contain at least numCols tab-separated fields.
            String[] cols = line.toString().split("\t", -1);
            for (int i = 0; i < numCols; i++) {
                fieldData = cols[i].getBytes("UTF-8");
                BytesRefWritable cu = new BytesRefWritable(fieldData, 0, fieldData.length);
                bytes.set(i, cu);
            }
            context.write(NullWritable.get(), bytes);
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.out.println("Usage: " +
                    "hadoop jar RCFileLoader.jar <main class> " +
                    "-tableName <tableName> -numCols <numberOfColumns> -input <input path> " +
                    "-output <output path> -rowGroupSize <rowGroupSize> -ioBufferSize <ioBufferSize>");
            System.out.println("For test");
            System.out.println("$HADOOP jar RCFileLoader.jar edu.osu.cse.rsam.rcfile.mapreduce.LoadTable " +
                    "-tableName test1 -numCols 10 -input RCFileLoaderTest/test1 " +
                    "-output RCFileLoaderTest/RCFile_test1");
            System.out.println("$HADOOP jar RCFileLoader.jar edu.osu.cse.rsam.rcfile.mapreduce.LoadTable " +
                    "-tableName test2 -numCols 5 -input RCFileLoaderTest/test2 " +
                    "-output RCFileLoaderTest/RCFile_test2");
            return 2;
        }

        String tableName = "";
        int numCols = 0;
        String inputPath = "";
        String outputPath = "";
        int rowGroupSize = 16 * 1024 * 1024;
        int ioBufferSize = 128 * 1024;
        for (int i = 0; i < otherArgs.length - 1; i++) {
            if ("-tableName".equals(otherArgs[i])) {
                tableName = otherArgs[i + 1];
            } else if ("-numCols".equals(otherArgs[i])) {
                numCols = Integer.parseInt(otherArgs[i + 1]);
            } else if ("-input".equals(otherArgs[i])) {
                inputPath = otherArgs[i + 1];
            } else if ("-output".equals(otherArgs[i])) {
                outputPath = otherArgs[i + 1];
            } else if ("-rowGroupSize".equals(otherArgs[i])) {
                rowGroupSize = Integer.parseInt(otherArgs[i + 1]);
            } else if ("-ioBufferSize".equals(otherArgs[i])) {
                ioBufferSize = Integer.parseInt(otherArgs[i + 1]);
            }
        }

        conf.setInt("hive.io.rcfile.record.buffer.size", rowGroupSize);
        conf.setInt("io.file.buffer.size", ioBufferSize);

        Job job = Job.getInstance(conf);
        job.setJobName("RcFileWriterJob");
        job.setJarByClass(RcFileWriterJob.class);
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(BytesRefArrayWritable.class);
        // Map-only job: the mappers write RCFile output directly.
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path(inputPath));
        job.setOutputFormatClass(RCFileMapReduceOutputFormat.class);
        RCFileMapReduceOutputFormat.setColumnNumber(job.getConfiguration(), numCols);
        RCFileMapReduceOutputFormat.setOutputPath(job, new Path(outputPath));
        RCFileMapReduceOutputFormat.setCompressOutput(job, false);

        System.out.println("Loading table " + tableName + " from " + inputPath
                + " to RCFile located at " + outputPath);
        System.out.println("number of columns: " + job.getConfiguration().get("hive.io.rcfile.column.number.conf"));
        System.out.println("RCFile row group size: " + job.getConfiguration().get("hive.io.rcfile.record.buffer.size"));
        System.out.println("io buffer size: " + job.getConfiguration().get("io.file.buffer.size"));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new RcFileWriterJob(), args);
        System.exit(res);
    }
}
```
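The writer is launched through ToolRunner with the flags shown in its usage message. The -numCols value must match the number of tab-separated fields in the input (and the column count of the Hive table that will read the result), since it is what RCFileMapReduceOutputFormat.setColumnNumber passes to the mappers. Because the job is map-only, each mapper writes its own RCFile directly into the output directory, which can then be pointed at by an RCFile-backed Hive table.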