hadoop 自定义inputformat和outputformat

最新推荐文章于 2024-01-01 20:22:46 发布

转载 · 最新推荐文章于 2024-01-01 20:22:46 发布 · 679 阅读

hadoop 专栏收录该内容

76 篇文章

订阅专栏

本文详细介绍了Hadoop中InputFormat与OutputFormat的工作原理，并通过具体示例展示了如何自定义这两种格式来处理MapReduce任务的数据输入和输出。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

http://blackproof.iteye.com/blog/1

hadoop的inputformat和outputformat

最好的例子是 Vertica：虽然它是在 Pig 中以 UDF 的形式实现的，但用的就是 Hadoop 的 InputFormat 和 OutputFormat，在 Hive 里也可以照用。贴个下载地址：http://blackproof.iteye.com/blog/1791995

再贴一个项目中实现 Hadoop join 时用到的 InputFormat 和 OutputFormat 的简单实例：

Hadoop join 的实现见：http://blackproof.iteye.com/blog/1757530

自定义 InputFormat（泛型参数是 Mapper 的输入 key/value 类型）

   Java代码  
   
  
 public class MyInputFormat extends FileInputFormat<MultiKey,Employee> {  
       
     public MyInputFormat(){}  
   
     @Override  
     public RecordReader<MultiKey, Employee> createRecordReader(  
             InputSplit split, TaskAttemptContext context) throws IOException,  
             InterruptedException {  
         // TODO Auto-generated method stub  
         return new MyRecordReader();  
     }  
       
     public static class MyRecordReader extends RecordReader<MultiKey, Employee>{  
   
         public LineReader in;  
         public MultiKey key;  
         public Employee value;  
         public StringTokenizer token = null;  
           
         public Text line;  
           
         @Override  
         public void initialize(InputSplit split, TaskAttemptContext context)  
                 throws IOException, InterruptedException {  
             // TODO Auto-generated method stub  
             FileSplit fileSplit = (FileSplit)split;  
             Configuration job = context.getConfiguration();  
             Path file = fileSplit.getPath();  
             FileSystem fs = file.getFileSystem(job);  
               
             FSDataInputStream filein = fs.open(file);  
             in = new LineReader(filein, job);  
               
             key = new MultiKey();  
             value = new Employee();  
             line = new Text();  
         }  
   
         @Override  
         public boolean nextKeyValue() throws IOException, InterruptedException {  
   
             int linesize = in.readLine(line);  
             if(linesize==0)  
                 return false;  
             String[] pieces = line.toString().split(",");  
             int i = Integer.valueOf(pieces[0]);  
             switch (i) {  
             case 1:  
                 value.setEmpName(pieces[1]);  
                 value.setFlag(1);  
                 break;  
   
             default:  
                 value.setDepartName(pieces[1]);  
                 value.setFlag(2);  
                 break;  
             }  
             value.setDepartId(pieces[2]);  
             value.setDepartNo(pieces[3]);  
               
             key.setDepartId(value.getDepartId());  
             key.setDepartNo(value.getDepartNo());  
             return true;  
         }  
   
         @Override  
         public MultiKey getCurrentKey() throws IOException,  
                 InterruptedException {  
             // TODO Auto-generated method stub  
             return key;  
         }  
   
         @Override  
         public Employee getCurrentValue() throws IOException,  
                 InterruptedException {  
             // TODO Auto-generated method stub  
             return value;  
         }  
   
         @Override  
         public float getProgress() throws IOException, InterruptedException {  
             // TODO Auto-generated method stub  
             return 0;  
         }  
   
         @Override  
         public void close() throws IOException {  
             // TODO Auto-generated method stub  
               
         }  
           
     }  
   
 }  

自定义 OutputFormat（泛型参数是 Reducer 的输出 key/value 类型）

   Java代码  
   
  
 public class MyOutputFormat extends FileOutputFormat<Text, Employee> {  
   
     @Override  
     public RecordWriter<Text, Employee> getRecordWriter(  
             TaskAttemptContext job) throws IOException, InterruptedException {  
         // TODO Auto-generated method stub  
         Configuration conf = job.getConfiguration();  
         Path file = getDefaultWorkFile(job, "");  
         FileSystem fs = file.getFileSystem(conf);  
         FSDataOutputStream fileOut = fs.create(file, false);  
         return new MyRecordWriter(fileOut);  
     }  
       
     public static class MyRecordWriter extends RecordWriter<Text, Employee>{  
   
         protected DataOutputStream out;  
         private final byte[] keyValueSeparator;  
          public static final String NEW_LINE = System.getProperty("line.separator");  
           
         public MyRecordWriter(DataOutputStream out){  
             this(out,":");  
         }  
           
         public MyRecordWriter(DataOutputStream out,String keyValueSeparator){  
             this.out = out;  
             this.keyValueSeparator = keyValueSeparator.getBytes();  
         }  
           
         @Override  
         public void write(Text key, Employee value) throws IOException,  
                 InterruptedException {  
             if(key!=null){  
                 out.write(key.toString().getBytes());  
                 out.write(keyValueSeparator);  
             }  
             out.write(value.toString().getBytes());  
             out.write(NEW_LINE.getBytes());  
         }  
   
         @Override  
         public void close(TaskAttemptContext context) throws IOException,  
                 InterruptedException {  
             out.close();  
         }  
           
     }  
   
 }