This article is reposted from my original blog: http://www.javali.org/document/deal_small_files_with_combinefileinputformat_in_mapreduce.html
In an earlier article on handling small files in Hadoop, we used Hadoop archive files (HAR) to tackle the resource and performance problems caused by huge numbers of small files. That approach requires manual maintenance and is geared toward administrators' operations; moreover, once a HAR file is created the archive is immutable, so it only suits scenarios where a large batch of small files is written once.

Hadoop ships with another built-in solution: CombineFileInputFormat.

CombineFileInputFormat is an abstract class, so you must subclass it before you can use it. It packs many small files into a single InputSplit, so one map task processes a whole batch of files rather than one file apiece. A subclass needs to supply a RecordReader; the example below delegates to a standard LineRecordReader for each file chunk:
```java
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

// Note: LineRecordReader emits LongWritable keys and Text values, so the type
// parameters here are <LongWritable, Text> (the original post declared
// <Text, Text>, which does not match the record reader below).
public class MyCombineInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        CombineFileSplit combineSplit = (CombineFileSplit) split;
        // CombineFileRecordReader walks the chunks of the combined split,
        // constructing one myCombineFileRecordReader per chunk.
        return new CombineFileRecordReader<LongWritable, Text>(combineSplit, context,
                myCombineFileRecordReader.class);
    }

    public static class myCombineFileRecordReader extends RecordReader<LongWritable, Text> {
        private LineRecordReader linerecord;
        private int index; // position of this file chunk within the combined split

        // This (CombineFileSplit, TaskAttemptContext, Integer) constructor is the
        // signature CombineFileRecordReader instantiates reflectively.
        public myCombineFileRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index)
                throws IOException, InterruptedException {
            this.index = index;
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            CombineFileSplit combineSplit = (CombineFileSplit) split;
            // Rebuild a plain FileSplit for the chunk at our index and let a
            // standard LineRecordReader do the actual line parsing.
            FileSplit fileSplit = new FileSplit(combineSplit.getPath(index),
                    combineSplit.getOffset(index), combineSplit.getLength(index),
                    combineSplit.getLocations());
            linerecord = new LineRecordReader();
            linerecord.initialize(fileSplit, context);
        }

        @Override
        public void close() throws IOException {
            if (linerecord != null) {
                linerecord.close();
                linerecord = null;
            }
        }

        @Override
        public LongWritable getCurrentKey() {
            return linerecord.getCurrentKey();
        }

        @Override
        public Text getCurrentValue() {
            return linerecord.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException {
            return linerecord.getProgress();
        }

        @Override
        public boolean nextKeyValue() throws IOException {
            return linerecord.nextKeyValue();
        }
    }
}
```
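For context on how this works: Hadoop's CombineFileRecordReader constructs one myCombineFileRecordReader per file chunk packed into the CombineFileSplit (which is why the constructor takes the Integer index argument) and advances through the chunks one after another, so a single map task ends up reading many small files back to back.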
Then, when launching the job, you can use MyCombineInputFormat directly:
```java
// The old Job constructor matches the mapred.* era of this post;
// conf is the Configuration set up as shown further below.
Job job = new Job(conf, "Market Prom Job");
job.setInputFormatClass(MyCombineInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// MultipleInputs binds the input path to the combining format and its mapper;
// for this path it supersedes the setInputFormatClass call above.
MultipleInputs.addInputPath(job, new Path(args[0]), MyCombineInputFormat.class, LogMap.class);
```
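The post never shows the LogMap mapper it references. As a purely hypothetical sketch, a minimal mapper compatible with the <LongWritable, Text> records produced by MyCombineInputFormat might look like this (the output types and logic are assumptions, not part of the original):

```java
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical stand-in for the LogMap referenced above: it simply emits each
// line keyed by its byte offset. The real LogMap's logic is not in the post.
public static class LogMap extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(new Text(key.toString()), value);
    }
}
```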
Three more parameters also affect the number of map tasks:
```java
Configuration conf = new Configuration();
// Minimum combined-split size assembled from blocks on a single node.
conf.setLong("mapred.min.split.size.per.node", 128 * 1024 * 1024);
// Minimum combined-split size assembled from blocks on a single rack.
conf.setLong("mapred.min.split.size.per.rack", 128 * 1024 * 1024);
// Maximum size of a combined split; once chunks add up to this, a split is cut.
conf.setLong("mapred.max.split.size", 128 * 1024 * 1024);
```
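A side note for newer clusters (an addition, not from the original post): in Hadoop 2.x and later the mapred.* keys above are deprecated aliases, and the same three knobs are set with the current property names:

```java
// Hadoop 2.x+ names for the same three split-size settings.
conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 128 * 1024 * 1024);
conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 128 * 1024 * 1024);
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 128 * 1024 * 1024);
```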
Launching the job this way dramatically reduces the number of map tasks, and the resulting performance improvement is quite noticeable.
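As a rough, hypothetical illustration: 10,000 input files of 1 MB each amount to about 10 GB; with a 128 MB maximum split size, CombineFileInputFormat can pack on the order of 128 files per split, so the job runs roughly 80 map tasks instead of 10,000. The exact count depends on block placement and the per-node and per-rack minimums.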
To see how CombineFileInputFormat decides the number of maps, refer to this post: 深度分析如何在Hadoop中控制Map的数量 (an in-depth analysis of how to control the number of maps in Hadoop).
[End]