Spark + Hadoop 读取数据源码

阿卷啦

于 2021-12-01 21:57:01 发布

阅读量1.2k

点赞数

分类专栏： spark 文章标签： spark big data gitlab

本文链接：https://blog.youkuaiyun.com/weixin_45025143/article/details/121665923

版权

本文深入探讨了Spark如何利用Hadoop读取数据的源码实现，涉及关键步骤和技术细节，适合大数据开发者了解两者之间的交互原理。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

package com.jack.rdd.create;

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce.lib.input;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.mapred.LocatedFileStatusFetcher;
import org.apache.hadoop.mapred.SplitLocationInfo;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.InvalidInputException;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StopWatch;
import org.apache.hadoop.util.StringUtils;

import com.google.common.collect.Lists;

/**
 * A base class for file-based {@link InputFormat}s.
 *
 * <p><code>FileInputFormat</code> is the base class for all file-based
 * <code>InputFormat</code>s. This provides a generic implementation of
 * {@link #getSplits(JobContext)}.
 * Subclasses of <code>FileInputFormat</code> can also override the
 * {@link #isSplitable(JobContext, Path)} method to ensure input-files are
 * not split-up and are processed as a whole by {@link Mapper}s.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class FileInputFormat<K, V> extends InputFormat<K, V> {
   
    // String constants holding "mapreduce.input..." property names.
    // NOTE(review): presumably used as job Configuration keys; the code that
    // reads or writes them is outside the visible part of this file.

    /** Property name string; by its name, the input directory setting. */
    public static final String INPUT_DIR =
            "mapreduce.input.fileinputformat.inputdir";
    /** Property name string; by its name, the maximum split size setting. */
    public static final String SPLIT_MAXSIZE =
            "mapreduce.input.fileinputformat.split.maxsize";
    /** Property name string; by its name, the minimum split size setting. */
    public static final String SPLIT_MINSIZE =
            "mapreduce.input.fileinputformat.split.minsize";
    /** Property name string; by its name, the class of a user-supplied path filter. */
    public static final String PATHFILTER_CLASS =
            "mapreduce.input.pathFilter.class";
    /** Property name string; by its name, the number of input files. */
    public static final String NUM_INPUT_FILES =
            "mapreduce.input.fileinputformat.numinputfiles";
    /** Property name string; by its name, a flag for recursive input directory handling. */
    public static final String INPUT_DIR_RECURSIVE =
            "mapreduce.input.fileinputformat.input.dir.recursive";
    /** Property name string; by its name, the thread count used when listing file status. */
    public static final String LIST_STATUS_NUM_THREADS =
            "mapreduce.input.fileinputformat.list-status.num-threads";
    /**
     * Integer constant with value 1.
     * NOTE(review): by its name, the fallback for {@link #LIST_STATUS_NUM_THREADS};
     * confirm at the point of use.
     */
    public static final int DEFAULT_LIST_STATUS_NUM_THREADS = 1;

    /** Class-wide commons-logging logger, looked up by this class's fully qualified name. */
    private static final Log LOG = LogFactory.getLog(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.class);

    // Slop factor of 1.1, i.e. a 10% tolerance.
    // NOTE(review): how it is applied to split sizing is not visible in this excerpt.
    private static final double SPLIT_SLOP = 1.1;   // 10% slop

    /**
     * Counter identifiers declared by this input format; the only constant
     * is {@link #BYTES_READ}.
     *
     * <p>The type is annotated {@code @Deprecated}. A nested enum is
     * implicitly static, so no explicit {@code static} modifier is written.
     */
    @Deprecated
    public enum Counter {
        /** Identifier for a bytes-read counter. */
        BYTES_READ
    }

    /**
     * Filter that rejects a path when the name returned by
     * {@code Path#getName()} begins with an underscore or a dot, and
     * accepts every other path.
     */
    private static final PathFilter hiddenFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path p) {
            final String fileName = p.getName();
            // A leading "_" or "." marks the entry as hidden.
            final boolean hidden = fileName.startsWith("_") || fileName.startsWith(".");
            return !hidden;
        }
    };

    /**
     * Proxy PathFilter that accepts a path only if all filters given in the
     * constructor do. Used by the listPaths() to apply the built-in
     * hiddenFileFilter together with a user provided one (if any).
     */
    private static class MultiPathFilter implements PathFilter {
   
        private List<PathFilter> filters;

        public MultiPathFilter(List<PathFilter> filters) {
   
            this.filters = filters;
        }

        public boolean accept(Path path) {
   
            for (PathFilter filter : filters) {
   
                if (!filter.accept(path)) {
   
                    return fa

最低0.47元/天解锁文章