hdfs-3.1.3——NN&DN启动源码剖析

List of articles

一、前言

二、namenode启动流程

2.1.入口

2.2.启动HTTPServer

2.3.加载镜像文件和编辑日志到内存

2.4.初始化NameNode的RPC服务端

2.5.NameNode启动资源检查

2.6.NameNode对DataNode的心跳超时进行判断

2.7.安全模式

三、DataNode启动流程

3.1.入口

3.2.初始化DataXceiver服务,用于接收客户端Streamer的socket请求或其他DataNode发送过来的socket请求

3.3.初始化HTTP服务

3.4.初始化DataNode的RPC服务端

3.5.DataNode向NameNode注册

3.6.向NN发送心跳


一、前言

本文主要介绍当我们启动hadoop集群时,NameNode和DataNode的启动流程。

hdfs这个组件在启动的时候会开启三个JVM实例——namenode、datanode、secondary namenode

二、namenode启动流程

2.1.入口

因为namenode是一个jvm进程,所以原理就是找到namenode.java源文件,执行其main方法

/**
namenode.java
*/
public static void main(String argv[]) throws Exception {
    if (DFSUtil.parseHelpArgument(argv, NameNode.USAGE, System.out, true)) {
      System.exit(0);
    }

    try {
      StringUtils.startupShutdownMessage(NameNode.class, argv, LOG);
      //调用本文件内的createNameNode函数,创建namenode对象
      NameNode namenode = createNameNode(argv, null);
      if (namenode != null) {
        namenode.join();
      }
    } catch (Throwable e) {
      LOG.error("Failed to start namenode.", e);
      terminate(1, e);
    }
  }
public static NameNode createNameNode(String argv[], Configuration conf)
      throws IOException {
    LOG.info("createNameNode " + Arrays.asList(argv));
    if (conf == null)
      conf = new HdfsConfiguration();
    // Parse out some generic args into Configuration.
    GenericOptionsParser hParser = new GenericOptionsParser(conf, argv);
    argv = hParser.getRemainingArgs();
    // Parse the rest, NN specific args.
    StartupOption startOpt = parseArguments(argv);
    if (startOpt == null) {
      printUsage(System.err);
      return null;
    }
    setStartupOption(conf, startOpt);

    boolean aborted = false;
    switch (startOpt) {
    case FORMAT:
      aborted = format(conf, startOpt.getForceFormat(),
          startOpt.getInteractiveFormat());
      terminate(aborted ? 1 : 0);
      return null; // avoid javac warning
    case GENCLUSTERID:
      System.err.println("Generating new cluster id:");
      System.out.println(NNStorage.newClusterID());
      terminate(0);
      return null;
    case ROLLBACK:
      aborted = doRollback(conf, true);
      terminate(aborted ? 1 : 0);
      return null; // avoid warning
    case BOOTSTRAPSTANDBY:
      String[] toolArgs = Arrays.copyOfRange(argv, 1, argv.length);
      int rc = BootstrapStandby.run(toolArgs, conf);
      terminate(rc);
      return null; // avoid warning
    case INITIALIZESHAREDEDITS:
      aborted = initializeSharedEdits(conf,
          startOpt.getForceFormat(),
          startOpt.getInteractiveFormat());
      terminate(aborted ? 1 : 0);
      return null; // avoid warning
    case BACKUP:
    case CHECKPOINT:
      NamenodeRole role = startOpt.toNodeRole();
      DefaultMetricsSystem.initialize(role.toString().replace(" ", ""));
      return new BackupNode(conf, role);
    case RECOVER:
      NameNode.doRecovery(startOpt, conf);
      return null;
    case METADATAVERSION:
      printMetadataVersion(conf);
      terminate(0);
      return null; // avoid javac warning
    case UPGRADEONLY:
      DefaultMetricsSystem.initialize("NameNode");
      new NameNode(conf);
      terminate(0);
      return null;
    default:
      DefaultMetricsSystem.initialize("NameNode");
      //讲白了就是通过构造器new一个NameNode对象
      return new NameNode(conf);
    }
  }
/**
NameNode构造函数
*/
protected NameNode(Configuration conf, NamenodeRole role)
      throws IOException {
    super(conf);
    this.tracer = new Tracer.Builder("NameNode").
        conf(TraceUtils.wrapHadoopConf(NAMENODE_HTRACE_PREFIX, conf)).
        build();
    this.tracerConfigurationManager =
        new TracerConfigurationManager(NAMENODE_HTRACE_PREFIX, conf);
    this.role = role;
    String nsId = getNameServiceId(conf);
    String namenodeId = HAUtil.getNameNodeId(conf, nsId);
    clientNamenodeAddress = NameNodeUtils.getClientNamenodeAddress(
        conf, nsId);

    if (clientNamenodeAddress != null) {
      LOG.info("Clients should use {} to access"
          + " this namenode/service.", clientNamenodeAddress);
    }
    //是否开启namenodeHA
    this.haEnabled = HAUtil.isHAEnabled(conf, nsId);
    state = createHAState(getStartupOption(conf));
    this.allowStaleStandbyReads = HAUtil.shouldAllowStandbyReads(conf);
    //创建HA上下文环境
    this.haContext = createHAContext();
    try {
      initializeGenericKeys(conf, nsId, namenodeId);
      //调用initialize函数,执行真正namenode的启动流程
      initialize(getConf());
      try {
        haContext.writeLock();
        state.prepareToEnterState(haContext);
        state.enterState(haContext);
      } finally {
        haContext.writeUnlock();
      }
    } catch (IOException e) {
      this.stopAtException(e);
      throw e;
    } catch (HadoopIllegalArgumentException e) {
      this.stopAtException(e);
      throw e;
    }
    this.started.set(true);
  }
/**
   * Initialize name-node.
   * 
   * @param conf the configuration
   */
  protected void initialize(Configuration conf) throws IOException {
    if (conf.get(HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS) == null) {
      String intervals = conf.get(DFS_METRICS_PERCENTILES_INTERVALS_KEY);
      if (intervals != null) {
        conf.set(HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS,
          intervals);
      }
    }

    UserGroupInformation.setConfiguration(conf);
    loginAsNameNodeUser(conf);

    NameNode.initMetrics(conf, this.getRole());
    StartupProgressMetrics.register(startupProgress);

    pauseMonitor = new JvmPauseMonitor();
    pauseMonitor.init(conf);
    pauseMonitor.start();
    metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);

    if (NamenodeRole.NAMENODE == role) {
      //启动流程-1:检查namenode权限,然后启动httpserver,让我们用户可以通过http请求获得web ui 
      //dw105:9870
      startHttpServer(conf);
    }
    //启动流程-2:调用本地文件中的loadNamesystem方法,加载镜像文件与编辑日志到内存中
    loadNamesystem(conf);
    startAliasMapServerIfNecessary(conf);
    
    //启动流程-3:调用createRpcServer方法,创建NameNodeRpc服务端
    rpcServer = createRpcServer(conf);

    initReconfigurableBackoffKey();

    if (clientNamenodeAddress == null) {
      // This is expected for MiniDFSCluster. Set it now using 
      // the RPC server's bind address.
      clientNamenodeAddress = 
          NetUtils.getHostPortString(getNameNodeAddress());
      LOG.info("Clients are to use " + clientNamenodeAddress + " to access"
          + " this namenode/service.");
    }
    if (NamenodeRole.NAMENODE == role) {
      httpServer.setNameNodeAddress(getNameNodeAddress());
      httpServer.setFSImage(getFSImage());
    }
    
    //启动流程-4:NameNode启动资源检查 & 启动流程-5:DataNode心跳超时判断
    startCommonServices(conf);
    startMetricsLogger(conf);
  }

2.2.启动HTTPServer

private void startHttpServer(final Configuration conf) throws IOException {
    //通过new NameNodeHttpServer对象获得httpServer,然后调用该对象的start方法
    httpServer = new NameNodeHttpServer(conf, this, getHttpServerBindAddress(conf));
    httpServer.start();
    httpServer.setStartupProgress(startupProgress);
  }
protected InetSocketAddress getHttpServerBindAddress(Configuration conf) {
    InetSocketAddress bindAddress = getHttpServerAddress(conf);

    // If DFS_NAMENODE_HTTP_BIND_HOST_KEY exists then it overrides the
    // host name portion of DFS_NAMENODE_HTTP_ADDRESS_KEY.
    final String bindHost = conf.getTrimmed(DFS_NAMENODE_HTTP_BIND_HOST_KEY);
    if (bindHost != null && !bindHost.isEmpty()) {
      bindAddress = new InetSocketAddress(bindHost, bindAddress.getPort());
    }

    return bindAddress;
  }
/** @return the NameNode HTTP address. */
public static InetSocketAddress getHttpAddress(Configuration conf) {
   return  NetUtils.createSocketAddr(
     conf.getTrimmed(DFS_NAMENODE_HTTP_ADDRESS_KEY, DFS_NAMENODE_HTTP_ADDRESS_DEFAULT));
}

public static final String DFS_NAMENODE_HTTP_ADDRESS_DEFAULT = "0.0.0.0:" + 
DFS_NAMENODE_HTTP_PORT_DEFAULT;

public static final int DFS_NAMENODE_HTTP_PORT_DEFAULT =
HdfsClientConfigKeys.DFS_NAMENODE_HTTP_PORT_DEFAULT;

int DFS_NAMENODE_HTTP_PORT_DEFAULT = 9870;

void start() throws IOException {
    HttpConfig.Policy policy = DFSUtil.getHttpPolicy(conf);
    final String infoHost = bindAddress.getHostName();

    final InetSocketAddress httpAddr = bindAddress;
    final String httpsAddrString = conf.getTrimmed(
        DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_KEY,
        DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_DEFAULT);
    InetSocketAddress httpsAddr = NetUtils.createSocketAddr(httpsAddrString);

    if (httpsAddr != null) {
      // If DFS_NAMENODE_HTTPS_BIND_HOST_KEY exists then it overrides the
      // host name portion of DFS_NAMENODE_HTTPS_ADDRESS_KEY.
      final String bindHost =
          conf.getTrimmed(DFSConfigKeys.DFS_NAMENODE_HTTPS_BIND_HOST_KEY);
      if (bindHost != null && !bindHost.isEmpty()) {
        httpsAddr = new InetSocketAddress(bindHost, httpsAddr.getPort());
      }
    }
    //Hadoop封装了自己的HttpServer,形成了自己的HttpServer2
    HttpServer2.Builder builder = DFSUtil.httpServerTemplateForNNAndJN(conf,
        httpAddr, httpsAddr, "hdfs",
        DFSConfigKeys.DFS_NAMENODE_KERBEROS_INTERNAL_SPNEGO_PRINCIPAL_KEY,
        DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY);

    final boolean xFrameEnabled = conf.getBoolean(
        DFSConfigKeys.DFS_XFRAME_OPTION_ENABLED,
        DFSConfigKeys.DFS_XFRAME_OPTION_ENABLED_DEFAULT);

    final String xFrameOptionValue = conf.getTrimmed(
        DFSConfigKeys.DFS_XFRAME_OPTION_VALUE,
        DFSConfigKeys.DFS_XFRAME_OPTION_VALUE_DEFAULT);

    builder.configureXFrame(xFrameEnabled).setXFrameOption(xFrameOptionValue);

    httpServer = builder.build();

    if (policy.isHttpsEnabled()) {
      // assume same ssl port for all datanodes
      InetSocketAddress datanodeSslPort = NetUtils.createSocketAddr(conf.getTrimmed(
          DFSConfigKeys.DFS_DATANODE_HTTPS_ADDRESS_KEY, infoHost + ":"
              + DFSConfigKeys.DFS_DATANODE_HTTPS_DEFAULT_PORT));
      httpServer.setAttribute(DFSConfigKeys.DFS_DATANODE_HTTPS_PORT_KEY,
          datanodeSslPort.getPort());
    }

    initWebHdfs(conf, bindAddress.getHostName(), httpServer,
        NamenodeWebHdfsMethods.class.getPackage().getName());

    httpServer.setAttribute(NAMENODE_ATTRIBUTE_KEY, nn);
    httpServer.setAttribute(JspHelper.CURRENT_CONF, conf);
    //进入该函数
    setupServlets(httpServer, conf);
    //真正启动httpServer服务
    httpServer.start();

    int connIdx = 0;
    if (policy.isHttpEnabled()) {
      httpAddress = httpServer.getConnectorAddress(connIdx++);
      conf.set(DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_KEY,
          NetUtils.getHostPortString(httpAddress));
    }

    if (policy.isHttpsEnabled()) {
      httpsAddress = httpServer.getConnectorAddress(connIdx);
      conf.set(DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_KEY,
          NetUtils.getHostPortString(httpsAddress));
    }
  }
private static void setupServlets(HttpServer2 httpServer, Configuration conf) {
    httpServer.addInternalServlet("startupProgress",
        StartupProgressServlet.PATH_SPEC, StartupProgressServlet.class);
    httpServer.addInternalServlet("fsck", "/fsck", FsckServlet.class,
        true);
    httpServer.addInternalServlet("imagetransfer", ImageServlet.PATH_SPEC,
        ImageServlet.class, true);
  }

2.3.加载镜像文件和编辑日志到内存

进入loadNamesystem方法

/**
NameNode.java
*/
protected void loadNamesystem(Configuration conf) throws IOException {
   this.namesystem = FSNamesystem.loadFromDisk(conf);
}

static FSNamesystem loadFromDisk(Configuration conf) throws IOException {

    checkConfiguration(conf);
    FSImage fsImage = new FSImage(conf,
        FSNamesystem.getNamespaceDirs(conf),
        FSNamesystem.getNamespaceEditsDirs(conf));
    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    if (startOpt == StartupOption.RECOVER) {
      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
    }

    long loadStart = monotonicNow();
    try {
      namesystem.loadFSImage(startOpt);
    } catch (IOException ioe) {
      LOG.warn("Encountered exception loading fsimage", ioe);
      fsImage.close();
      throw ioe;
    }
    long timeTakenToLoadFSImage = monotonicNow() - loadStart;
    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
    NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
    if (nnMetrics != null) {
      nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
    }
    namesystem.getFSDirectory().createReservedStatuses(namesystem.getCTime());
    return namesystem;
}

2.4.初始化NameNode的RPC服务端

调用createRpcServer方法

/**
NameNode.java
*/
rpcServer = createRpcServer(conf);

/**
* Create the RPC server implementation. Used as an extension point for the
* BackupNode.
*/
protected NameNodeRpcServer createRpcServer(Configuration conf)
    throws IOException {
  return new NameNodeRpcServer(conf, this);
}

/**
NameNodeRpcServer.java  实现了NameNodeProtocol RPC协议接口
*/

public class NameNodeRpcServer implements NamenodeProtocols {

    ...


    public NameNodeRpcServer(Configuration conf, NameNode nn)
      throws IOException {

        ...

         serviceRpcServer = new RPC.Builder(conf)
          .setProtocol(
              org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB.class)
          .setInstance(clientNNPbService)
          .setBindAddress(bindHost)
          .setPort(serviceRpcAddr.getPort())
          .setNumHandlers(serviceHandlerCount)
          .setVerbose(false)
          .setSecretManager(namesystem.getDelegationTokenSecretManager())
          .build();
        ...
    }

    ...

}

2.5.NameNode启动资源检查

进入startCommonService方法

/**

NameNode.java
*/

startCommonServices(conf);

/** Start the services common to active and standby states */
  private void startCommonServices(Configuration conf) throws IOException {
    namesystem.startCommonServices(conf, haContext);
    registerNNSMXBean();
    if (NamenodeRole.NAMENODE != role) {
      startHttpServer(conf);
      httpServer.setNameNodeAddress(getNameNodeAddress());
      httpServer.setFSImage(getFSImage());
    }
    rpcServer.start();
    try {
      plugins = conf.getInstances(DFS_NAMENODE_PLUGINS_KEY,
          ServicePlugin.class);
    } catch (RuntimeException e) {
      String pluginsValue = conf.get(DFS_NAMENODE_PLUGINS_KEY);
      LOG.error("Unable to load NameNode plugins. Specified list of plugins: " +
          pluginsValue, e);
      throw e;
    }
    for (ServicePlugin p: plugins) {
      try {
        p.start(this);
      } catch (Throwable t) {
        LOG.warn("ServicePlugin " + p + " could not be started", t);
      }
    }
    LOG.info(getRole() + " RPC up at: " + getNameNodeAddress());
    if (rpcServer.getServiceRpcAddress() != null) {
      LOG.info(getRole() + " service RPC up at: "
          + rpcServer.getServiceRpcAddress());
    }
  }
/**
FSNamesystem.java
*/
void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
    this.registerMBean(); // register the MBean for the FSNamesystemState
    writeLock();
    this.haContext = haContext;
    try {
      nnResourceChecker = new NameNodeResourceChecker(conf);
      //检查是否有足够的磁盘存储元数据,FSImage和Edits(默认都是100m)
      checkAvailableResources();
      assert !blockManager.isPopulatingReplQueues();
      StartupProgress prog = NameNode.getStartupProgress();
      prog.beginPhase(Phase.SAFEMODE);
      long completeBlocksTotal = getCompleteBlocksTotal();
      //安全模式
      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
          completeBlocksTotal);
      //启动块服务
      blockManager.activate(conf, completeBlocksTotal);
    } finally {
      writeUnlock("startCommonServices");
    }
    
    registerMXBean();
    DefaultMetricsSystem.instance().register(this);
    if (inodeAttributeProvider != null) {
      inodeAttributeProvider.start();
      dir.setINodeAttributeProvider(inodeAttributeProvider);
    }
    snapshotManager.registerMXBean();
    InetSocketAddress serviceAddress = NameNode.getServiceAddress(conf, true);
    this.nameNodeHostName = (serviceAddress != null) ?
        serviceAddress.getHostName() : "";
  }

进入NameNodeResourceChecker(conf)构造方法

/**
   * Create a NameNodeResourceChecker, which will check the edits dirs and any
   * additional dirs to check set in <code>conf</code>.
   */
  public NameNodeResourceChecker(Configuration conf) throws IOException {
    this.conf = conf;
    volumes = new HashMap<String, CheckedVolume>();
    //dfs.namenode.resource.du.reversed 默认值 1024*1024*100=100m
    duReserved = conf.getLong(DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_KEY,
        DFSConfigKeys.DFS_NAMENODE_DU_RESERVED_DEFAULT);
    
    Collection<URI> extraCheckedVolumes = Util.stringCollectionAsURIs(conf
        .getTrimmedStringCollection(DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_KEY));
    
    Collection<URI> localEditDirs = Collections2.filter(
        FSNamesystem.getNamespaceEditsDirs(conf),
        new Predicate<URI>() {
          @Override
          public boolean apply(URI input) {
            if (input.getScheme().equals(NNStorage.LOCAL_URI_SCHEME)) {
              return true;
            }
            return false;
          }
        });

    // Add all the local edits dirs, marking some as required if they are
    // configured as such.
    //对所有路径进行资源检查
    for (URI editsDirToCheck : localEditDirs) {
      addDirToCheck(editsDirToCheck,
          FSNamesystem.getRequiredNamespaceEditsDirs(conf).contains(
              editsDirToCheck));
    }

    // All extra checked volumes are marked "required"
    for (URI extraDirToCheck : extraCheckedVolumes) {
      addDirToCheck(extraDirToCheck, true);
    }
    
    minimumRedundantVolumes = conf.getInt(
        DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_KEY,
        DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_DEFAULT);
  }

再进入checkAvailableResources()方法

/**
FSNamesystem.java
*/
/**
   * Perform resource checks and cache the results.
   */
  void checkAvailableResources() {
    long resourceCheckTime = monotonicNow();
    Preconditions.checkState(nnResourceChecker != null,
        "nnResourceChecker not initialized");
    //判断资源是否足够
    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
    resourceCheckTime = monotonicNow() - resourceCheckTime;
    NameNode.getNameNodeMetrics().addResourceCheckTime(resourceCheckTime);
  }

/**
进入hasAvailableDiskSpacek,NameNodeRescourceChecker.java
*/
public boolean hasAvailableDiskSpace() {
    return NameNodeResourcePolicy.areResourcesAvailable(volumes.values(),
        minimumRedundantVolumes);
  }
static boolean areResourcesAvailable(
      Collection<? extends CheckableNameNodeResource> resources,
      int minimumRedundantResources) {

    // TODO: workaround:
    // - during startup, if there are no edits dirs on disk, then there is
    // a call to areResourcesAvailable() with no dirs at all, which was
    // previously causing the NN to enter safemode
    if (resources.isEmpty()) {
      return true;
    }
    
    int requiredResourceCount = 0;
    int redundantResourceCount = 0;
    int disabledRedundantResourceCount = 0;
    //判断资源是否充足
    for (CheckableNameNodeResource resource : resources) {
      if (!resource.isRequired()) {
        redundantResourceCount++;
        if (!resource.isResourceAvailable()) {
          disabledRedundantResourceCount++;
        }
      } else {
        requiredResourceCount++;
        if (!resource.isResourceAvailable()) {
          // Short circuit - a required resource is not available.//不充足则返回false
          return false;
        }
      }
    }
    
    if (redundantResourceCount == 0) {
      // If there are no redundant resources, return true if there are any
      // required resources available.
      return requiredResourceCount > 0;
    } else {
      return redundantResourceCount - disabledRedundantResourceCount >=
          minimumRedundantResources;
    }
  }

isResourceAvailable是个接口方法,找到其实现类中该方法的实现

/**
NameNodeResourceChecker.java
*/
@Override
    public boolean isResourceAvailable() {
      //获取当前目录所挂在的空间大小
      long availableSpace = df.getAvailable();
      if (LOG.isDebugEnabled()) {
        LOG.debug("Space available on volume '" + volume + "' is "
            + availableSpace);
      }
      //如果当前可用空间大小小于100M,返回false
      if (availableSpace < duReserved) {
        LOG.warn("Space available on volume '" + volume + "' is "
            + availableSpace +
            ", which is below the configured reserved amount " + duReserved);
        return false;
      } else {
        return true;
      }
    }


2.6.NameNode对DataNode的心跳超时进行判断

Ctrl + n 搜索 namenode,ctrl + f 搜索 startCommonServices

点击 namesystem.startCommonServices(conf, haContext);

点击 blockManager.activate(conf, completeBlocksTotal);

点击 datanodeManager.activate(conf);

/** DatanodeManager.java */
void activate(final Configuration conf) {
    datanodeAdminManager.activate(conf);
    heartbeatManager.activate();
  }

/** HeartbeatManager.java */
void activate() {
    heartbeatThread.start();
  }

//因为是线程,所以我们找到其任务体
@Override
    public void run() {
      while(namesystem.isRunning()) {
        restartHeartbeatStopWatch();
        try {
          final long now = Time.monotonicNow();
          if (lastHeartbeatCheck + heartbeatRecheckInterval < now) {
            heartbeatCheck();
            lastHeartbeatCheck = now;
          }
          if (blockManager.shouldUpdateBlockKey(now - lastBlockKeyUpdate)) {
            synchronized(HeartbeatManager.this) {
              for(DatanodeDescriptor d : datanodes) {
                d.setNeedKeyUpdate(true);
              }
            }
            lastBlockKeyUpdate = now;
          }
        } catch (Exception e) {
          LOG.error("Exception while checking heartbeat", e);
        }
        try {
          Thread.sleep(5000);  // 5 seconds
        } catch (InterruptedException ignored) {
        }
        // avoid declaring nodes dead for another cycle if a GC pause lasts
        // longer than the node recheck interval
        if (shouldAbortHeartbeatCheck(-5000)) {
          LOG.warn("Skipping next heartbeat scan due to excessive pause");
          lastHeartbeatCheck = Time.monotonicNow();
        }
      }
    }

//看到heartbeatCheck方法,进入
void heartbeatCheck() {
    final DatanodeManager dm = blockManager.getDatanodeManager();
    // It's OK to check safe mode w/o taking the lock here, we re-check
    // for safe mode after taking the lock before removing a datanode.
    if (namesystem.isInStartupSafeMode()) {
      return;
    }
    boolean allAlive = false;
    while (!allAlive) {
      // locate the first dead node.
      DatanodeDescriptor dead = null;

      // locate the first failed storage that isn't on a dead node.
      DatanodeStorageInfo failedStorage = null;

      // check the number of stale nodes
      int numOfStaleNodes = 0;
      int numOfStaleStorages = 0;
      synchronized(this) {
        for (DatanodeDescriptor d : datanodes) {
          // check if an excessive GC pause has occurred
          if (shouldAbortHeartbeatCheck(0)) {
            return;
          }
          if (dead == null && dm.isDatanodeDead(d)) {
            stats.incrExpiredHeartbeats();
            dead = d;
          }
    ...
}

//进入isDatanodeDead判断该datanode是否宕机的方法
/** Is the datanode dead? */
  boolean isDatanodeDead(DatanodeDescriptor node) {
    return (node.getLastUpdateMonotonic() <
            (monotonicNow() - heartbeatExpireInterval));
  }

//和heartbeatExpireInterval这个参数有关
this.heartbeatExpireInterval = 2 * heartbeatRecheckInterval
        + 10 * 1000 * heartbeatIntervalSeconds;//2 * 5min + 10 * 1000 * 3 = 10min + 30s

heartbeatRecheckInterval = conf.getInt(
        DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, 
        DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT);

public static final int DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT = 5*60*1000;//5min

heartbeatIntervalSeconds = conf.getTimeDuration(
        DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
        DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT, TimeUnit.SECONDS);

public static final long DFS_HEARTBEAT_INTERVAL_DEFAULT = 3;



2.7.安全模式

/** FSNamesystem.java */
void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
 this.registerMBean(); // register the MBean for the FSNamesystemState
 writeLock();
 this.haContext = haContext;
 try {
 nnResourceChecker = new NameNodeResourceChecker(conf);
 // 检查是否有足够的磁盘存储元数据(fsimage(默认 100m)editLog(默认 100m))
 checkAvailableResources();
 assert !blockManager.isPopulatingReplQueues();
 StartupProgress prog = NameNode.getStartupProgress();
 // 开始进入安全模式
 prog.beginPhase(Phase.SAFEMODE);
 // 获取所有可以正常使用的 block
long completeBlocksTotal = getCompleteBlocksTotal();
 prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
 completeBlocksTotal);
 // 启动块服务
 blockManager.activate(conf, completeBlocksTotal);
 } finally {
 writeUnlock("startCommonServices");
}
 
 registerMXBean();
 DefaultMetricsSystem.instance().register(this);
 if (inodeAttributeProvider != null) {
 inodeAttributeProvider.start();
 dir.setINodeAttributeProvider(inodeAttributeProvider);
 }
 snapshotManager.registerMXBean();
 InetSocketAddress serviceAddress = NameNode.getServiceAddress(conf, true);
 this.nameNodeHostName = (serviceAddress != null) ?
 serviceAddress.getHostName() : "";
}
public long getCompleteBlocksTotal() {
    // Calculate number of blocks under construction
    long numUCBlocks = 0;
    readLock();
    try {
      //获取正在构建的块
      numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
      // 获取所有的块 - 正在构建的 block = 可以正常使用的 block
      return getBlocksTotal() - numUCBlocks;
    } finally {
      readUnlock("getCompleteBlocksTotal");
    }
  }
**
   * Initialize the safe mode information.
   * @param total initial total blocks
   */
  void activate(long total) {
    assert namesystem.hasWriteLock();
    assert status == BMSafeModeStatus.OFF;

    startTime = monotonicNow();
    //计算是否满足块个数的阈值,满足阈值才能说明系统是安全的
    setBlockTotal(total);
    //判断DataNode节点和块信息是否达到退出安全模式的标准
    if (areThresholdsMet()) {
      boolean exitResult = leaveSafeMode(false);
      Preconditions.checkState(exitResult, "Failed to leave safe mode.");
    } else {
      // enter safe mode
      status = BMSafeModeStatus.PENDING_THRESHOLD;
      initializeReplQueuesIfNecessary();
      reportStatus("STATE* Safe mode ON.", true);
      lastStatusReport = monotonicNow();
    }
  }
//进入setBlockTotal
void setBlockTotal(long total) {
 assert namesystem.hasWriteLock();
 synchronized (this) {
 this.blockTotal = total;
// 计算阈值:例如:1000 个正常的块 * 0.999 = 999
 this.blockThreshold = (long) (total * threshold);
 }
 
 this.blockReplQueueThreshold = (long) (total * replQueueThreshold);
}

this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
 DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);

public static final float DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT = 0.999f;
//进入areThresholdsMet方法判断是否可以退出安全模式
private boolean areThresholdsMet() {
 assert namesystem.hasWriteLock();
 // Calculating the number of live datanodes is time-consuming
 // in large clusters. Skip it when datanodeThreshold is zero.
 int datanodeNum = 0;
 if (datanodeThreshold > 0) {
 datanodeNum = blockManager.getDatanodeManager().getNumLiveDataNodes();
 }
 synchronized (this) {
 // 已经正常注册的块数 》= 块的最小阈值 》=最小可用 DataNode
 return blockSafe >= blockThreshold && datanodeNum >= datanodeThreshold;
 }
}

三、DataNode启动流程

3.1.入口

同样地,由于DataNode是hdfs启动时开启的一个JVM进程实例,所以我们找到DataNode.java源文件,执行其中的main方法。

DataNode.java

public static void main(String args[]) {
    if (DFSUtil.parseHelpArgument(args, DataNode.USAGE, System.out, true)) {
      System.exit(0);
    }

    secureMain(args, null);
  }

进入secureMain(args, null)方法

public static void secureMain(String args[], SecureResources resources) {
    int errorCode = 0;
    try {
      StringUtils.startupShutdownMessage(DataNode.class, args, LOG);
      DataNode datanode = createDataNode(args, null, resources);
      if (datanode != null) {
        datanode.join();
      } else {
        errorCode = 1;
      }
    } catch (Throwable e) {
      LOG.error("Exception in secureMain", e);
      terminate(1, e);
    } finally {
      // We need to terminate the process here because either shutdown was called
      // or some disk related conditions like volumes tolerated or volumes required
      // condition was not met. Also, In secure mode, control will go to Jsvc
      // and Datanode process hangs if it does not exit.
      LOG.warn("Exiting Datanode");
      terminate(errorCode);
    }
  }

调用createDataNode(args, null, resources)方法,创建datanode对象

进入createDataNode方法

/** Instantiate & Start a single datanode daemon and wait for it to finish.
   *  If this thread is specifically interrupted, it will stop waiting.
初始化并启动一个单datanode守护进程 然后等待它完成。
如果该线程被中断了,将停止等待
   */
  @VisibleForTesting
  @InterfaceAudience.Private
  public static DataNode createDataNode(String args[], Configuration conf,
      SecureResources resources) throws IOException {
    DataNode dn = instantiateDataNode(args, conf, resources);
    if (dn != null) {
      dn.runDatanodeDaemon();
    }
    return dn;
  }

实际上是调用instantiateDataNode方法构建的datanode对象实例,然后如果该对象不为空,就执行datanode对象的runDatanodeDaemon方法,启动datanode线程

先进入instantiateDataNode方法

/** Instantiate a single datanode object, along with its secure resources. 
   * This must be run by invoking{@link DataNode#runDatanodeDaemon()} 
   * subsequently. 
实例化datanode对象
   */
  public static DataNode instantiateDataNode(String args [], Configuration conf,
      SecureResources resources) throws IOException {
    if (conf == null)
      conf = new HdfsConfiguration();
    
    if (args != null) {
      // parse generic hadoop options
      GenericOptionsParser hParser = new GenericOptionsParser(conf, args);
      args = hParser.getRemainingArgs();
    }
    
    if (!parseArguments(args, conf)) {
      printUsage(System.err);
      return null;
    }
    //根据配置信息得到我们的数据在datanode上的存储位置,返回的是一个StorageLocation泛型的集合
    Collection<StorageLocation> dataLocations = getStorageLocations(conf);
    UserGroupInformation.setConfiguration(conf);
    SecurityUtil.login(conf, DFS_DATANODE_KEYTAB_FILE_KEY,
        DFS_DATANODE_KERBEROS_PRINCIPAL_KEY, getHostName(conf));
    return makeInstance(dataLocations, conf, resources);
  }

又实际上,是调用makeInstance方法构建datanode实例

static DataNode makeInstance(Collection<StorageLocation> dataDirs,
      Configuration conf, SecureResources resources) throws IOException {
    List<StorageLocation> locations;
    StorageLocationChecker storageLocationChecker =
        new StorageLocationChecker(conf, new Timer());
    try {
      locations = storageLocationChecker.check(conf, dataDirs);
    } catch (InterruptedException ie) {
      throw new IOException("Failed to instantiate DataNode", ie);
    }
    DefaultMetricsSystem.initialize("DataNode");

    assert locations.size() > 0 : "number of data directories should be > 0";
    return new DataNode(conf, locations, storageLocationChecker, resources);
  }

果然,在makeInstance方法里,返回了DataNode对象,所以,我们进入DataNode构造方法

/**
   * Create the DataNode given a configuration, an array of dataDirs,
   * and a namenode proxy.
   */
  DataNode(final Configuration conf,
           final List<StorageLocation> dataDirs,
           final StorageLocationChecker storageLocationChecker,
           final SecureResources resources) throws IOException {
    super(conf);
    this.tracer = createTracer(conf);
    this.tracerConfigurationManager =
        new TracerConfigurationManager(DATANODE_HTRACE_PREFIX, conf);
    this.fileIoProvider = new FileIoProvider(conf, this);
    this.blockScanner = new BlockScanner(this);
    this.lastDiskErrorCheck = 0;
    this.maxNumberOfBlocksToLog = conf.getLong(DFS_MAX_NUM_BLOCKS_TO_LOG_KEY,
        DFS_MAX_NUM_BLOCKS_TO_LOG_DEFAULT);

    this.usersWithLocalPathAccess = Arrays.asList(
        conf.getTrimmedStrings(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY));
    this.connectToDnViaHostname = conf.getBoolean(
        DFSConfigKeys.DFS_DATANODE_USE_DN_HOSTNAME,
        DFSConfigKeys.DFS_DATANODE_USE_DN_HOSTNAME_DEFAULT);
    this.supergroup = conf.get(DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY,
        DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
    this.isPermissionEnabled = conf.getBoolean(
        DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY,
        DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT);
    this.pipelineSupportECN = conf.getBoolean(
        DFSConfigKeys.DFS_PIPELINE_ECN_ENABLED,
        DFSConfigKeys.DFS_PIPELINE_ECN_ENABLED_DEFAULT);

    confVersion = "core-" +
        conf.get("hadoop.common.configuration.version", "UNSPECIFIED") +
        ",hdfs-" +
        conf.get("hadoop.hdfs.configuration.version", "UNSPECIFIED");

    this.volumeChecker = new DatasetVolumeChecker(conf, new Timer());

    // Determine whether we should try to pass file descriptors to clients.
    if (conf.getBoolean(HdfsClientConfigKeys.Read.ShortCircuit.KEY,
              HdfsClientConfigKeys.Read.ShortCircuit.DEFAULT)) {
      String reason = DomainSocket.getLoadingFailureReason();
      if (reason != null) {
        LOG.warn("File descriptor passing is disabled because {}", reason);
        this.fileDescriptorPassingDisabledReason = reason;
      } else {
        LOG.info("File descriptor passing is enabled.");
        this.fileDescriptorPassingDisabledReason = null;
      }
    } else {
      this.fileDescriptorPassingDisabledReason =
          "File descriptor passing was not configured.";
      LOG.debug(this.fileDescriptorPassingDisabledReason);
    }

    this.socketFactory = NetUtils.getDefaultSocketFactory(conf);

    try {
      hostName = getHostName(conf);
      LOG.info("Configured hostname is {}", hostName);
      startDataNode(dataDirs, resources);
    } catch (IOException ie) {
      shutdown();
      throw ie;
    }
    final int dncCacheMaxSize =
        conf.getInt(DFS_DATANODE_NETWORK_COUNTS_CACHE_MAX_SIZE_KEY,
            DFS_DATANODE_NETWORK_COUNTS_CACHE_MAX_SIZE_DEFAULT) ;
    datanodeNetworkCounts =
        CacheBuilder.newBuilder()
            .maximumSize(dncCacheMaxSize)
            .build(new CacheLoader<String, Map<String, Long>>() {
              @Override
              public Map<String, Long> load(String key) throws Exception {
                final Map<String, Long> ret = new HashMap<String, Long>();
                ret.put("networkErrors", 0L);
                return ret;
              }
            });

    initOOBTimeout();
    this.storageLocationChecker = storageLocationChecker;
  }

startDataNode方法被我们发现了,这个方法里面就是DataNode的具体启动流程,进去看看

void startDataNode(List<StorageLocation> dataDirectories,
                     SecureResources resources
                     ) throws IOException {

    // settings global for all BPs in the Data Node
    this.secureResources = resources;
    synchronized (this) {
      this.dataDirs = dataDirectories;
    }
    this.dnConf = new DNConf(this);
    checkSecureConfig(dnConf, getConf(), resources);

    if (dnConf.maxLockedMemory > 0) {
      if (!NativeIO.POSIX.getCacheManipulator().verifyCanMlock()) {
        throw new RuntimeException(String.format(
            "Cannot start datanode because the configured max locked memory" +
            " size (%s) is greater than zero and native code is not available.",
            DFS_DATANODE_MAX_LOCKED_MEMORY_KEY));
      }
      if (Path.WINDOWS) {
        NativeIO.Windows.extendWorkingSetSize(dnConf.maxLockedMemory);
      } else {
        long ulimit = NativeIO.POSIX.getCacheManipulator().getMemlockLimit();
        if (dnConf.maxLockedMemory > ulimit) {
          throw new RuntimeException(String.format(
            "Cannot start datanode because the configured max locked memory" +
            " size (%s) of %d bytes is more than the datanode's available" +
            " RLIMIT_MEMLOCK ulimit of %d bytes.",
            DFS_DATANODE_MAX_LOCKED_MEMORY_KEY,
            dnConf.maxLockedMemory,
            ulimit));
        }
      }
    }
    LOG.info("Starting DataNode with maxLockedMemory = {}",
        dnConf.maxLockedMemory);

    int volFailuresTolerated = dnConf.getVolFailuresTolerated();
    int volsConfigured = dnConf.getVolsConfigured();
    if (volFailuresTolerated < MAX_VOLUME_FAILURE_TOLERATED_LIMIT
        || volFailuresTolerated >= volsConfigured) {
      throw new DiskErrorException("Invalid value configured for "
          + "dfs.datanode.failed.volumes.tolerated - " + volFailuresTolerated
          + ". Value configured is either less than -1 or >= "
          + "to the number of configured volumes (" + volsConfigured + ").");
    }

    storage = new DataStorage();
    
    // global DN settings
    registerMXBean();
    //初始化DataXceiver服务
    initDataXceiver();
    //初始化HTTP服务
    startInfoServer();
    pauseMonitor = new JvmPauseMonitor();
    pauseMonitor.init(getConf());
    pauseMonitor.start();
  
    // BlockPoolTokenSecretManager is required to create ipc server.
    this.blockPoolTokenSecretManager = new BlockPoolTokenSecretManager();

    // Login is done by now. Set the DN user name.
    dnUserName = UserGroupInformation.getCurrentUser().getUserName();
    LOG.info("dnUserName = {}", dnUserName);
    LOG.info("supergroup = {}", supergroup);
    //初始化DataNode的RPC服务端
    initIpcServer();

    metrics = DataNodeMetrics.create(getConf(), getDisplayName());
    peerMetrics = dnConf.peerStatsEnabled ?
        DataNodePeerMetrics.create(getDisplayName(), getConf()) : null;
    metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);

    ecWorker = new ErasureCodingWorker(getConf(), this);
    blockRecoveryWorker = new BlockRecoveryWorker(this);

    blockPoolManager = new BlockPoolManager(this);
    //DataNode向NameNode注册
    blockPoolManager.refreshNamenodes(getConf());

    // Create the ReadaheadPool from the DataNode context so we can
    // exit without having to explicitly shutdown its thread pool.
    readaheadPool = ReadaheadPool.getInstance();
    saslClient = new SaslDataTransferClient(dnConf.getConf(),
        dnConf.saslPropsResolver, dnConf.trustedChannelResolver);
    saslServer = new SaslDataTransferServer(dnConf, blockPoolTokenSecretManager);
    startMetricsLogger();

    if (dnConf.diskStatsEnabled) {
      diskMetrics = new DataNodeDiskMetrics(this,
          dnConf.outliersReportIntervalMs);
    }
  }

3.2.初始化DataXceiver服务,用于接收客户端Streamer的socket请求或其他DataNode发送过来的socket请求

private void initDataXceiver() throws IOException {
    // find free port or use privileged port provided
    TcpPeerServer tcpPeerServer;
    if (secureResources != null) {
      tcpPeerServer = new TcpPeerServer(secureResources);
    } else {
      int backlogLength = getConf().getInt(
          CommonConfigurationKeysPublic.IPC_SERVER_LISTEN_QUEUE_SIZE_KEY,
          CommonConfigurationKeysPublic.IPC_SERVER_LISTEN_QUEUE_SIZE_DEFAULT);
      tcpPeerServer = new TcpPeerServer(dnConf.socketWriteTimeout,
          DataNode.getStreamingAddr(getConf()), backlogLength);
    }
    if (dnConf.getTransferSocketRecvBufferSize() > 0) {
      tcpPeerServer.setReceiveBufferSize(
          dnConf.getTransferSocketRecvBufferSize());
    }
    streamingAddr = tcpPeerServer.getStreamingAddr();
    LOG.info("Opened streaming server at {}", streamingAddr);
    this.threadGroup = new ThreadGroup("dataXceiverServer");
    xserver = new DataXceiverServer(tcpPeerServer, getConf(), this);
    //这里可以看到dataXceiver是一个服务
    this.dataXceiverServer = new Daemon(threadGroup, xserver);
    this.threadGroup.setDaemon(true); // auto destroy when empty

    if (getConf().getBoolean(
        HdfsClientConfigKeys.Read.ShortCircuit.KEY,
        HdfsClientConfigKeys.Read.ShortCircuit.DEFAULT) ||
        getConf().getBoolean(
            HdfsClientConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC,
            HdfsClientConfigKeys
              .DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC_DEFAULT)) {
      DomainPeerServer domainPeerServer =
                getDomainPeerServer(getConf(), streamingAddr.getPort());
      if (domainPeerServer != null) {
        this.localDataXceiverServer = new Daemon(threadGroup,
            new DataXceiverServer(domainPeerServer, getConf(), this));
        LOG.info("Listening on UNIX domain socket: {}",
            domainPeerServer.getBindPath());
      }
    }
    this.shortCircuitRegistry = new ShortCircuitRegistry(getConf());
  }

3.3.初始化HTTP服务

private void startInfoServer()
    throws IOException {
    // SecureDataNodeStarter will bind the privileged port to the channel if
    // the DN is started by JSVC, pass it along.
    ServerSocketChannel httpServerChannel = secureResources != null ?
        secureResources.getHttpServerChannel() : null;
    //就是这里,构建datanodeHttp服务对象,并启动
    httpServer = new DatanodeHttpServer(getConf(), this, httpServerChannel);
    httpServer.start();
    if (httpServer.getHttpAddress() != null) {
      infoPort = httpServer.getHttpAddress().getPort();
    }
    if (httpServer.getHttpsAddress() != null) {
      infoSecurePort = httpServer.getHttpsAddress().getPort();
    }
  }
public DatanodeHttpServer(final Configuration conf,
      final DataNode datanode,
      final ServerSocketChannel externalHttpChannel)
    throws IOException {
    ...

    HttpServer2.Builder builder = new HttpServer2.Builder()
        .setName("datanode")
        .setConf(confForInfoServer)
        .setACL(new AccessControlList(conf.get(DFS_ADMIN, " ")))
        .hostName(getHostnameForSpnegoPrincipal(confForInfoServer))
        .addEndpoint(URI.create("http://localhost:" + proxyPort))
        .setFindPort(true);    


}

3.4.初始化DataNode的RPC服务端

private void initIpcServer() throws IOException {
    ...

    ipcServer = new RPC.Builder(getConf())
        .setProtocol(ClientDatanodeProtocolPB.class)
        .setInstance(service)
        .setBindAddress(ipcAddr.getHostName())
        .setPort(ipcAddr.getPort())
        .setNumHandlers(
            getConf().getInt(DFS_DATANODE_HANDLER_COUNT_KEY,
                DFS_DATANODE_HANDLER_COUNT_DEFAULT)).setVerbose(false)
        .setSecretManager(blockPoolTokenSecretManager).build();
    ...
}

3.5.DataNode向NameNode注册

void refreshNamenodes(Configuration conf)
      throws IOException {
    LOG.info("Refresh request received for nameservices: " +
        conf.get(DFSConfigKeys.DFS_NAMESERVICES));

    Map<String, Map<String, InetSocketAddress>> newAddressMap = null;
    Map<String, Map<String, InetSocketAddress>> newLifelineAddressMap = null;

    try {
      newAddressMap =
          DFSUtil.getNNServiceRpcAddressesForCluster(conf);
      newLifelineAddressMap =
          DFSUtil.getNNLifelineRpcAddressesForCluster(conf);
    } catch (IOException ioe) {
      LOG.warn("Unable to get NameNode addresses.");
    }

    if (newAddressMap == null || newAddressMap.isEmpty()) {
      throw new IOException("No services to connect, missing NameNode " +
          "address.");
    }

    synchronized (refreshNamenodesLock) {
      //就是这个方法dn向nn注册
      doRefreshNamenodes(newAddressMap, newLifelineAddressMap);
    }
  }
private void doRefreshNamenodes(
      Map<String, Map<String, InetSocketAddress>> addrMap,
      Map<String, Map<String, InetSocketAddress>> lifelineAddrMap)
      throws IOException {
       ...

      
      // Step 3. Start new nameservices
      if (!toAdd.isEmpty()) {
        LOG.info("Starting BPOfferServices for nameservices: " +
            Joiner.on(",").useForNull("<default>").join(toAdd));
      
        for (String nsToAdd : toAdd) {
          ...
          BPOfferService bpos = createBPOS(nsToAdd, addrs, lifelineAddrs);
          bpByNameserviceId.put(nsToAdd, bpos);
          offerServices.add(bpos);
        }
      }
      startAll();
    }

protected BPOfferService createBPOS(
      final String nameserviceId,
      List<InetSocketAddress> nnAddrs,
      List<InetSocketAddress> lifelineNnAddrs) {
    return new BPOfferService(nameserviceId, nnAddrs, lifelineNnAddrs, dn);
  }
BPOfferService(
      final String nameserviceId,
      List<InetSocketAddress> nnAddrs,
      List<InetSocketAddress> lifelineNnAddrs,
      DataNode dn) {
    Preconditions.checkArgument(!nnAddrs.isEmpty(),
        "Must pass at least one NN.");
    Preconditions.checkArgument(nnAddrs.size() == lifelineNnAddrs.size(),
        "Must pass same number of NN addresses and lifeline addresses.");
    this.nameserviceId = nameserviceId;
    this.dn = dn;

    for (int i = 0; i < nnAddrs.size(); ++i) {
      //有多少个NameNode就创建多少个服务,因为datanode需要向每个namenode都进行注册
      this.bpServices.add(new BPServiceActor(nnAddrs.get(i),
          lifelineNnAddrs.get(i), this));
    }
  }

 BlockPoolManager.java

synchronized void startAll() throws IOException {
    try {
      UserGroupInformation.getLoginUser().doAs(
          new PrivilegedExceptionAction<Object>() {
            @Override
            public Object run() throws Exception {
              for (BPOfferService bpos : offerServices) {
                //启动服务,等待接收dn的注册
                bpos.start();
              }
              return null;
            }
          });
    } catch (InterruptedException ex) {
      IOException ioe = new IOException();
      ioe.initCause(ex.getCause());
      throw ioe;
    }
  }

BPOfferService.java

//This must be called only by blockPoolManager
  void start() {
    for (BPServiceActor actor : bpServices) {
      actor.start();
    }
  }

BPServiceActor.java

//This must be called only by BPOfferService
  void start() {
    if ((bpThread != null) && (bpThread.isAlive())) {
      //Thread is started already
      return;
    }
    bpThread = new Thread(this);
    bpThread.setDaemon(true); // needed for JUnit testing
    //找到该线程的run执行体
    bpThread.start();

    if (lifelineSender != null) {
      lifelineSender.start();
    }
  }
@Override
  public void run() {
    LOG.info(this + " starting to offer service");

    try {
      while (true) {
        // init stuff
        try {
          // setup storage 向namenode注册
          connectToNNAndHandshake();
          break;
        } catch (IOException ioe) {
          // Initial handshake, storage recovery or registration failed
          runningState = RunningState.INIT_FAILED;
          if (shouldRetryInit()) {
            // Retry until all namenode's of BPOS failed initialization
            LOG.error("Initialization failed for " + this + " "
                + ioe.getLocalizedMessage());
            //注册失败 5s后重试
            sleepAndLogInterrupts(5000, "initializing");
          } else {
            runningState = RunningState.FAILED;
            LOG.error("Initialization failed for " + this + ". Exiting. ", ioe);
            return;
          }
        }
      }

      runningState = RunningState.RUNNING;
      if (initialRegistrationComplete != null) {
        initialRegistrationComplete.countDown();
      }

      while (shouldRun()) {
        try {
          //发送心跳
          offerService();
        } catch (Exception ex) {
          LOG.error("Exception in BPOfferService for " + this, ex);
          sleepAndLogInterrupts(5000, "offering service");
        }
      }
      runningState = RunningState.EXITED;
    } catch (Throwable ex) {
      LOG.warn("Unexpected exception in block pool " + this, ex);
      runningState = RunningState.FAILED;
    } finally {
      LOG.warn("Ending block pool service for: " + this);
      cleanUp();
    }
  }
private void connectToNNAndHandshake() throws IOException {
    // get NN proxy  获取namenode的PRC客户端对象
    bpNamenode = dn.connectToNN(nnAddr);

    // First phase of the handshake with NN - get the namespace
    // info.
    NamespaceInfo nsInfo = retrieveNamespaceInfo();

    // Verify that this matches the other NN in this HA pair.
    // This also initializes our block pool in the DN if we are
    // the first NN connection for this BP.
    bpos.verifyAndSetNamespaceInfo(this, nsInfo);

    /* set thread name again to include NamespaceInfo when it's available. */
    this.bpThread.setName(formatThreadName("heartbeating", nnAddr));

    // Second phase of the handshake with the NN.  注册
    register(nsInfo);
  }

DataNode.java 

DatanodeProtocolClientSideTranslatorPB connectToNN(
      InetSocketAddress nnAddr) throws IOException {
    return new DatanodeProtocolClientSideTranslatorPB(nnAddr, getConf());
  }

DatanodeProtocolClientSideTranslatorPB.java

public DatanodeProtocolClientSideTranslatorPB(InetSocketAddress nameNodeAddr,
      Configuration conf) throws IOException {
    RPC.setProtocolEngine(conf, DatanodeProtocolPB.class,
        ProtobufRpcEngine.class);
    UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
    //得到namenode的RPC客户端代理
    rpcProxy = createNamenode(nameNodeAddr, conf, ugi);
  }

private static DatanodeProtocolPB createNamenode(
      InetSocketAddress nameNodeAddr, Configuration conf,
      UserGroupInformation ugi) throws IOException {
    return RPC.getProxy(DatanodeProtocolPB.class,
        RPC.getProtocolVersion(DatanodeProtocolPB.class), nameNodeAddr, ugi,
        conf, NetUtils.getSocketFactory(conf, DatanodeProtocolPB.class));
  }

上面就是很经典的得到RPC客户端代理的方法RPC.getProxy

然后就是进入真正的注册流程,即register方法

void register(NamespaceInfo nsInfo) throws IOException {
    // The handshake() phase loaded the block pool storage
    // off disk - so update the bpRegistration object from that info
    DatanodeRegistration newBpRegistration = bpos.createRegistration();

    LOG.info(this + " beginning handshake with NN");

    while (shouldRun()) {
      try {
        // Use returned registration from namenode with updated fields
        //这一块就是通过RPC远程调用DataNodeProtocol接口里面的registerDatanode方法,
        //实际方法的实现是在NameNode PRC 服务端实现的
        newBpRegistration = bpNamenode.registerDatanode(newBpRegistration);
        newBpRegistration.setNamespaceInfo(nsInfo);
        bpRegistration = newBpRegistration;
        break;
      } catch(EOFException e) {  // namenode might have just restarted
        LOG.info("Problem connecting to server: " + nnAddr + " :"
            + e.getLocalizedMessage());
        sleepAndLogInterrupts(1000, "connecting to server");
      } catch(SocketTimeoutException e) {  // namenode is busy
        LOG.info("Problem connecting to server: " + nnAddr);
        sleepAndLogInterrupts(1000, "connecting to server");
      }
    }
    
    LOG.info("Block pool " + this + " successfully registered with NN");
    bpos.registrationSucceeded(this, bpRegistration);

    // reset lease id whenever registered to NN.
    // ask for a new lease id at the next heartbeat.
    fullBlockReportLeaseId = 0;

    // random short delay - helps scatter the BR from all DNs
    scheduler.scheduleBlockReport(dnConf.initialBlockReportDelayMs);
  }

 DatanodeProtocolClientSideTranslatorPB.java

@Override
  public DatanodeRegistration registerDatanode(DatanodeRegistration registration
      ) throws IOException {
    RegisterDatanodeRequestProto.Builder builder = RegisterDatanodeRequestProto
        .newBuilder().setRegistration(PBHelper.convert(registration));
    RegisterDatanodeResponseProto resp;
    try {
      resp = rpcProxy.registerDatanode(NULL_CONTROLLER, builder.build());
    } catch (ServiceException se) {
      throw ProtobufHelper.getRemoteException(se);
    }
    return PBHelper.convert(resp.getRegistration());
  }

可以看到是通过rpcProxy也就是NN的PRC客户端调用的注册方法

DataNodeProtocol.java

public DatanodeRegistration registerDatanode(DatanodeRegistration registration
      ) throws IOException;

实际实现是在NameNodeRpcServer.java中

public DatanodeRegistration registerDatanode(DatanodeRegistration nodeReg)
      throws IOException {
    checkNNStartup();
    verifySoftwareVersion(nodeReg);
    namesystem.registerDatanode(nodeReg);
    return nodeReg;
  }

FSnamesystem.java

void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
    writeLock();
    try {
      blockManager.registerDatanode(nodeReg);
    } finally {
      writeUnlock("registerDatanode");
    }
  }

BlockManager.java

public void registerDatanode(DatanodeRegistration nodeReg)
      throws IOException {
    assert namesystem.hasWriteLock();
    datanodeManager.registerDatanode(nodeReg);
    bmSafeMode.checkSafeMode();
  }

DatanodeManager.java

public void registerDatanode(DatanodeRegistration nodeReg)
      throws DisallowedDatanodeException, UnresolvedTopologyException {
       ...
        // register new datanode 注册DN
        addDatanode(nodeDescr);
        blockManager.getBlockReportLeaseManager().register(nodeDescr);
        // also treat the registration message as a heartbeat
        // no need to update its timestamp
        // because its is done when the descriptor is created
        //将DN加入到心跳管理
        heartbeatManager.addDatanode(nodeDescr);
        heartbeatManager.updateDnStat(nodeDescr);
        incrementVersionCount(nodeReg.getSoftwareVersion());
        startAdminOperationIfNecessary(nodeDescr);
        success = true;
         ...
  }


void addDatanode(final DatanodeDescriptor node) {
    // To keep host2DatanodeMap consistent with datanodeMap,
    // remove  from host2DatanodeMap the datanodeDescriptor removed
    // from datanodeMap before adding node to host2DatanodeMap.
    synchronized(this) {
      host2DatanodeMap.remove(datanodeMap.put(node.getDatanodeUuid(), node));
    }

    networktopology.add(node); // may throw InvalidTopologyException
    //datanode启动的时候,datanode会向namenode注册,这时候就会将主机和datanode映射信息加入到            //元数据里
    host2DatanodeMap.add(node);
    checkIfClusterIsNowMultiRack(node);
    resolveUpgradeDomain(node);

    if (LOG.isDebugEnabled()) {
      LOG.debug(getClass().getSimpleName() + ".addDatanode: "
          + "node " + node + " is added to datanodeMap.");
    }
  }

3.6.向NN发送心跳

BPServiceActor.java

private void offerService() throws Exception {
       ...

    //
    // Now loop for a long time....
    //
    while (shouldRun()) {
      try {
        DataNodeFaultInjector.get().startOfferService();
        final long startTime = scheduler.monotonicNow();

        //
        // Every so often, send heartbeat or block-report
        //
        final boolean sendHeartbeat = scheduler.isHeartbeatDue(startTime);
        HeartbeatResponse resp = null;
        if (sendHeartbeat) {
          //
          // All heartbeat messages include following info:
          // -- Datanode name
          // -- data transfer port
          // -- Total capacity
          // -- Bytes remaining
          //
          boolean requestBlockReportLease = (fullBlockReportLeaseId == 0) &&
                  scheduler.isBlockReportDue(startTime);
          if (!dn.areHeartbeatsDisabledForTests()) {
            //发送心跳信息
            resp = sendHeartBeat(requestBlockReportLease);
            assert resp != null;
            if (resp.getFullBlockReportLeaseId() != 0) {
              if (fullBlockReportLeaseId != 0) {
                LOG.warn(nnAddr + " sent back a full block report lease " +
                        "ID of 0x" +
                        Long.toHexString(resp.getFullBlockReportLeaseId()) +
                        ", but we already have a lease ID of 0x" +
                        Long.toHexString(fullBlockReportLeaseId) + ". " +
                        "Overwriting old lease ID.");
              }
              fullBlockReportLeaseId = resp.getFullBlockReportLeaseId();
            }
            dn.getMetrics().addHeartbeat(scheduler.monotonicNow() - startTime);

            // If the state of this NN has changed (eg STANDBY->ACTIVE)
            // then let the BPOfferService update itself.
            //
            // Important that this happens before processCommand below,
            // since the first heartbeat to a new active might have commands
            // that we should actually process.
            bpos.updateActorStatesFromHeartbeat(
                this, resp.getNameNodeHaState());
            state = resp.getNameNodeHaState().getState();

            if (state == HAServiceState.ACTIVE) {
              handleRollingUpgradeStatus(resp);
            }

            long startProcessCommands = monotonicNow();
            if (!processCommand(resp.getCommands()))
              continue;
            long endProcessCommands = monotonicNow();
            if (endProcessCommands - startProcessCommands > 2000) {
              LOG.info("Took " + (endProcessCommands - startProcessCommands)
                  + "ms to process " + resp.getCommands().length
                  + " commands from NN");
            }
          }
        }
        if (!dn.areIBRDisabledForTests() &&
            (ibrManager.sendImmediately()|| sendHeartbeat)) {
          ibrManager.sendIBRs(bpNamenode, bpRegistration,
              bpos.getBlockPoolId());
        }

        List<DatanodeCommand> cmds = null;
        boolean forceFullBr =
            scheduler.forceFullBlockReport.getAndSet(false);
        if (forceFullBr) {
          LOG.info("Forcing a full block report to " + nnAddr);
        }
        if ((fullBlockReportLeaseId != 0) || forceFullBr) {
          cmds = blockReport(fullBlockReportLeaseId);
          fullBlockReportLeaseId = 0;
        }
        processCommand(cmds == null ? null : cmds.toArray(new DatanodeCommand[cmds.size()]));

        if (!dn.areCacheReportsDisabledForTests()) {
          DatanodeCommand cmd = cacheReport();
          processCommand(new DatanodeCommand[]{ cmd });
        }

        if (sendHeartbeat) {
          dn.getMetrics().addHeartbeatTotal(
              scheduler.monotonicNow() - startTime);
        }

        // There is no work to do;  sleep until hearbeat timer elapses, 
        // or work arrives, and then iterate again.
        ibrManager.waitTillNextIBR(scheduler.getHeartbeatWaitTime());
      } catch(RemoteException re) {
        String reClass = re.getClassName();
        if (UnregisteredNodeException.class.getName().equals(reClass) ||
            DisallowedDatanodeException.class.getName().equals(reClass) ||
            IncorrectVersionException.class.getName().equals(reClass)) {
          LOG.warn(this + " is shutting down", re);
          shouldServiceRun = false;
          return;
        }
        LOG.warn("RemoteException in offerService", re);
        sleepAfterException();
      } catch (IOException e) {
        LOG.warn("IOException in offerService", e);
        sleepAfterException();
      } finally {
        DataNodeFaultInjector.get().endOfferService();
      }
      processQueueMessages();
    } // while (shouldRun())
  } // offerService

 HeartbeatResponse sendHeartBeat(boolean requestBlockReportLease)
      throws IOException {
    ...
    //通过nn的客户端代理调用发送心跳的方法
    HeartbeatResponse response = bpNamenode.sendHeartbeat(bpRegistration,
        reports,
        dn.getFSDataset().getCacheCapacity(), //发送缓存容量
        dn.getFSDataset().getCacheUsed(), //发送已使用容量
        dn.getXmitsInProgress(),
        dn.getXceiverCount(),
        numFailedVolumes,
        volumeFailureSummary,
        requestBlockReportLease,
        slowPeers,
        slowDisks);

    if (outliersReportDue) {
      // If the report was due and successfully sent, schedule the next one.
      scheduler.scheduleNextOutlierReport();
    }
    return response;
  }

NamenodeRpcServer.java

@Override // DatanodeProtocol
  public HeartbeatResponse sendHeartbeat(DatanodeRegistration nodeReg,
      StorageReport[] report, long dnCacheCapacity, long dnCacheUsed,
      int xmitsInProgress, int xceiverCount,
      int failedVolumes, VolumeFailureSummary volumeFailureSummary,
      boolean requestFullBlockReportLease,
      @Nonnull SlowPeerReports slowPeers,
      @Nonnull SlowDiskReports slowDisks) throws IOException {
    checkNNStartup();
    verifyRequest(nodeReg);
    return namesystem.handleHeartbeat(nodeReg, report,
        dnCacheCapacity, dnCacheUsed, xceiverCount, xmitsInProgress,
        failedVolumes, volumeFailureSummary, requestFullBlockReportLease,
        slowPeers, slowDisks);
  }

FSNamesystem.java


HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
      StorageReport[] reports, long cacheCapacity, long cacheUsed,
      int xceiverCount, int xmitsInProgress, int failedVolumes,
      VolumeFailureSummary volumeFailureSummary,
      boolean requestFullBlockReportLease,
      @Nonnull SlowPeerReports slowPeers,
      @Nonnull SlowDiskReports slowDisks) throws IOException {
    readLock();
    try {
      //get datanode commands
      final int maxTransfer = blockManager.getMaxReplicationStreams()
          - xmitsInProgress;
      //处理dn发送过来的心跳
      DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
          nodeReg, reports, getBlockPoolId(), cacheCapacity, cacheUsed,
          xceiverCount, maxTransfer, failedVolumes, volumeFailureSummary,
          slowPeers, slowDisks);
      long blockReportLeaseId = 0;
      if (requestFullBlockReportLease) {
        blockReportLeaseId =  blockManager.requestBlockReportLeaseId(nodeReg);
      }
      //create ha status
      final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat(
          haContext.getState().getServiceState(),
          getFSImage().getCorrectLastAppliedOrWrittenTxId());
      //响应dn的心跳,里面包括要发送给dn的命令
      return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo,
          blockReportLeaseId);
    } finally {
      readUnlock("handleHeartbeat");
    }
  }

DatanodeManager.java

public DatanodeCommand[] handleHeartbeat(DatanodeRegistration nodeReg,
 StorageReport[] reports, final String blockPoolId,
 long cacheCapacity, long cacheUsed, int xceiverCount, 
 int maxTransfers, int failedVolumes,
 VolumeFailureSummary volumeFailureSummary,
 @Nonnull SlowPeerReports slowPeers,
 @Nonnull SlowDiskReports slowDisks) throws IOException {
 ... ...
 heartbeatManager.updateHeartbeat(nodeinfo, reports, cacheCapacity,
 cacheUsed, xceiverCount, failedVolumes, volumeFailureSummary);
 ... ... 
}

HeartbeatManager.java

synchronized void updateHeartbeat(final DatanodeDescriptor node,
      StorageReport[] reports, long cacheCapacity, long cacheUsed,
      int xceiverCount, int failedVolumes,
      VolumeFailureSummary volumeFailureSummary) {
    stats.subtract(node);
    blockManager.updateHeartbeat(node, reports, cacheCapacity, cacheUsed,
        xceiverCount, failedVolumes, volumeFailureSummary);
    stats.add(node);
  }

BlockManager.java

void updateHeartbeat(DatanodeDescriptor node, StorageReport[] reports,
      long cacheCapacity, long cacheUsed, int xceiverCount, int failedVolumes,
      VolumeFailureSummary volumeFailureSummary) {

    for (StorageReport report: reports) {
      providedStorageMap.updateStorage(node, report.getStorage());
    }
    node.updateHeartbeat(reports, cacheCapacity, cacheUsed, xceiverCount,
        failedVolumes, volumeFailureSummary);
  }

DatanodeDescriptor.java 

void updateHeartbeat(StorageReport[] reports, long cacheCapacity,
      long cacheUsed, int xceiverCount, int volFailures,
      VolumeFailureSummary volumeFailureSummary) {
    updateHeartbeatState(reports, cacheCapacity, cacheUsed, xceiverCount,
        volFailures, volumeFailureSummary);
    heartbeatedSinceRegistration = true;
  }

void updateHeartbeatState(StorageReport[] reports, long cacheCapacity,
      long cacheUsed, int xceiverCount, int volFailures,
      VolumeFailureSummary volumeFailureSummary) {
    //更新存储状态
    updateStorageStats(reports, cacheCapacity, cacheUsed, xceiverCount,
        volFailures, volumeFailureSummary);
    //更新心跳时间
    setLastUpdate(Time.now());
    setLastUpdateMonotonic(Time.monotonicNow());
    rollBlocksScheduled(getLastUpdateMonotonic());
  }


 private void updateStorageStats(StorageReport[] reports, long cacheCapacity,
      long cacheUsed, int xceiverCount, int volFailures,
      VolumeFailureSummary volumeFailureSummary) {
    long totalCapacity = 0;
    long totalRemaining = 0;
    long totalBlockPoolUsed = 0;
    long totalDfsUsed = 0;
    long totalNonDfsUsed = 0;

    setCacheCapacity(cacheCapacity);
    setCacheUsed(cacheUsed);
    setXceiverCount(xceiverCount);
    this.volumeFailures = volFailures;
    this.volumeFailureSummary = volumeFailureSummary;
    for (StorageReport report : reports) {

      DatanodeStorageInfo storage =
          storageMap.get(report.getStorage().getStorageID());
      if (checkFailedStorages) {
        failedStorageInfos.remove(storage);
      }

      storage.receivedHeartbeat(report);
      // skip accounting for capacity of PROVIDED storages!
      if (StorageType.PROVIDED.equals(storage.getStorageType())) {
        continue;
      }

      totalCapacity += report.getCapacity();
      totalRemaining += report.getRemaining();
      totalBlockPoolUsed += report.getBlockPoolUsed();
      totalDfsUsed += report.getDfsUsed();
      totalNonDfsUsed += report.getNonDfsUsed();
    }

    // Update total metrics for the node.
    setCapacity(totalCapacity);
    setRemaining(totalRemaining);
    setBlockPoolUsed(totalBlockPoolUsed);
    setDfsUsed(totalDfsUsed);
    setNonDfsUsed(totalNonDfsUsed);
    if (checkFailedStorages) {
      updateFailedStorage(failedStorageInfos);
    }
    long storageMapSize;
    synchronized (storageMap) {
      storageMapSize = storageMap.size();
    }
    if (storageMapSize != reports.length) {
      pruneStorageMap(reports);
    }
  }

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

罗啰萝在努力

如果这篇文章对你有帮助,望支持

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值