How YARN handles an AM container failure

本文详细解析了YARN中资源管理的关键流程,包括节点心跳上报、容器状态更新及完成处理等过程。通过跟踪ResourceTrackerService中的nodeHeartbeat方法调用,展示了ResourceManager如何接收来自NodeManager的状态报告,并进一步说明了如何在FairScheduler中处理这些状态更新。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

ResourceTrackerService.nodeHeartbeat

If the AM container fails abruptly, the NodeManager reports it to the ResourceManager in its next heartbeat.

  // 4. Send status to RMNode, saving the latest response.
  // The node health status, container statuses and keep-alive applications
  // taken from remoteNodeStatus are wrapped, together with the heartbeat
  // response, in an RMNodeStatusEvent and handed to the RM dispatcher's
  // event handler.
    this.rmContext.getDispatcher().getEventHandler().handle(
        new RMNodeStatusEvent(nodeId, remoteNodeStatus.getNodeHealthStatus(),
            remoteNodeStatus.getContainersStatuses(), 
            remoteNodeStatus.getKeepAliveApplications(), nodeHeartBeatResponse));

    // The same response object that was attached to the event is returned.
    return nodeHeartBeatResponse;

#

The RMNodeStatusEvent is processed by RMNodeImpl

// Registers, for a node in RUNNING, the STATUS_UPDATE event with
// StatusUpdateWhenHealthyTransition; the listed target states are RUNNING
// and UNHEALTHY.
.addTransition(NodeState.RUNNING,
         EnumSet.of(NodeState.RUNNING, NodeState.UNHEALTHY),
         RMNodeEventType.STATUS_UPDATE, new StatusUpdateWhenHealthyTransition())

StatusUpdateWhenHealthyTransition

     // Hand the container statuses carried by the status event to the node.
     rmNode.handleContainerStatus(statusEvent.getContainers());

      // Only when the nextHeartBeat flag is set: clear it and dispatch a
      // NodeUpdateSchedulerEvent for this node.
      // NOTE(review): the flag is presumably set again once the scheduler has
      // consumed the update - confirm where it is reset.
      if(rmNode.nextHeartBeat) {
        rmNode.nextHeartBeat = false;
        rmNode.context.getDispatcher().getEventHandler().handle(
            new NodeUpdateSchedulerEvent(rmNode));
      }

FairScheduler handling of NodeUpdateSchedulerEvent

// NODE_UPDATE: events that are not a NodeUpdateSchedulerEvent are rejected
// with a RuntimeException; otherwise the event's RMNode is passed to
// nodeUpdate().
case NODE_UPDATE:
      if (!(event instanceof NodeUpdateSchedulerEvent)) {
        throw new RuntimeException("Unexpected event type: " + event);
      }
      NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event;
      nodeUpdate(nodeUpdatedEvent.getRMNode());
      break;

FairScheduler.nodeUpdate

/**
   * Processes a heartbeat from a node: pulls its pending container updates,
   * reports newly launched containers, completes the finished ones, then
   * attempts scheduling on that node and records how long the update took.
   *
   * @param nm the node whose heartbeat is being processed
   */
  private synchronized void nodeUpdate(RMNode nm) {
    final long startedAt = getClock().getTime();
    if (LOG.isDebugEnabled()) {
      LOG.debug("nodeUpdate: " + nm + " cluster capacity: " + clusterResource);
    }
    eventLog.log("HEARTBEAT", nm.getHostName());
    FSSchedulerNode schedulerNode = getFSSchedulerNode(nm.getNodeID());

    // Flatten every pulled update into one "launched" and one "finished" list.
    List<ContainerStatus> launched = new ArrayList<ContainerStatus>();
    List<ContainerStatus> finished = new ArrayList<ContainerStatus>();
    for (UpdatedContainerInfo update : nm.pullContainerUpdates()) {
      launched.addAll(update.getNewlyLaunchedContainers());
      finished.addAll(update.getCompletedContainers());
    }

    // Newly launched containers first ...
    for (ContainerStatus status : launched) {
      containerLaunchedOnNode(status.getContainerId(), schedulerNode);
    }

    // ... then the completed ones, each finished with a FINISHED event.
    for (ContainerStatus status : finished) {
      ContainerId finishedId = status.getContainerId();
      LOG.debug("Container FINISHED: " + finishedId);
      completedContainer(getRMContainer(finishedId), status,
          RMContainerEventType.FINISHED);
    }

    // With continuous scheduling enabled, a heartbeat triggers scheduling only
    // when it reported completed containers; otherwise it always does.
    if (!continuousSchedulingEnabled || !finished.isEmpty()) {
      attemptScheduling(schedulerNode);
    }

    fsOpDurations.addNodeUpdateDuration(getClock().getTime() - startedAt);
  }

FairScheduler.completedContainer

/**
   * Clean up a completed container.
   *
   * <p>Returns early (after logging) when {@code rmContainer} is null or when
   * no current application attempt is found for the container. Otherwise a
   * RESERVED container is unreserved on its node, and any other container is
   * reported to the application attempt, released on its node, and the root
   * queue metrics are refreshed.
   *
   * <p>The node lookup result is checked for null before it is used, so a
   * container completing for a node that is no longer tracked does not cause
   * a NullPointerException; the application-side cleanup still runs.
   *
   * @param rmContainer the completed container, may be null
   * @param containerStatus status passed on to the application attempt
   * @param event the event that caused the completion
   */
  @Override
  protected synchronized void completedContainer(RMContainer rmContainer,
      ContainerStatus containerStatus, RMContainerEventType event) {
    if (rmContainer == null) {
      LOG.info("Null container completed...");
      return;
    }

    Container container = rmContainer.getContainer();

    // Get the application for the finished container
    FSAppAttempt application =
        getCurrentAttemptForContainer(container.getId());
    ApplicationId appId =
        container.getId().getApplicationAttemptId().getApplicationId();
    if (application == null) {
      LOG.info("Container " + container + " of" +
          " unknown application attempt " + appId +
          " completed with event " + event);
      return;
    }

    // Get the node on which the container was allocated.
    // NOTE(review): assumes getFSSchedulerNode returns null for a node that
    // has already been removed - confirm against its implementation.
    FSSchedulerNode node = getFSSchedulerNode(container.getNodeId());

    if (rmContainer.getState() == RMContainerState.RESERVED) {
      if (node != null) {
        application.unreserve(rmContainer.getReservedPriority(), node);
      } else if (LOG.isDebugEnabled()) {
        LOG.debug("Skipping unreserve on removed node: "
            + container.getNodeId());
      }
    } else {
      // Application-side bookkeeping does not depend on the node being known.
      application.containerCompleted(rmContainer, containerStatus, event);
      if (node != null) {
        node.releaseContainer(container);
      } else if (LOG.isDebugEnabled()) {
        LOG.debug("Skipping container release on removed node: "
            + container.getNodeId());
      }
      updateRootQueueMetrics();
    }

    LOG.info("Application attempt " + application.getApplicationAttemptId()
        + " released container " + container.getId() + " on node: " + node
        + " with event: " + event);
  }

FSAppAttempt.containerCompleted

/**
   * Records the completion of a container for this application attempt:
   * notifies the container itself, drops it from the attempt's container
   * collections, writes an audit entry and gives its resources back to the
   * queue metrics and the attempt's current consumption.
   *
   * @param rmContainer the container that completed
   * @param containerStatus status forwarded to the container's finished event
   * @param event the event type forwarded to the container's finished event
   */
public synchronized void containerCompleted(RMContainer rmContainer,
      ContainerStatus containerStatus, RMContainerEventType event) {

    Container finishedContainer = rmContainer.getContainer();
    ContainerId finishedId = finishedContainer.getId();

    // It may still be sitting in the newly-allocated list; drop it if so.
    newlyAllocatedContainers.remove(rmContainer);

    // Tell the container it has finished, then log the state it ended up in.
    RMContainerFinishedEvent finishedEvent =
        new RMContainerFinishedEvent(finishedId, containerStatus, event);
    rmContainer.handle(finishedEvent);
    LOG.info("Completed container: " + rmContainer.getContainerId() +
        " in state: " + rmContainer.getState() + " event:" + event);

    // No longer a live container of this attempt.
    liveContainers.remove(rmContainer.getContainerId());

    RMAuditLogger.logSuccess(getUser(), AuditConstants.RELEASE_CONTAINER,
        "SchedulerApp", getApplicationId(), finishedId);

    // Give the container's resources back in the usage accounting.
    Resource released = rmContainer.getContainer().getResource();
    queue.getMetrics().releaseResources(getUser(), 1, released);
    Resources.subtractFrom(currentConsumption, released);

    // A completed container must not stay in the preemption map.
    preemptionMap.remove(rmContainer);

    // Invalidate the cached resource utilization metrics.
    lastMemoryAggregateAllocationUpdateTime = -1;
  }

RMContainerImpl.FinishedTransition handles RMContainerFinishedEvent

/**
     * Handles a finished-container event: stores the finish time and the
     * reported status on the container, updates attempt metrics, sends an
     * RMAppAttemptContainerFinishedEvent through the container's event
     * handler, and records the finish with the application history writer
     * and (conditionally) the system metrics publisher.
     *
     * @param container the container that finished
     * @param event the event, cast to RMContainerFinishedEvent
     */
@Override
    public void transition(RMContainerImpl container, RMContainerEvent event) {
      RMContainerFinishedEvent finishedEvent = (RMContainerFinishedEvent) event;

      // Record when the container finished and the status reported for it.
      container.finishTime = System.currentTimeMillis();
      container.finishedStatus = finishedEvent.getRemoteContainerStatus();
      // container.getContainer() can return null when a RMContainer is a
      // reserved container
      updateAttemptMetrics(container);

      // Inform AppAttempt
      container.eventHandler.handle(new RMAppAttemptContainerFinishedEvent(
        container.appAttemptId, finishedEvent.getRemoteContainerStatus(),
          container.getAllocatedNode()));

      // Record the finished container with the application history writer.
      container.rmContext.getRMApplicationHistoryWriter().containerFinished(
        container);

      // Read the setting that controls whether non-AM containers are
      // published as well.
      boolean saveNonAMContainerMetaInfo =
          container.rmContext.getYarnConfiguration().getBoolean(
              YarnConfiguration
                .APPLICATION_HISTORY_SAVE_NON_AM_CONTAINER_META_INFO,
              YarnConfiguration
                .DEFAULT_APPLICATION_HISTORY_SAVE_NON_AM_CONTAINER_META_INFO);

      // AM containers are always published; others only when the setting is on.
      if (saveNonAMContainerMetaInfo || container.isAMContainer()) {
        container.rmContext.getSystemMetricsPublisher().containerFinished(
            container, container.finishTime);
      }

    }

RMAppAttemptImpl: the transition that handles RMAppAttemptContainerFinishedEvent

 /**
     * Handles a finished container reported to the application attempt.
     * A non-AM container is only recorded as just-finished and the state is
     * left unchanged; when the AM container itself finished, the attempt is
     * stored as FAILED and moves to FINAL_SAVING.
     *
     * @param appAttempt the attempt that owns the container
     * @param event the event, cast to RMAppAttemptContainerFinishedEvent
     * @return the current state, or FINAL_SAVING when the AM container finished
     */
    @Override
    public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
        RMAppAttemptEvent event) {

      RMAppAttemptContainerFinishedEvent finishedEvent =
          (RMAppAttemptContainerFinishedEvent) event;
      ContainerStatus finishedStatus = finishedEvent.getContainerStatus();

      // Not the AM container (or no AM container known yet): just remember it
      // so that it can be acked to the NM, and stay in the current state.
      if (appAttempt.masterContainer == null
          || !appAttempt.masterContainer.getId().equals(
              finishedStatus.getContainerId())) {
        addJustFinishedContainer(appAttempt, finishedEvent);
        return this.currentState;
      }

      // The finished container is the AM container, so the attempt fails.
      appAttempt.sendAMContainerToNM(appAttempt, finishedEvent);

      // Remember the follow-up transition and save the final attempt state.
      appAttempt.rememberTargetTransitionsAndStoreState(event,
          transitionToDo, RMAppAttemptState.FAILED, RMAppAttemptState.FAILED);
      return RMAppAttemptState.FINAL_SAVING;
    }
  }

RMAppAttemptImpl.rememberTargetTransitionsAndStoreState

/**
   * Remembers the transition to run after the final state has been saved and
   * sends the attempt's final data to the state store.
   *
   * <p>Diagnostics, final tracking URL, final application status and exit
   * status are derived from the type of the triggering event, packed into an
   * ApplicationAttemptStateData and passed to
   * RMStateStore.updateApplicationAttemptState.
   *
   * @param event the event that triggered the final save
   * @param transitionToDo passed through to rememberTargetTransitions
   * @param targetFinalState passed through to rememberTargetTransitions
   * @param stateToBeStored the state written to the state store; also passed
   *        to the tracking-URL setters
   */
private void rememberTargetTransitionsAndStoreState(RMAppAttemptEvent event,
      Object transitionToDo, RMAppAttemptState targetFinalState,
      RMAppAttemptState stateToBeStored) {

    rememberTargetTransitions(event, transitionToDo, targetFinalState);
    // Keep the state the attempt was in before the final save started.
    stateBeforeFinalSaving = getState();

    // As of today, finalState, diagnostics, final-tracking-url and
    // finalAppStatus are the only things that we store into the StateStore
    // AFTER the initial saving on app-attempt-start
    // These fields can be visible from outside only after they are saved in
    // StateStore
    String diags = null;

    // don't leave the tracking URL pointing to a non-existent AM
    if (conf.getBoolean(YarnConfiguration.APPLICATION_HISTORY_ENABLED,
            YarnConfiguration.DEFAULT_APPLICATION_HISTORY_ENABLED)) {
      setTrackingUrlToAHSPage(stateToBeStored);
    } else {
      setTrackingUrlToRMAppPage(stateToBeStored);
    }
    String finalTrackingUrl = getOriginalTrackingUrl();
    FinalApplicationStatus finalStatus = null;
    int exitStatus = ContainerExitStatus.INVALID;
    // Fill in diagnostics (and, where available, URL / status / exit code)
    // according to the kind of event that ended the attempt.
    switch (event.getType()) {
    case LAUNCH_FAILED:
      diags = event.getDiagnosticMsg();
      break;
    case REGISTERED:
      diags = getUnexpectedAMRegisteredDiagnostics();
      break;
    case UNREGISTERED:
      RMAppAttemptUnregistrationEvent unregisterEvent =
          (RMAppAttemptUnregistrationEvent) event;
      diags = unregisterEvent.getDiagnosticMsg();
      // reset finalTrackingUrl to url sent by am
      finalTrackingUrl = sanitizeTrackingUrl(unregisterEvent.getFinalTrackingUrl());
      finalStatus = unregisterEvent.getFinalApplicationStatus();
      break;
    case CONTAINER_FINISHED:
      // This is the only case that overrides the INVALID exit status: it is
      // taken from the finished container's status.
      RMAppAttemptContainerFinishedEvent finishEvent =
          (RMAppAttemptContainerFinishedEvent) event;
      diags = getAMContainerCrashedDiagnostics(finishEvent);
      exitStatus = finishEvent.getContainerStatus().getExitStatus();
      break;
    case KILL:
      // No diagnostics are recorded for a kill.
      break;
    case EXPIRE:
      diags = getAMExpiredDiagnostics(event);
      break;
    default:
      break;
    }
    AggregateAppResourceUsage resUsage =
        this.attemptMetrics.getAggregateAppResourceUsage();
    RMStateStore rmStore = rmContext.getStateStore();
    // The finish time is stamped here, before the state data is built.
    setFinishTime(System.currentTimeMillis());

    ApplicationAttemptStateData attemptState =
        ApplicationAttemptStateData.newInstance(
            applicationAttemptId,  getMasterContainer(),
            rmStore.getCredentialsFromAppAttempt(this),
            startTime, stateToBeStored, finalTrackingUrl, diags,
            finalStatus, exitStatus,
          getFinishTime(), resUsage.getMemorySeconds(),
          resUsage.getVcoreSeconds());
    // NOTE(review): targetedFinalState is not the targetFinalState parameter;
    // presumably a field set by rememberTargetTransitions - confirm.
    LOG.info("Updating application attempt " + applicationAttemptId
        + " with final state: " + targetedFinalState + ", and exit status: "
        + exitStatus);
    rmStore.updateApplicationAttemptState(attemptState);
  }

RMStateStore

  /**
   * Wraps the given attempt state in an RMStateUpdateAppAttemptEvent and
   * hands it to this store's dispatcher.
   *
   * @param attemptState the attempt data to be updated in the store
   */
  @SuppressWarnings("unchecked")
  public void updateApplicationAttemptState(
      ApplicationAttemptStateData attemptState) {
    RMStateUpdateAppAttemptEvent updateEvent =
        new RMStateUpdateAppAttemptEvent(attemptState);
    dispatcher.getEventHandler().handle(updateEvent);
  }

RMAppAttemptImpl.FINAL_SAVING state transition

  // Registers, for an attempt in FINAL_SAVING, the ATTEMPT_UPDATE_SAVED event
  // with FinalStateSavedTransition; the listed target states are FINISHING,
  // FAILED, KILLED and FINISHED.
  .addTransition(RMAppAttemptState.FINAL_SAVING,
          EnumSet.of(RMAppAttemptState.FINISHING, RMAppAttemptState.FAILED,
            RMAppAttemptState.KILLED, RMAppAttemptState.FINISHED),
            RMAppAttemptEventType.ATTEMPT_UPDATE_SAVED,
            new FinalStateSavedTransition())
 // APP_ATTEMPT_REMOVED: events that are not an AppAttemptRemovedSchedulerEvent
 // are rejected with a RuntimeException; otherwise removeApplicationAttempt is
 // called with the attempt id, the final attempt state and the
 // keep-containers-across-attempts flag taken from the event.
 // NOTE(review): this case looks like part of a scheduler's event switch
 // rather than RMAppAttemptImpl - confirm which class it was taken from.
 case APP_ATTEMPT_REMOVED:
      if (!(event instanceof AppAttemptRemovedSchedulerEvent)) {
        throw new RuntimeException("Unexpected event type: " + event);
      }
      AppAttemptRemovedSchedulerEvent appAttemptRemovedEvent =
          (AppAttemptRemovedSchedulerEvent) event;
      removeApplicationAttempt(
          appAttemptRemovedEvent.getApplicationAttemptID(),
          appAttemptRemovedEvent.getFinalAttemptState(),
          appAttemptRemovedEvent.getKeepContainersAcrossAppAttempts());
      break;
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值