ResourceTrackerService.nodeHeartbeat
If the AM container fails abruptly, the NodeManager reports it to the ResourceManager through its heartbeat.
// 4. Send status to RMNode, saving the latest response.
this.rmContext.getDispatcher().getEventHandler().handle(
new RMNodeStatusEvent(nodeId, remoteNodeStatus.getNodeHealthStatus(),
remoteNodeStatus.getContainersStatuses(),
remoteNodeStatus.getKeepAliveApplications(), nodeHeartBeatResponse));
return nodeHeartBeatResponse;
#
RMNodeStatusEvent is processed at RMNodeImpl
.addTransition(NodeState.RUNNING,
EnumSet.of(NodeState.RUNNING, NodeState.UNHEALTHY),
RMNodeEventType.STATUS_UPDATE, new StatusUpdateWhenHealthyTransition())
StatusUpdateWhenHealthyTransition
rmNode.handleContainerStatus(statusEvent.getContainers());
if(rmNode.nextHeartBeat) {
rmNode.nextHeartBeat = false;
rmNode.context.getDispatcher().getEventHandler().handle(
new NodeUpdateSchedulerEvent(rmNode));
}
FairScheduler handling of NodeUpdateSchedulerEvent
case NODE_UPDATE:
if (!(event instanceof NodeUpdateSchedulerEvent)) {
throw new RuntimeException("Unexpected event type: " + event);
}
NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event;
nodeUpdate(nodeUpdatedEvent.getRMNode());
break;
FairScheduler.nodeUpdate
/**
 * Process a heartbeat update from a node.
 *
 * <p>Pulls the pending container updates from the node, reports every newly
 * launched and every completed container to the scheduler, and then
 * (conditionally, see below) attempts to schedule on the node. The elapsed
 * time, measured with {@code getClock()}, is recorded in
 * {@code fsOpDurations}.
 *
 * @param nm the node that sent the heartbeat
 */
private synchronized void nodeUpdate(RMNode nm) {
  long start = getClock().getTime();
  if (LOG.isDebugEnabled()) {
    LOG.debug("nodeUpdate: " + nm + " cluster capacity: " + clusterResource);
  }
  eventLog.log("HEARTBEAT", nm.getHostName());
  FSSchedulerNode node = getFSSchedulerNode(nm.getNodeID());

  // Flatten the per-update lists into one list of launched containers and
  // one list of completed containers.
  List<UpdatedContainerInfo> containerInfoList = nm.pullContainerUpdates();
  List<ContainerStatus> newlyLaunchedContainers = new ArrayList<ContainerStatus>();
  List<ContainerStatus> completedContainers = new ArrayList<ContainerStatus>();
  for (UpdatedContainerInfo containerInfo : containerInfoList) {
    newlyLaunchedContainers.addAll(containerInfo.getNewlyLaunchedContainers());
    completedContainers.addAll(containerInfo.getCompletedContainers());
  }

  // Processing the newly launched containers
  for (ContainerStatus launchedContainer : newlyLaunchedContainers) {
    containerLaunchedOnNode(launchedContainer.getContainerId(), node);
  }

  // Process completed containers
  for (ContainerStatus completedContainer : completedContainers) {
    ContainerId containerId = completedContainer.getContainerId();
    // Guarded like the debug statement above so the message is only built
    // when debug logging is actually on.
    if (LOG.isDebugEnabled()) {
      LOG.debug("Container FINISHED: " + containerId);
    }
    completedContainer(getRMContainer(containerId),
        completedContainer, RMContainerEventType.FINISHED);
  }

  // With continuous scheduling enabled, a heartbeat only triggers a
  // scheduling attempt when it reported at least one completed container;
  // otherwise every heartbeat triggers one.
  // NOTE(review): presumably another code path schedules the node in the
  // continuous case - confirm against the rest of FairScheduler.
  if (!continuousSchedulingEnabled || !completedContainers.isEmpty()) {
    attemptScheduling(node);
  }

  long duration = getClock().getTime() - start;
  fsOpDurations.addNodeUpdateDuration(duration);
}
FairScheduler.completedContainer
/**
 * Clean up a completed container.
 *
 * <p>Logs and returns without further work when {@code rmContainer} is
 * {@code null} or when no current application attempt is found for the
 * container. Otherwise a container in the {@code RESERVED} state has its
 * reservation dropped, while any other container is reported to the attempt
 * as completed, released from its node, and followed by a root queue
 * metrics update.
 *
 * @param rmContainer the completed container; may be {@code null}
 * @param containerStatus the status passed on to the application attempt
 * @param event the event type that caused the completion
 */
@Override
protected synchronized void completedContainer(RMContainer rmContainer,
    ContainerStatus containerStatus, RMContainerEventType event) {
  if (rmContainer == null) {
    LOG.info("Null container completed...");
    return;
  }

  Container finished = rmContainer.getContainer();
  ContainerId finishedId = finished.getId();

  // Look up the attempt that currently owns the finished container.
  FSAppAttempt attempt = getCurrentAttemptForContainer(finishedId);
  ApplicationId appId =
      finishedId.getApplicationAttemptId().getApplicationId();
  if (attempt == null) {
    LOG.info("Container " + finished + " of" +
        " unknown application attempt " + appId +
        " completed with event " + event);
    return;
  }

  // The node the container was allocated on.
  FSSchedulerNode node = getFSSchedulerNode(finished.getNodeId());

  boolean reservedOnly = rmContainer.getState() == RMContainerState.RESERVED;
  if (!reservedOnly) {
    attempt.containerCompleted(rmContainer, containerStatus, event);
    node.releaseContainer(finished);
    updateRootQueueMetrics();
  } else {
    attempt.unreserve(rmContainer.getReservedPriority(), node);
  }

  LOG.info("Application attempt " + attempt.getApplicationAttemptId()
      + " released container " + finishedId + " on node: " + node
      + " with event: " + event);
}
FSAppAttempt.containerCompleted
/**
 * Updates this attempt's bookkeeping for a container that has completed.
 *
 * <p>In order: removes the container from the newly allocated list, delivers
 * an {@code RMContainerFinishedEvent} to the container, removes it from the
 * live containers, writes an audit log entry, releases its resources from
 * the queue metrics and from the current consumption, removes it from the
 * preemption map, and resets
 * {@code lastMemoryAggregateAllocationUpdateTime} to {@code -1}.
 *
 * @param rmContainer the container that completed
 * @param containerStatus the status carried in the finished event
 * @param event the event type carried in the finished event
 */
public synchronized void containerCompleted(RMContainer rmContainer,
    ContainerStatus containerStatus, RMContainerEventType event) {
  Container finished = rmContainer.getContainer();
  ContainerId finishedId = finished.getId();

  // It may still be sitting in the newly allocated list; drop it if so.
  newlyAllocatedContainers.remove(rmContainer);

  // Tell the container itself that it has finished.
  RMContainerFinishedEvent finishedEvent =
      new RMContainerFinishedEvent(finishedId, containerStatus, event);
  rmContainer.handle(finishedEvent);
  LOG.info("Completed container: " + rmContainer.getContainerId() +
      " in state: " + rmContainer.getState() + " event:" + event);

  // No longer a live container of this attempt.
  liveContainers.remove(rmContainer.getContainerId());

  RMAuditLogger.logSuccess(getUser(),
      AuditConstants.RELEASE_CONTAINER, "SchedulerApp",
      getApplicationId(), finishedId);

  // Give the container's resources back in the usage metrics.
  Resource released = rmContainer.getContainer().getResource();
  queue.getMetrics().releaseResources(getUser(), 1, released);
  Resources.subtractFrom(currentConsumption, released);

  // A completed container no longer belongs in the preemption map.
  preemptionMap.remove(rmContainer);

  // Clear resource utilization metrics cache.
  lastMemoryAggregateAllocationUpdateTime = -1;
}
RMContainerImpl.FinishedTransition handles RMContainerFinishedEvent
/**
 * Handles an {@code RMContainerFinishedEvent} for the given container.
 *
 * <p>Stamps the finish time and the remote container status on the
 * container, updates the attempt metrics, forwards an
 * {@code RMAppAttemptContainerFinishedEvent} to the container's event
 * handler, and records the finish with the application history writer. The
 * system metrics publisher is notified only for AM containers, unless the
 * configuration asks for non-AM container meta info to be saved as well.
 *
 * @param container the container that finished
 * @param event the triggering event; cast to {@code RMContainerFinishedEvent}
 */
@Override
public void transition(RMContainerImpl container, RMContainerEvent event) {
  RMContainerFinishedEvent finished = (RMContainerFinishedEvent) event;

  container.finishTime = System.currentTimeMillis();
  container.finishedStatus = finished.getRemoteContainerStatus();

  // Inform AppAttempt
  // container.getContainer() can return null when a RMContainer is a
  // reserved container
  updateAttemptMetrics(container);
  RMAppAttemptContainerFinishedEvent attemptEvent =
      new RMAppAttemptContainerFinishedEvent(
          container.appAttemptId, finished.getRemoteContainerStatus(),
          container.getAllocatedNode());
  container.eventHandler.handle(attemptEvent);

  container.rmContext.getRMApplicationHistoryWriter()
      .containerFinished(container);

  boolean saveNonAMContainerMetaInfo =
      container.rmContext.getYarnConfiguration().getBoolean(
          YarnConfiguration
              .APPLICATION_HISTORY_SAVE_NON_AM_CONTAINER_META_INFO,
          YarnConfiguration
              .DEFAULT_APPLICATION_HISTORY_SAVE_NON_AM_CONTAINER_META_INFO);

  // Skip publishing for non-AM containers unless explicitly configured.
  if (!saveNonAMContainerMetaInfo && !container.isAMContainer()) {
    return;
  }
  container.rmContext.getSystemMetricsPublisher().containerFinished(
      container, container.finishTime);
}
RMAppAttemptImpl transition that handles RMAppAttemptContainerFinishedEvent
/**
 * Handles an {@code RMAppAttemptContainerFinishedEvent}.
 *
 * <p>If the finished container is the attempt's master (AM) container, the
 * attempt is failed: the follow-up transition is remembered, the
 * {@code FAILED} state is handed to the state store, and the attempt moves
 * to {@code FINAL_SAVING}. For any other container the finished container
 * is only recorded and the state stays at {@code this.currentState}.
 *
 * @param appAttempt the attempt the finished container belongs to
 * @param event the triggering event; cast to
 *     {@code RMAppAttemptContainerFinishedEvent}
 * @return {@code FINAL_SAVING} when the AM container finished, otherwise
 *     {@code this.currentState}
 */
@Override
public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
    RMAppAttemptEvent event) {
  RMAppAttemptContainerFinishedEvent finishedEvent =
      (RMAppAttemptContainerFinishedEvent) event;
  ContainerStatus finishedStatus = finishedEvent.getContainerStatus();

  // Is the finished container the AM container of this attempt?
  boolean amContainerFinished = appAttempt.masterContainer != null
      && appAttempt.masterContainer.getId().equals(
          finishedStatus.getContainerId());

  if (!amContainerFinished) {
    // Add all finished containers so that they can be acked to NM
    addJustFinishedContainer(appAttempt, finishedEvent);
    return this.currentState;
  }

  // The AM container is gone, so the attempt fails.
  appAttempt.sendAMContainerToNM(appAttempt, finishedEvent);

  // Remember the follow up transition and save the final attempt state.
  appAttempt.rememberTargetTransitionsAndStoreState(event,
      transitionToDo, RMAppAttemptState.FAILED, RMAppAttemptState.FAILED);
  return RMAppAttemptState.FINAL_SAVING;
}
}
RMAppAttemptImpl.rememberTargetTransitionsAndStoreState
/**
 * Remembers the transition to perform later and hands the attempt's final
 * data to the state store.
 *
 * <p>The diagnostics, final tracking URL, final application status and exit
 * status that get stored depend on the type of {@code event}; event types
 * without a dedicated case store no diagnostics, the original tracking URL,
 * a {@code null} final status and {@code ContainerExitStatus.INVALID}.
 *
 * @param event the event that is ending the attempt
 * @param transitionToDo passed through to {@code rememberTargetTransitions}
 * @param targetFinalState passed through to {@code rememberTargetTransitions}
 * @param stateToBeStored the attempt state written into the stored data
 */
private void rememberTargetTransitionsAndStoreState(RMAppAttemptEvent event,
    Object transitionToDo, RMAppAttemptState targetFinalState,
    RMAppAttemptState stateToBeStored) {
  rememberTargetTransitions(event, transitionToDo, targetFinalState);
  stateBeforeFinalSaving = getState();

  // As of today, finalState, diagnostics, final-tracking-url and
  // finalAppStatus are the only things that we store into the StateStore
  // AFTER the initial saving on app-attempt-start
  // These fields can be visible from outside only after they are saved in
  // StateStore

  // don't leave the tracking URL pointing to a non-existent AM
  boolean historyEnabled = conf.getBoolean(
      YarnConfiguration.APPLICATION_HISTORY_ENABLED,
      YarnConfiguration.DEFAULT_APPLICATION_HISTORY_ENABLED);
  if (historyEnabled) {
    setTrackingUrlToAHSPage(stateToBeStored);
  } else {
    setTrackingUrlToRMAppPage(stateToBeStored);
  }

  // Defaults, overridden below for the event types that carry more detail.
  String diagMessage = null;
  String urlToStore = getOriginalTrackingUrl();
  FinalApplicationStatus reportedFinalStatus = null;
  int amExitStatus = ContainerExitStatus.INVALID;

  switch (event.getType()) {
    case LAUNCH_FAILED:
      diagMessage = event.getDiagnosticMsg();
      break;
    case REGISTERED:
      diagMessage = getUnexpectedAMRegisteredDiagnostics();
      break;
    case UNREGISTERED: {
      RMAppAttemptUnregistrationEvent unregistered =
          (RMAppAttemptUnregistrationEvent) event;
      diagMessage = unregistered.getDiagnosticMsg();
      // reset the stored tracking URL to the url sent by the AM
      urlToStore = sanitizeTrackingUrl(unregistered.getFinalTrackingUrl());
      reportedFinalStatus = unregistered.getFinalApplicationStatus();
      break;
    }
    case CONTAINER_FINISHED: {
      RMAppAttemptContainerFinishedEvent containerFinished =
          (RMAppAttemptContainerFinishedEvent) event;
      diagMessage = getAMContainerCrashedDiagnostics(containerFinished);
      amExitStatus = containerFinished.getContainerStatus().getExitStatus();
      break;
    }
    case EXPIRE:
      diagMessage = getAMExpiredDiagnostics(event);
      break;
    case KILL:
    default:
      // Nothing beyond the defaults.
      break;
  }

  AggregateAppResourceUsage resourceUsage =
      this.attemptMetrics.getAggregateAppResourceUsage();
  RMStateStore attemptStore = rmContext.getStateStore();

  // The finish time must be set before it is read back into the stored data.
  setFinishTime(System.currentTimeMillis());
  ApplicationAttemptStateData finalData =
      ApplicationAttemptStateData.newInstance(
          applicationAttemptId, getMasterContainer(),
          attemptStore.getCredentialsFromAppAttempt(this),
          startTime, stateToBeStored, urlToStore, diagMessage,
          reportedFinalStatus, amExitStatus,
          getFinishTime(), resourceUsage.getMemorySeconds(),
          resourceUsage.getVcoreSeconds());

  // NOTE(review): targetedFinalState is not a parameter of this method; it is
  // presumably a field set by rememberTargetTransitions above - confirm.
  LOG.info("Updating application attempt " + applicationAttemptId
      + " with final state: " + targetedFinalState + ", and exit status: "
      + amExitStatus);
  attemptStore.updateApplicationAttemptState(finalData);
}
RMStateStore
/**
 * Requests an update of the stored state of an application attempt.
 *
 * <p>This method only wraps {@code attemptState} in an
 * {@code RMStateUpdateAppAttemptEvent} and passes it to the dispatcher's
 * event handler; the store update itself is not performed here.
 * NOTE(review): presumably the event is processed asynchronously and the
 * attempt is notified once the update has been saved - confirm against the
 * dispatcher and store implementation.
 *
 * @param attemptState the attempt data to hand to the state store
 */
@SuppressWarnings("unchecked")
public void updateApplicationAttemptState(
ApplicationAttemptStateData attemptState) {
// Hand the update over as an event rather than writing to the store here.
dispatcher.getEventHandler().handle(
new RMStateUpdateAppAttemptEvent(attemptState));
}
RMAppAttemptImpl.FINAL_SAVING state transition
.addTransition(RMAppAttemptState.FINAL_SAVING,
EnumSet.of(RMAppAttemptState.FINISHING, RMAppAttemptState.FAILED,
RMAppAttemptState.KILLED, RMAppAttemptState.FINISHED),
RMAppAttemptEventType.ATTEMPT_UPDATE_SAVED,
new FinalStateSavedTransition())
case APP_ATTEMPT_REMOVED:
if (!(event instanceof AppAttemptRemovedSchedulerEvent)) {
throw new RuntimeException("Unexpected event type: " + event);
}
AppAttemptRemovedSchedulerEvent appAttemptRemovedEvent =
(AppAttemptRemovedSchedulerEvent) event;
removeApplicationAttempt(
appAttemptRemovedEvent.getApplicationAttemptID(),
appAttemptRemovedEvent.getFinalAttemptState(),
appAttemptRemovedEvent.getKeepContainersAcrossAppAttempts());
break;