①driver的状态改变
case DriverStateChanged(driverId, state, exception) =>
  // A driver state update is only accepted for the four terminal states below;
  // in that case the driver is removed. Any other state is rejected with an exception.
  val isTerminal = state match {
    case DriverState.ERROR | DriverState.FINISHED | DriverState.KILLED | DriverState.FAILED =>
      true
    case _ =>
      false
  }
  if (isTerminal) {
    removeDriver(driverId, state, exception)
  } else {
    throw new Exception(s"Received unexpected state update for driver $driverId: $state")
  }
/**
 * Removes the driver with the given id from the set of active drivers and records it
 * in the completed-driver history together with its final state.
 *
 * If no tracked driver has that id, nothing is changed and a warning is logged.
 *
 * @param driverId   id of the driver to remove
 * @param finalState state stored on the driver after it has been removed
 * @param exception  optional exception stored on the driver alongside the final state
 */
private def removeDriver(
    driverId: String,
    finalState: DriverState,
    exception: Option[Exception]): Unit = {
  drivers.find(_.id == driverId) match {
    case Some(driver) =>
      logInfo(s"Removing driver: $driverId")
      // Drop the driver from the in-memory set of active drivers.
      drivers -= driver
      // Keep the completed-driver history bounded: once it holds RETAINED_DRIVERS
      // entries, discard a tenth of that limit (at least one entry) from the front
      // of the buffer before appending the new one.
      if (completedDrivers.size >= RETAINED_DRIVERS) {
        val toRemove = math.max(RETAINED_DRIVERS / 10, 1)
        completedDrivers.trimStart(toRemove)
      }
      completedDrivers += driver
      // The driver is no longer active, so remove it from the persistence engine too.
      persistenceEngine.removeDriver(driver)
      driver.state = finalState
      driver.exception = exception
      // Detach the driver from the worker it was assigned to, if any.
      driver.worker.foreach(w => w.removeDriver(driver))
      schedule()
    case None =>
      logWarning(s"Asked to remove unknown driver: $driverId")
  }
}
②executor的状态改变
// Handles a state change reported for one executor of an application: records the new
// state, forwards the update to the application's driver and, when the executor has
// finished, removes it and possibly fails the whole application after too many retries.
case ExecutorStateChanged(appId, execId, state, message, exitStatus) =>
// Look up the application by id, then the executor in that application's executor map.
val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
execOption match {
case Some(exec) =>
// Record the executor's new state, keeping the old one for the transition check below.
val appInfo = idToApp(appId)
val oldState = exec.state
exec.state = state
if (state == ExecutorState.RUNNING) {
// RUNNING may only be entered from LAUNCHING; entering it also resets the
// application's retry count.
assert(oldState == ExecutorState.LAUNCHING,
s"executor $execId state transfer from $oldState to RUNNING is illegal")
appInfo.resetRetryCount()
}
// Forward the update to the driver endpoint of the executor's application.
exec.application.driver.send(ExecutorUpdated(execId, state, message, exitStatus, false))
if (ExecutorState.isFinished(state)) {
// Remove this executor from the worker and app
logInfo(s"Removing executor ${exec.fullId} because it is $state")
// If an application has already finished, preserve its
// state to display its information properly on the UI
if (!appInfo.isFinished) {
appInfo.removeExecutor(exec) // remove the executor from the application
}
exec.worker.removeExecutor(exec) // remove the executor from its worker
val normalExit = exitStatus == Some(0)
// Only retry certain number of times so we don't go into an infinite loop.
// Important note: this code path is not exercised by tests, so be very careful when
// changing this `if` condition.
// Taken only when the executor exited abnormally. Note that incrementRetryCount() is a
// side effect that runs for every abnormal exit, before MAX_EXECUTOR_RETRIES >= 0 is
// evaluated.
if (!normalExit
&& appInfo.incrementRetryCount() >= MAX_EXECUTOR_RETRIES // retry count has reached the limit
&& MAX_EXECUTOR_RETRIES >= 0) { // < 0 disables this application-killing path
val execs = appInfo.executors.values
if (!execs.exists(_.state == ExecutorState.RUNNING)) { // no executor of this app is RUNNING
logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
s"${appInfo.retryCount} times; removing it")
removeApplication(appInfo, ApplicationState.FAILED)
}
}
}
schedule()
case None =>
// Either the application or the executor id is not known here.
logWarning(s"Got status update for unknown executor $appId/$execId")
}
/**
 * Removes an application from the active bookkeeping structures and records it in the
 * completed-application history with the given final state.
 *
 * Does nothing if the application is not in `apps`.
 *
 * @param app   the application to remove
 * @param state final state the application is marked with; for any state other than
 *              `ApplicationState.FINISHED` the application's driver is also sent an
 *              `ApplicationRemoved` message
 */
def removeApplication(app: ApplicationInfo, state: ApplicationState.Value): Unit = {
  if (apps.contains(app)) {
    logInfo(s"Removing app ${app.id}")
    // Drop the application from every in-memory lookup structure.
    apps -= app
    idToApp -= app.id
    endpointToApp -= app.driver
    addressToApp -= app.driver.address
    if (reverseProxy) {
      webUi.removeProxyTargets(app.id)
    }
    // Keep the completed-application history bounded: once it holds
    // RETAINED_APPLICATIONS entries, discard a tenth of that limit (at least one entry)
    // from the front of the buffer, unregistering each discarded app's metrics source.
    if (completedApps.size >= RETAINED_APPLICATIONS) {
      val toRemove = math.max(RETAINED_APPLICATIONS / 10, 1)
      completedApps.take(toRemove).foreach { a =>
        applicationMetricsSystem.removeSource(a.appSource)
      }
      completedApps.trimStart(toRemove)
    }
    completedApps += app // Remember it in our history
    waitingApps -= app
    // Kill every executor that still belongs to the application.
    for (exec <- app.executors.values) {
      killExecutor(exec)
    }
    app.markFinished(state)
    if (state != ApplicationState.FINISHED) {
      // Notify the application's driver that the application was removed.
      app.driver.send(ApplicationRemoved(state.toString))
    }
    persistenceEngine.removeApplication(app)
    schedule()
    // Tell all workers that the application has finished, so they can clean up any app state.
    workers.foreach { w =>
      w.endpoint.send(ApplicationFinished(app.id))
    }
  }
}