在执行完task分配算法之后,每个task就分配到对应的executor上了,下一步就是由executor启动task。找到Executor的源码路径
core\src\main\scala\org\apache\spark\executor\Executor.scala,看下launchTask()方法:
/**
 * Wraps a task description in a [[TaskRunner]], records it as running and submits it to
 * the executor's thread pool.
 *
 * @param context         backend handed to the new runner
 * @param taskDescription description of the task to launch; its `taskId` keys `runningTasks`
 */
def launchTask(context: ExecutorBackend, taskDescription: TaskDescription): Unit = {
  val launchedTaskId = taskDescription.taskId
  // One runner is created per task.
  val runner = new TaskRunner(context, taskDescription)
  // Register the runner before submitting it, so it is already tracked when it starts.
  runningTasks.put(launchedTaskId, runner)
  // NOTE(review): whether a submitted runner has to wait for a free thread depends on how
  // threadPool is configured, which is not visible here — confirm.
  threadPool.execute(runner)
}
接着看下对task的封装,TaskRunner类:
class TaskRunner(
execBackend: ExecutorBackend,
private val taskDescription: TaskDescription)
extends Runnable {
// TaskRunner 实现了 Java 的 Runnable 接口,所以 run() 就是核心方法
/**
 * Body executed when this runner is run (the enclosing class extends Runnable).
 *
 * As far as this excerpt shows, it renames the current thread, reports RUNNING through
 * execBackend, fetches the task's files and jars, deserializes the task, runs it,
 * serializes the returned value and finally reports FINISHED.
 *
 * NOTE(review): this excerpt appears to be abridged — `serializedResult` is used at the end
 * but never defined here, `valueBytes` is computed but not used, and neither `try` has a
 * visible `catch`/`finally`. Confirm against the full source before relying on it.
 */
override def run(): Unit = {
// Record the id of the executing thread and give the thread this runner's name.
threadId = Thread.currentThread.getId
Thread.currentThread.setName(threadName)
val threadMXBean = ManagementFactory.getThreadMXBean
// Memory manager for this task, built from env.memoryManager and taskId;
// it is attached to the task once the task has been deserialized.
val taskMemoryManager = new TaskMemoryManager(env.memoryManager, taskId)
// Wall-clock and (when supported) CPU time taken before any deserialization work starts.
val deserializeStartTime = System.currentTimeMillis()
val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L
// The context class loader set here is the one passed to ser.deserialize below.
Thread.currentThread.setContextClassLoader(replClassLoader)
val ser = env.closureSerializer.newInstance()
// Tell the backend that this task is now RUNNING (no payload).
execBackend.statusUpdate(taskId, TaskState.RUNNING, EMPTY_BYTE_BUFFER)
var taskStart: Long = 0
var taskStartCpu: Long = 0
// Presumably a snapshot of total GC time before the task runs — confirm computeTotalGcTime.
startGCTime = computeTotalGcTime()
try {
// Publish the task's properties through Executor.taskDeserializationProps
// before the task itself is deserialized.
Executor.taskDeserializationProps.set(taskDescription.properties)
// Bring in the files and jars listed in the task description.
updateDependencies(taskDescription.addedFiles, taskDescription.addedJars)
// Deserialize the task object from its serialized form using the thread's context class loader.
task = ser.deserialize[Task[Any]](
taskDescription.serializedTask, Thread.currentThread.getContextClassLoader)
task.localProperties = taskDescription.properties
task.setTaskMemoryManager(taskMemoryManager)
taskStart = System.currentTimeMillis() // task start time (per the original note, presumably shown in the web UI)
taskStartCpu = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L
// Stays true unless task.run returns normally.
var threwException = true
val value = try {
// Execute the task through its run method.
val res = task.run(
taskAttemptId = taskId,
attemptNumber = taskDescription.attemptNumber,
metricsSystem = env.metricsSystem)
threwException = false
// res is whatever task.run returned.
// NOTE(review): the original note calls this a MapStatus that later tasks fetch via the
// map output tracker; that presumably applies only to shuffle map tasks — confirm.
res
}
val taskFinish = System.currentTimeMillis() // task end time
// Serialize the task's return value with a fresh instance of env.serializer,
// taking timestamps before and after the serialization.
val resultSer = env.serializer.newInstance()
val beforeSerialization = System.currentTimeMillis()
val valueBytes = resultSer.serialize(value)
val afterSerialization = System.currentTimeMillis()
// Task finished: report FINISHED through execBackend.statusUpdate.
// NOTE(review): per the original note the backend is presumably CoarseGrainedExecutorBackend;
// `serializedResult` is not defined anywhere in this excerpt — confirm how it is built.
execBackend.statusUpdate(taskId, TaskState.FINISHED, serializedResult)
}
}