这里以count操作为例,一步步解析Spark在执行一个Job时如何进行DAG图的解析。Spark在遇到Action类型算子时,会使用SparkContext进行一系列的runJob方法调用,最终会调用DAGScheduler的runJob方法来划分DAG图。
一、runJob方法调用
// 计算RDD中包含的键值对个数,此时会触发一个SparkContext来提交执行Job
def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum
def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U): Array[U] = {
runJob(rdd, func, 0 until rdd.partitions.length)
}
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: Iterator[T] => U,
partitions: Seq[Int]): Array[U] = {
val cleanedFunc = clean(func)
runJob(rdd, (ctx: TaskContext, it: Iterator[T]) => cleanedFunc(it), partitions)
}
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int]): Array[U] = {
val results = new Array[U](partitions.size)
runJob[T, U](rdd, func, partitions, (index, res) => results(index) = res)
results
}
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
resultHandler: (Int, U) => Unit): Unit = {
if (stopped.get()) {
throw new IllegalStateException("SparkContext has been shutdown")
}
val callSite = getCallSite
val cleanedFunc = clean(func)
logInfo("Starting job: " + callSite.shortForm)
if (conf.getBoolean("spark.logLineage", false)) {
logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
}
// 调用DAGScheduler的runJob进行处理,负责任务的逻辑调度,将Job拆分成不同阶段的具有依赖关系的任务集
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
progressBar.foreach(_.finishAll())
// 调用rdd的doCheckPoint方法来缓存RDD数据,会以一个额外的Job来执行
rdd.doCheckpoint()
}
二、DAGScheduler中进行DAG划分
1、runJob方法,该方法主要使用DAGScheduler来对Job进行逻辑提交执行。
def runJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): Unit = {
val start = System.nanoTime
// DAGScheduler提交该Job,它会等待作业提交的结果。此时提交任务的线程会在这里阻塞直至返回Job
// 执行结果。然后判断一下成功或者是失败来进行下一步操作。
val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait]
waiter.completionFuture.ready(Duration.Inf)(awaitPermission)
waiter.completionFuture.value.get match {
case scala.util.Success(_) =>
logInfo("Job %d finished: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
case scala.util.Failure(exception) =>