After the stages have been split, Spark submits tasks front-to-back, one TaskSet per stage. Let's look at submitMissingTasks() in DAGScheduler:
/* stages are split back-to-front; tasks run front-to-back */
private def submitMissingTasks(stage: Stage, jobId: Int) {
val partitionsToCompute: Seq[Int] = stage.findMissingPartitions() // the ids of the partitions that still need to be computed
val properties = jobIdToActiveJob(jobId).properties // jobId comes from the stage; fetch the configuration attached to this stage's job
runningStages += stage // stages we are running right now
// Important! Compute the preferred locations for each task; the cached partition locations were cleared earlier when the job submission was handled
val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
case s: ResultStage =>
partitionsToCompute.map { id =>
val p = s.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
} catch {
  // (elided in this excerpt) on failure the stage is aborted and the method returns early
  case NonFatal(e) =>
    abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
    runningStages -= stage
    return
}
stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
if (partitionsToCompute.nonEmpty) {
stage.latestInfo.submissionTime = Some(clock.getTimeMillis()) // record when this stage attempt was submitted
}
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
val tasks: Seq[Task[_]] = try { // create one task per partition to compute
val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
stage match {
case stage: ShuffleMapStage =>
stage.pendingPartitions.clear()
partitionsToCompute.map { id =>
val locs = taskIdToLocations(id)
val part = partitions(id) // partitions = stage.rdd.partitions, defined earlier in the method (elided here)
stage.pendingPartitions += id
// one Task per partition: for a ShuffleMapStage, a ShuffleMapTask is created with the task's preferred locations
new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
Option(sc.applicationId), sc.applicationAttemptId)
}
case stage: ResultStage =>
partitionsToCompute.map { id =>
val p: Int = stage.partitions(id)
val part = partitions(p)
val locs = taskIdToLocations(id)
// one Task per partition: for a ResultStage, a ResultTask is created with the task's preferred locations
new ResultTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, id, properties, serializedTaskMetrics,
Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
}
}
} catch {
case NonFatal(e) =>
abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
runningStages -= stage
return
}
if (tasks.size > 0) {
// Important! Each stage gets exactly one TaskSet; TaskSchedulerImpl submits this batch of tasks to the executors.
// Next stop: TaskSchedulerImpl.submitTasks()
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
} else {
markStageAsFinished(stage, None)
submitWaitingChildStages(stage)
}
}
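The key point of submitMissingTasks() is that one task is created per partition to compute, and they are all submitted together as a single TaskSet. A minimal spark-shell illustration of that relationship (the numbers are made up):
// One task per partition: the shuffle stage runs as a TaskSet of 4 ShuffleMapTasks,
// and the final stage as a TaskSet of 4 ResultTasks (visible in the web UI).
val rdd = sc.parallelize(1 to 1000, numSlices = 4)   // 4 partitions
rdd.map(x => (x % 10, x)).reduceByKey(_ + _).count() // 2 stages, 4 tasks each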
Next, let's see how a task's best locations are found; getPreferredLocs() simply delegates to getPreferredLocsInternal():
private def getPreferredLocsInternal(
rdd: RDD[_],
partition: Int,
visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
// if this (rdd, partition) pair has already been visited, return Nil right away
if (!visited.add((rdd, partition))) {
return Nil
}
// if this partition of the rdd is cached, return the cached block locations
val cached = getCacheLocs(rdd)(partition)
if (cached.nonEmpty) {
return cached
}
// If the RDD has some placement preferences (as is the case for input RDDs), get those
// if the RDD itself reports placement preferences (checkpoint data, or HDFS block locations for input RDDs), return them
val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
if (rddPrefs.nonEmpty) {
return rddPrefs.map(TaskLocation(_))
}
// otherwise recurse into the narrow-dependency parents, all the way up the lineage, looking for a cached or checkpointed partition
rdd.dependencies.foreach {
case n: NarrowDependency[_] =>
for (inPart <- n.getParents(partition)) {
val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
if (locs != Nil) {
return locs
}
}
case _ =>
}
// nothing found: no preference
Nil
}
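To make the sources of these preferences concrete, here is a small spark-shell sketch (the HDFS path is hypothetical): for an input RDD the preferences come straight from the HDFS block locations, which is the rddPrefs branch above, and once an RDD is cached its block locations take over via the getCacheLocs branch.
// An input RDD reports the hosts holding each HDFS block as its preferred locations;
// getPreferredLocsInternal() returns exactly these for the stage's tasks.
val logs = sc.textFile("hdfs:///data/access.log") // hypothetical path
logs.partitions.take(3).foreach { p =>
  println(s"partition ${p.index} -> ${logs.preferredLocations(p).mkString(", ")}")
}
// After caching and materializing, narrow-dependency children of `logs` inherit the
// cached block locations through the recursive walk over rdd.dependencies.
val cachedLengths = logs.map(_.length).cache()
cachedLengths.count()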
With the preferred locations in hand, let's see how the tasks actually get run. As mentioned when SparkContext is initialized, TaskSchedulerImpl.initialize() creates a scheduling pool that decides the order in which tasks are scheduled. Find TaskSchedulerImpl.submitTasks(), at core\src\main\scala\org\apache\spark\scheduler\TaskSchedulerImpl.scala:
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
// create a TaskSetManager, which monitors and manages the execution of this TaskSet
val manager = createTaskSetManager(taskSet, maxTaskFailures)
val stage = taskSet.stageId
val stageTaskSets =
taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
stageTaskSets(taskSet.stageAttemptId) = manager
val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
ts.taskSet != taskSet && !ts.isZombie
}
if (conflictingTaskSet) {
throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
}
// the task scheduling pool created during SparkContext initialization (FIFO by default); add the TaskSetManager to it
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
this.cancel()
}
}
}, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
}
hasReceivedTask = true
}
// call backend.reviveOffers(); the backend implementation to look at is CoarseGrainedSchedulerBackend
backend.reviveOffers()
}
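The scheduling pool referenced above is built by the schedulableBuilder according to spark.scheduler.mode. A hedged configuration sketch (the app name, file path and pool name are examples, not taken from the walkthrough):
import org.apache.spark.{SparkConf, SparkContext}
// FIFO is the default; FAIR makes the pool order TaskSetManagers by fair-share rules.
val conf = new SparkConf()
  .setAppName("pool-demo")                                              // example name
  .set("spark.scheduler.mode", "FAIR")
  .set("spark.scheduler.allocation.file", "/path/to/fairscheduler.xml") // example path
val sc = new SparkContext(conf)
// Jobs submitted from this thread go into the named pool that addTaskSetManager()
// will place their TaskSetManagers under.
sc.setLocalProperty("spark.scheduler.pool", "production")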
A look at what TaskSetManager is responsible for:
/**
* Schedules the tasks within a single TaskSet in the TaskSchedulerImpl. This class keeps track of
* each task, retries tasks if they fail (up to a limited number of times), and
* handles locality-aware scheduling for this TaskSet via delay scheduling. The main interfaces
* to it are resourceOffer, which asks the TaskSet whether it wants to run a task on one node,
* and statusUpdate, which tells it that one of its tasks changed state (e.g. finished).
*/
Now find the backend implementation, CoarseGrainedSchedulerBackend, at core\src\main\scala\org\apache\spark\scheduler\cluster\CoarseGrainedSchedulerBackend.scala. Its reviveOffers() method simply calls makeOffers(), which drives the common task-scheduling flow, so let's go straight into it:
// Make fake resource offers on all executors
private def makeOffers() {
// the task assignment algorithm: decide which executor each task runs on
val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {
// Filter out executors under killing, keeping only the alive ones
val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
// wrap each usable executor into a WorkerOffer describing the CPU cores it has free
val workOffers = activeExecutors.map {
case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores,
Some(executorData.executorAddress.hostPort))
}.toIndexedSeq
scheduler.resourceOffers(workOffers)
}
if (!taskDescs.isEmpty) {
launchTasks(taskDescs)
}
}
A quick summary of makeOffers():
1. Call TaskSchedulerImpl.resourceOffers() to run the task assignment algorithm, which maps tasks onto executors (see the slot sketch below).
2. Once tasks are assigned to executors, call launchTasks() to send each executor a launch-task message; the executor then starts the task.
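As a rough illustration of step 1 (plain Scala, with a stand-in for the private WorkerOffer class and made-up numbers): each alive executor contributes one offer, and the number of tasks that can run at once is the sum of freeCores / spark.task.cpus over all offers.
// Stand-in for WorkerOffer (the real class is private[spark]); values are illustrative.
case class Offer(executorId: String, host: String, freeCores: Int)
val offers = Seq(Offer("0", "worker1", 4), Offer("1", "worker2", 2))
val cpusPerTask = 1                                   // spark.task.cpus, default 1
val slots = offers.map(_.freeCores / cpusPerTask).sum // 6 tasks can be launched this round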
Now for the core of the assignment algorithm, resourceOffers():
/**
* Called by cluster manager to offer resources on slaves. We respond by asking our active task
* sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so
* that tasks are balanced across the cluster.
*/
// takes the WorkerOffers built from the executors; returns, per offer, the TaskDescriptions assigned to that executor
def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>
offers.filter { offer =>
!blacklistTracker.isNodeBlacklisted(offer.host) &&
!blacklistTracker.isExecutorBlacklisted(offer.executorId)
}
}.getOrElse(offers)
// shuffle the available offers so that work is spread evenly across executors
val shuffledOffers = shuffleOffers(filteredOffers)
// Build a list of tasks to assign to each worker.
// tasks is effectively a two-dimensional buffer: one ArrayBuffer[TaskDescription] per offer, sized cores / CPUS_PER_TASK. CPUS_PER_TASK defaults to 1, i.e. one CPU core per task
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
// free CPU cores per offer
val availableCpus = shuffledOffers.map(o => o.cores).toArray
// total number of task slots available across all offers (not the number of tasks)
val availableSlots = shuffledOffers.map(o => o.cores / CPUS_PER_TASK).sum
// take the TaskSets from TaskSchedulerImpl's scheduling pool, already sorted by the scheduling policy
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
taskSet.executorAdded()
}
}
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
/**
 * Important!
 * TaskSet locality levels:
 *   PROCESS_LOCAL - the data (partition) and the task are in the same executor (same JVM process)
 *   NODE_LOCAL    - the data and the task are on the same worker node
 *   NO_PREF       - no locality preference
 *   RACK_LOCAL    - the data and the task are on the same rack
 *   ANY           - anywhere in the cluster
 */
// task assignment is a double loop
// the outer loop walks the TaskSets in scheduling order
for (taskSet <- sortedTaskSets) {
// Skip the barrier taskSet if the available slots are less than the number of pending tasks.
if (taskSet.isBarrier && availableSlots < taskSet.numTasks) {
} else {
var launchedAnyTask = false
// Record all the executor IDs assigned barrier tasks on.
val addressesWithDescs = ArrayBuffer[(String, TaskDescription)]()
// the inner loop walks this TaskSet's locality levels, tightest first
for (currentMaxLocality <- taskSet.myLocalityLevels) {
var launchedTaskAtCurrentMaxLocality = false
do {
// for the current TaskSet, start from its smallest (best) locality level and keep launching its tasks on the offers at that level;
// once no more tasks can be launched at this level, fall through to the next, looser level, and so on until the TaskSet's tasks have all been placed on executors
launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(taskSet,
currentMaxLocality, shuffledOffers, availableCpus, tasks, addressesWithDescs)
launchedAnyTask |= launchedTaskAtCurrentMaxLocality
} while (launchedTaskAtCurrentMaxLocality)
}
if (!launchedAnyTask) {
taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
}
}
}
// TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get
// launched within a configured time.
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
A short recap on where tasks run:
1. getPreferredLocsInternal() gives each task's preferred locations, i.e. where the task's partition is cached or checkpointed, if anywhere.
2. Why is there also a locality level? Suppose an application has 10 executors spread over 4 workers, and a task's partition is cached in executor02 on worker1. Running the task in executor02 on worker1 is obviously fastest; if executor02 does not belong to this application, then anywhere on worker1 is the next best choice. Launching each task as close as possible to its partition's data is what task locality is about (the waits involved are configurable; see the sketch below).
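The waits behind delay scheduling are ordinary Spark properties. A hedged sketch of the relevant keys (the values shown are just the defaults):
import org.apache.spark.SparkConf
// How long a TaskSet waits at a locality level before relaxing to the next one.
val conf = new SparkConf()
  .set("spark.locality.wait", "3s")         // global default for all levels
  .set("spark.locality.wait.process", "3s") // PROCESS_LOCAL before falling to NODE_LOCAL
  .set("spark.locality.wait.node", "3s")    // NODE_LOCAL before falling to RACK_LOCAL
  .set("spark.locality.wait.rack", "3s")    // RACK_LOCAL before falling to ANY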
Next, how does the scheduler decide whether a TaskSet's tasks can be launched on an executor at a given locality level? That is resourceOfferSingleTaskSet(), which returns a Boolean:
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
shuffledOffers: Seq[WorkerOffer],
availableCpus: Array[Int],
tasks: IndexedSeq[ArrayBuffer[TaskDescription]],
addressesWithDescs: ArrayBuffer[(String, TaskDescription)]) : Boolean = {
var launchedTask = false
for (i <- 0 until shuffledOffers.size) { // loop over every offer, i.e. every alive executor
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
if (availableCpus(i) >= CPUS_PER_TASK) { // only if the executor still has at least CPUS_PER_TASK free cores (default 1)
try {
// ask the TaskSetManager, via resourceOffer(), whether one of this TaskSet's tasks can start on this executor at this locality level;
// resourceOffer() returns an Option, so this for loop runs at most once per offer
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
tasks(i) += task // record the task in this executor's slot so it will be launched there
val tid = task.taskId // and register it in the in-memory bookkeeping maps
taskIdToTaskSetManager(tid) = taskSet
taskIdToExecutorId(tid) = execId
executorIdToRunningTaskIds(execId).add(tid)
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
// mark that at least one task was launched
launchedTask = true
}
} catch {
  case e: TaskNotSerializableException =>
    // an unserializable task will never succeed, so stop offering resources to this
    // TaskSet in this round, but let other TaskSets still be scheduled
    logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
    return launchedTask
}
}
}
return launchedTask
}
Next, TaskSetManager.resourceOffer(). It does two things:
1. Decide whether a task of this TaskSet can be launched on this executor at this locality level. 2. Apply delay scheduling: check how long the TaskSet has been waiting at each locality level; as long as the wait is still within the configured limit, the task is considered launchable on this executor at that level.
def resourceOffer(
execId: String,
host: String,
maxLocality: TaskLocality.TaskLocality)
: Option[TaskDescription] =
{
val offerBlacklisted = taskSetBlacklistHelperOpt.exists { blacklist =>
blacklist.isNodeBlacklistedForTaskSet(host) ||
blacklist.isExecutorBlacklistedForTaskSet(execId)
}
if (!isZombie && !offerBlacklisted) {
val curTime = clock.getTimeMillis()
var allowedLocality = maxLocality
if (maxLocality != TaskLocality.NO_PREF) {
// delay scheduling happens here!
allowedLocality = getAllowedLocalityLevel(curTime)
if (allowedLocality > maxLocality) {
// We're not allowed to search for farther-away tasks
allowedLocality = maxLocality
}
}
dequeueTask(execId, host, allowedLocality).map { case ((index, taskLocality, speculative)) =>
// Found a task; do some bookkeeping and return a task description
val task = tasks(index)
val taskId = sched.newTaskId()
// Do various bookkeeping
copiesRunning(index) += 1
val attemptNum = taskAttempts(index).size
val info = new TaskInfo(taskId, index, attemptNum, curTime,
execId, host, taskLocality, speculative)
taskInfos(taskId) = info
taskAttempts(index) = info :: taskAttempts(index)
// Update our locality level for delay scheduling
// NO_PREF will not affect the variables related to delay scheduling
if (maxLocality != TaskLocality.NO_PREF) {
currentLocalityIndex = getLocalityIndex(taskLocality)
lastLaunchTime = curTime
}
// Serialize and return the task
val serializedTask: ByteBuffer = try {
  ser.serialize(task)
} catch {
  // if the task cannot be serialized, retrying will never help, so abort the whole TaskSet
  case NonFatal(e) =>
    abort(s"Failed to serialize task $taskId, not attempting to retry it. Exception during serialization: $e")
    throw new TaskNotSerializableException(e)
}
addRunningTask(taskId)
sched.dagScheduler.taskStarted(task, info)
new TaskDescription(
taskId,
attemptNum,
execId,
taskName,
index,
addedFiles,
addedJars,
task.localProperties,
serializedTask)
}
} else {
None
}
}
The core of delay scheduling is getAllowedLocalityLevel() (it keeps getting more involved...):
private def getAllowedLocalityLevel(curTime: Long): TaskLocality.TaskLocality = {
// Remove the scheduled or finished tasks lazily
def tasksNeedToBeScheduledFrom(pendingTaskIds: ArrayBuffer[Int]): Boolean = {
var indexOffset = pendingTaskIds.size
while (indexOffset > 0) {
indexOffset -= 1
val index = pendingTaskIds(indexOffset)
if (copiesRunning(index) == 0 && !successful(index)) {
return true
} else {
pendingTaskIds.remove(indexOffset)
}
}
false
}
// Walk through the list of tasks that can be scheduled at each location and returns true
// if there are any tasks that still need to be scheduled. Lazily cleans up tasks that have
// already been scheduled.
def moreTasksToRunIn(pendingTasks: HashMap[String, ArrayBuffer[Int]]): Boolean = {
val emptyKeys = new ArrayBuffer[String]
val hasTasks = pendingTasks.exists {
case (id: String, tasks: ArrayBuffer[Int]) =>
if (tasksNeedToBeScheduledFrom(tasks)) {
true
} else {
emptyKeys += id
false
}
}
// The key could be executorId, host or rackId
emptyKeys.foreach(id => pendingTasks.remove(id))
hasTasks
}
while (currentLocalityIndex < myLocalityLevels.length - 1) {
val moreTasks = myLocalityLevels(currentLocalityIndex) match {
case TaskLocality.PROCESS_LOCAL => moreTasksToRunIn(pendingTasksForExecutor)
case TaskLocality.NODE_LOCAL => moreTasksToRunIn(pendingTasksForHost)
case TaskLocality.NO_PREF => pendingTasksWithNoPrefs.nonEmpty
case TaskLocality.RACK_LOCAL => moreTasksToRunIn(pendingTasksForRack)
}
if (!moreTasks) {
// This is a performance optimization: if there are no more tasks that can
// be scheduled at a particular locality level, there is no point in waiting
// for the locality wait timeout (SPARK-4939).
lastLaunchTime = curTime
logDebug(s"No tasks for locality level ${myLocalityLevels(currentLocalityIndex)}, " +
s"so moving to locality level ${myLocalityLevels(currentLocalityIndex + 1)}")
currentLocalityIndex += 1
} else if (curTime - lastLaunchTime >= localityWaits(currentLocalityIndex)) {
// Jump to the next locality level, and reset lastLaunchTime so that the next locality
// wait timer doesn't immediately expire
lastLaunchTime += localityWaits(currentLocalityIndex)
logDebug(s"Moving to ${myLocalityLevels(currentLocalityIndex + 1)} after waiting for " +
s"${localityWaits(currentLocalityIndex)}ms")
currentLocalityIndex += 1
} else {
return myLocalityLevels(currentLocalityIndex)
}
}
myLocalityLevels(currentLocalityIndex)
}
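Stripped of the bookkeeping, the loop above implements a simple policy: stay at the current level while it still has runnable tasks and the wait has not expired, otherwise move up one level. A toy, self-contained sketch of that policy (the names and the simplified lastLaunch handling are mine, not Spark's):
object DelaySchedulingSketch {
  // Same ordering as myLocalityLevels; ANY is last and never waits.
  val levels = Vector("PROCESS_LOCAL", "NODE_LOCAL", "RACK_LOCAL", "ANY")
  val waitMs = Vector(3000L, 3000L, 3000L) // per-level waits (spark.locality.wait.*)

  /** Return the index of the locality level we are allowed to schedule at. */
  def allowedLevel(startIndex: Int, lastLaunchMs: Long, nowMs: Long,
                   hasTasksAtLevel: Int => Boolean): Int = {
    var idx = startIndex
    while (idx < levels.length - 1) {
      if (!hasTasksAtLevel(idx)) idx += 1                    // nothing left here: skip ahead
      else if (nowMs - lastLaunchMs >= waitMs(idx)) idx += 1 // waited long enough: relax
      else return idx                                        // keep insisting on this level
    }
    idx
  }
}
// Example: tasks remain at PROCESS_LOCAL but only 1s has passed since the last launch,
// so only PROCESS_LOCAL tasks may be placed:
// DelaySchedulingSketch.allowedLevel(0, lastLaunchMs = 0L, nowMs = 1000L, _ => true) == 0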
Once the tasks have been assigned to their executors, launchTasks() sends each executor a message telling it to start them:
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
// serialize the TaskDescription
val serializedTask = TaskDescription.encode(task)
val executorData = executorDataMap(task.executorId) // look up the target executor
executorData.freeCores -= scheduler.CPUS_PER_TASK // deduct the cores this task will occupy
executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask))) // send a LaunchTask message to the executor, which starts the task on receipt
}
}
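On the receiving end, CoarseGrainedExecutorBackend handles this message in its receive method: the LaunchTask branch decodes the TaskDescription and hands it to the Executor's thread pool. Quoted from memory of the 2.x sources, so treat it as a sketch rather than an exact excerpt:
case LaunchTask(data) =>
  if (executor == null) {
    exitExecutor(1, "Received LaunchTask command but executor was null")
  } else {
    val taskDesc = TaskDescription.decode(data.value) // mirror of TaskDescription.encode above
    logInfo("Got assigned task " + taskDesc.taskId)
    executor.launchTask(this, taskDesc)               // runs the task on the executor's thread pool
  }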