After the stages have been split, Spark submits tasks front-to-back, one TaskSet per stage. Let's look at submitMissingTasks() in DAGScheduler:
/* stages are split back-to-front; tasks run front-to-back */
private def submitMissingTasks(stage: Stage, jobId: Int) {
val partitionsToCompute: Seq[Int] = stage.findMissingPartitions() // the ids of the partitions that still need to be computed
val properties = jobIdToActiveJob(jobId).properties // jobId comes from the stage; fetch the configuration attached to this stage's job
runningStages += stage // stages we are running right now
// Important! Compute the preferred locations for each task; the cached partition locations were cleared earlier when the job submission was handled
val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
case s: ResultStage =>
partitionsToCompute.map { id =>
val p = s.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
} catch {
  // (elided in this excerpt) on failure the stage is aborted and the method returns early
  case NonFatal(e) =>
    abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
    runningStages -= stage
    return
}
stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
if (partitionsToCompute.nonEmpty) {
stage.latestInfo.submissionTime = Some(clock.getTimeMillis()) // record when this stage attempt was submitted
}
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
val tasks: Seq[Task[_]] = try { // create one task per partition to compute
val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
stage match {
case stage: ShuffleMapStage =>
stage.pendingPartitions.clear()
partitionsToCompute.map { id =>
val locs = taskIdToLocations(id)
val part = partitions(id) // partitions = stage.rdd.partitions, defined earlier in the method (elided here)
stage.pendingPartitions += id
// one Task per partition: for a ShuffleMapStage, a ShuffleMapTask is created with the task's preferred locations
new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
Option(sc.applicationId), sc.applicationAttemptId)
}
case stage: ResultStage =>
partitionsToCompute.map { id =>
val p: Int = stage.partitions(id)
val part = partitions(p)
val locs = taskIdToLocations(id)
// one Task per partition: for a ResultStage, a ResultTask is created with the task's preferred locations
new ResultTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, id, properties, serializedTaskMetrics,
Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
}
}
} catch {
case NonFatal(e) =>
abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
runningStages -= stage
return
}
if (tasks.size > 0) {
// Important! Each stage gets exactly one TaskSet; TaskSchedulerImpl submits this batch of tasks to the executors.
// Next stop: TaskSchedulerImpl.submitTasks()
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
} else {
markStageAsFinished(stage, None)
submitWaitingChildStages(stage)
}
}
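The key point of submitMissingTasks() is that one task is created per partition to compute, and they are all submitted together as a single TaskSet. A minimal spark-shell illustration of that relationship (the numbers are made up):
// One task per partition: the shuffle stage runs as a TaskSet of 4 ShuffleMapTasks,
// and the final stage as a TaskSet of 4 ResultTasks (visible in the web UI).
val rdd = sc.parallelize(1 to 1000, numSlices = 4)   // 4 partitions
rdd.map(x => (x % 10, x)).reduceByKey(_ + _).count() // 2 stages, 4 tasks each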
Next, let's see how a task's best locations are found; getPreferredLocs() simply delegates to getPreferredLocsInternal():
private def getPreferredLocsInternal(
rdd: RDD[_],
partition: Int,
visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
// if this (rdd, partition) pair has already been visited, return Nil right away
if (!visited.add((rdd, partition))) {
return Nil
}
// if this partition of the rdd is cached, return the cached block locations
val cached = getCacheLocs(rdd)(partition)
if (cached.nonEmpty) {
return cached
}
// If the RDD has some placement preferences (as is the case for input RDDs), get those
// if the RDD itself reports placement preferences (checkpoint data, or HDFS block locations for input RDDs), return them
val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
if (rddPrefs.nonEmpty) {
return rddPrefs.map(TaskLocation(_))
}
// otherwise recurse into the narrow-dependency parents, all the way up the lineage, looking for a cached or checkpointed partition
rdd.dependencies.foreach {
case n: NarrowDependency[_] =>
for (inPart <- n.getParents(partition)) {
val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
if (locs != Nil) {
return locs
}
}
case _ =>
}
// nothing found: no preference
Nil
}
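To make the sources of these preferences concrete, here is a small spark-shell sketch (the HDFS path is hypothetical): for an input RDD the preferences come straight from the HDFS block locations, which is the rddPrefs branch above, and once an RDD is cached its block locations take over via the getCacheLocs branch.
// An input RDD reports the hosts holding each HDFS block as its preferred locations;
// getPreferredLocsInternal() returns exactly these for the stage's tasks.
val logs = sc.textFile("hdfs:///data/access.log") // hypothetical path
logs.partitions.take(3).foreach { p =>
  println(s"partition ${p.index} -> ${logs.preferredLocations(p).mkString(", ")}")
}
// After caching and materializing, narrow-dependency children of `logs` inherit the
// cached block locations through the recursive walk over rdd.dependencies.
val cachedLengths = logs.map(_.length).cache()
cachedLengths.count()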
With the preferred locations in hand, let's see how the tasks actually get run. As mentioned when SparkContext is initialized, TaskSchedulerImpl.initialize() creates a scheduling pool that decides the order in which tasks are scheduled. Find TaskSchedulerImpl.submitTasks(), at core\src\main\scala\org\apache\spark\scheduler\TaskSchedulerImpl.scala:
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
// create a TaskSetManager, which monitors and manages the execution of this TaskSet
val manager = createTaskSetManager(taskSet, maxTaskFailures)
val stage = taskSet.stageId
val stageTaskSets =
taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
stageTaskSets(taskSet.stageAttemptId) = manager
val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
ts.taskSet != taskSet && !ts.isZombie
}
if (conflictingTaskSet) {
throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
}
// the task scheduling pool created during SparkContext initialization (FIFO by default); add the TaskSetManager to it
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
this.cancel()
}
}
}, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
}
hasReceivedTask = true
}
// call backend.reviveOffers(); the backend implementation to look at is CoarseGrainedSchedulerBackend
backend.reviveOffers()
}
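The scheduling pool referenced above is built by the schedulableBuilder according to spark.scheduler.mode. A hedged configuration sketch (the app name, file path and pool name are examples, not taken from the walkthrough):
import org.apache.spark.{SparkConf, SparkContext}
// FIFO is the default; FAIR makes the pool order TaskSetManagers by fair-share rules.
val conf = new SparkConf()
  .setAppName("pool-demo")                                              // example name
  .set("spark.scheduler.mode", "FAIR")
  .set("spark.scheduler.allocation.file", "/path/to/fairscheduler.xml") // example path
val sc = new SparkContext(conf)
// Jobs submitted from this thread go into the named pool that addTaskSetManager()
// will place their TaskSetManagers under.
sc.setLocalProperty("spark.scheduler.pool", "production")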
A look at what TaskSetManager is responsible for:
/**
* Schedules the tasks within a single TaskSet in the TaskSchedulerImpl. This class keeps track of
* each task, retries tasks if they fail (up to a limited number of times), and
* handles locality-aware scheduling for this TaskSet via delay scheduling. The main interfaces
* to it are resourceOffer, which asks the TaskSet whether it wants to run a task on one node,
* and statusUpdate, which tells it that one of its tasks changed state (e.g. finished).
*/
Now find the backend implementation, CoarseGrainedSchedulerBackend, at core\src\main\scala\org\apache\spark\scheduler\cluster\CoarseGrainedSchedulerBackend.scala. Its reviveOffers() method simply calls makeOffers(), which drives the common task-scheduling flow, so let's go straight into it:
// Make fake resource offers on all executors
private def makeOffers() {
// the task assignment algorithm: decide which executor each task runs on
val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {
// Filter out executors under killing, keeping only the alive ones
val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
// wrap each usable executor into a WorkerOffer describing the CPU cores it has free
val workOffers = activeExecutors.map {
case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores,
Some(executorData.executorAddress.hostPort))
}.toIndexedSeq
scheduler.resourceOffers(workOffers)
}
if (!taskDescs.isEmpty) {
launchTasks(taskDescs)
}
}
A quick summary of makeOffers():
1. Call TaskSchedulerImpl.resourceOffers() to run the task assignment algorithm, which maps tasks onto executors (see the slot sketch below).
2. Once tasks are assigned to executors, call launchTasks() to send each executor a launch-task message; the executor then starts the task.
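As a rough illustration of step 1 (plain Scala, with a stand-in for the private WorkerOffer class and made-up numbers): each alive executor contributes one offer, and the number of tasks that can run at once is the sum of freeCores / spark.task.cpus over all offers.
// Stand-in for WorkerOffer (the real class is private[spark]); values are illustrative.
case class Offer(executorId: String, host: String, freeCores: Int)
val offers = Seq(Offer("0", "worker1", 4), Offer("1", "worker2", 2))
val cpusPerTask = 1                                   // spark.task.cpus, default 1
val slots = offers.map(_.freeCores / cpusPerTask).sum // 6 tasks can be launched this round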
Now for the core of the assignment algorithm, resourceOffers():
/**
* Called by cluster manager to offer resources on slaves. We respond by asking our active task
* sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so
* that tasks are balanced across the cluster.
*/
// takes the WorkerOffers built from the executors; returns, per offer, the TaskDescriptions assigned to that executor
def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>
offers.filter { offer =>
!blacklistTracker.isNodeBlacklisted(offer.host) &&
!blacklistTracker.isExecutorBlacklisted(offer.executorId)
}
}.getOrElse(offers)
// shuffle the available offers so that work is spread evenly across executors
val shuffledOffers = shuffleOffers(filteredOffers)
// Build a list of tasks to assign to each worker.
// tasks is effectively a two-dimensional buffer: one ArrayBuffer[TaskDescription] per offer, sized cores / CPUS_PER_TASK. CPUS_PER_TASK defaults to 1, i.e. one CPU core per task
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
// free CPU cores per offer
val availableCpus = shuffledOffers.map(o => o.cores).toArray
// total number of task slots available across all offers (not the number of tasks)
val availableSlots = shuffledOffers.map(o => o.cores / CPUS_PER_TASK).sum
// take the TaskSets from TaskSchedulerImpl's scheduling pool, already sorted by the scheduling policy
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
taskSet.executorAdded()
}
}
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
/**
 * Important!
 * TaskSet locality levels:
 *   PROCESS_LOCAL - the data (partition) and the task are in the same executor (same JVM process)
 *   NODE_LOCAL    - the data and the task are on the same worker node
 *   NO_PREF       - no locality preference
 *   RACK_LOCAL    - the data and the task are on the same rack
 *   ANY           - anywhere in the cluster
 */
// task assignment is a double loop
// the outer loop walks the TaskSets in scheduling order
for (taskSet <- sortedTaskSets) {
// Skip the barrier taskSet if the available slots are less than the number of pending tasks.
if (taskSet.isBarrier && availableSlots < taskSet.numTasks) {
} else {
var launchedAnyTask = false
// Record all the executor IDs assigned barrier tasks on.
val addressesWithDescs = ArrayBuffer[(String, TaskDescription)]()
// the inner loop walks this TaskSet's locality levels, tightest first
for (currentMaxLocality <- taskSet.myLocalityLevels) {
var launchedTaskAtCurrentMaxLocality = false
do {
// for the current TaskSet, start from its smallest (best) locality level and keep launching its tasks on the offers at that level;
// once no more tasks can be launched at this level, fall through to the next, looser level, and so on until the TaskSet's tasks have all been placed on executors
launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(taskSet,
currentMaxLocality, shuffledOffers, availableCpus, tasks, addressesWithDescs)
launchedAnyTask |= launchedTaskAtCurrentMaxLocality
} while (launchedTaskAtCurrentMaxLocality)
}
if (!launchedAnyTask) {
taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
}
}
}
// TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get
// launched within a configured time.
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
A short recap on where tasks run:
1. getPreferredLocsInternal() gives each task's preferred locations, i.e. where the task's partition is cached or checkpointed, if anywhere.
2. Why is there also a locality level? Suppose an application has 10 executors spread over 4 workers, and a task's partition is cached in executor02 on worker1. Running the task in executor02 on worker1 is obviously fastest; if executor02 does not belong to this application, then anywhere on worker1 is the next best choice. Launching each task as close as possible to its partition's data is what task locality is about (the waits involved are configurable; see the sketch below).
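The waits behind delay scheduling are ordinary Spark properties. A hedged sketch of the relevant keys (the values shown are just the defaults):
import org.apache.spark.SparkConf
// How long a TaskSet waits at a locality level before relaxing to the next one.
val conf = new SparkConf()
  .set("spark.locality.wait", "3s")         // global default for all levels
  .set("spark.locality.wait.process", "3s") // PROCESS_LOCAL before falling to NODE_LOCAL
  .set("spark.locality.wait.node", "3s")    // NODE_LOCAL before falling to RACK_LOCAL
  .set("spark.locality.wait.rack", "3s")    // RACK_LOCAL before falling to ANY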
Next, how does the scheduler decide whether a TaskSet's tasks can be launched on an executor at a given locality level? That is resourceOfferSingleTaskSet(), which returns a Boolean:
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
shuffledOffers: Seq[WorkerOffer],
availableCpus: Array[Int],
tasks: IndexedSeq[ArrayBuffer[TaskDescription]],
addressesWithDescs: ArrayBuffer[(String, TaskDescription)]) : Boolean = {
var launchedTask = false
for (i <- 0 until shuffledOffers.size) { // loop over every offer, i.e. every alive executor
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
if (availableCpus(i) >= CPUS_PER_TASK) { // only if the executor still has at least CPUS_PER_TASK free cores (default 1)
try {
// ask the TaskSetManager, via resourceOffer(), whether one of this TaskSet's tasks can start on this executor at this locality level;
// resourceOffer() returns an Option, so this for loop runs at most once per offer
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
tasks(i) += task // record the task in this executor's slot so it will be launched there
val tid = task.taskId // and register it in the in-memory bookkeeping maps
taskIdToTaskSetManager(tid) = taskSet
taskIdToExecutorId(tid) = execId
executorIdToRunningTaskIds(execId).add(tid)
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
// mark that at least one task was launched
launchedTask = true
}
} catch {
  case e: TaskNotSerializableException =>
    // an unserializable task will never succeed, so stop offering resources to this
    // TaskSet in this round, but let other TaskSets still be scheduled
    logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
    return launchedTask
}
}
}
return launchedTask
}
Next, TaskSetManager.resourceOffer(). It does two things:
1. Decide whether a task of this TaskSet can be launched on this executor at this locality level. 2. Apply delay scheduling: check how long the TaskSet has been waiting at each locality level; as long as the wait is still within the configured limit, the task is considered launchable on this executor at that level.
def resourceOffer(
execId: String,
host: String,
maxLocality: TaskLocality.TaskLocality)
: Option[TaskDescription] =
{
val offerBlacklisted = taskSetBlacklistHelperOpt.exists { blacklist =>
blacklist.isNodeBlacklistedForTaskSet(host) ||
blacklist.isExecutorBlacklistedForTaskSet(execId)
}
if (!isZombie && !offerBlacklisted) {
val curTime = clock.getTimeMillis()
var allowedLocality = maxLocality
if (maxLocality != TaskLocality.NO_PREF) {
// delay scheduling happens here!
allowedLocality = getAllowedLocalityLevel(curTime)
if (allowedLocality > maxLocality) {
// We're not allowed to search for farther-away tasks
allowedLocality = maxLocality
}
}
dequeueTask(execId, host, allowedLocality).map { case ((index, taskLocality, speculative)) =>
// Found a task; do some bookkeeping and return a task description
val task = tasks(index)
val taskId = sched.newTaskId()
// Do various bookkeeping
copiesRunning(index) += 1
val attemptNum = taskAttempts(index).size
val info = new TaskInfo(taskId, index, attemptNum, curTime,
execId, host, taskLocality, speculative)
taskInfos(taskId) = info
taskAttempts(index) = info :: taskAttempts(index)
// Update our locality level for delay scheduling
// NO_PREF will not affect the variables related to delay scheduling
if (maxLocality != TaskLocality.NO_PREF) {
currentLocalityIndex = getLocalityIndex(taskLocality)
lastLaunchTime = curTime
}
// Serialize and return the task
val serializedTask: ByteBuffer = try {
  ser.serialize(task)
} catch {
  // if the task cannot be serialized, retrying will never help, so abort the whole TaskSet
  case NonFatal(e) =>
    abort(s"Failed to serialize task $taskId, not attempting to retry it. Exception during serialization: $e")
    throw new TaskNotSerializableException(e)
}
addRunningTask(taskId)
sched.dagScheduler.taskStarted(task, info)
new TaskDescription(
taskId,
attemptNum,
execId,
taskName,
index,
addedFiles,
addedJars,
task.localProperties,
serializedTask)
}
} else {
None
}
}
The core of delay scheduling is getAllowedLocalityLevel() (it keeps getting more involved...):
private def getAllowedLocalityLevel(curTime: Long): TaskLocality.TaskLocality = {
// Remove the scheduled or finished tasks lazily
def tasksNeedToBeScheduledFrom(pendingTaskIds: ArrayBuffer[Int]): Boolean = {
var indexOffset = pendingTaskIds.size
while (indexOffset > 0) {
indexOffset -= 1
val index = pendingTaskIds(indexOffset)
if (copiesRunning(index) == 0 && !successful(index)) {
return true
} else {
pendingTaskIds.remove(indexOffset)
}
}
false
}
// Walk through the list of tasks that can be scheduled at each location and returns true
// if there are any tasks that still need to be scheduled. Lazily cleans up tasks that have
// already been scheduled.
def moreTasksToRunIn(pendingTasks: HashMap[String, ArrayBuffer[Int]]): Boolean = {
val emptyKeys = new ArrayBuffer[String]
val hasTasks = pendingTasks.exists {
case (id: String, tasks: ArrayBuffer[Int]) =>
if (tasksNeedToBeScheduledFrom(tasks)) {
true
} else {
emptyKeys += id
false
}
}
// The key could be executorId, host or rackId
emptyKeys.foreach(id => pendingTasks.remove(id))
hasTasks
}
while (currentLocalityIndex < myLocalityLevels.length - 1) {
val moreTasks = myLocalityLevels(currentLocalityIndex) match {
case TaskLocality.PROCESS_LOCAL => moreTasksToRunIn(pendingTasksForExecutor)
case TaskLocality.NODE_LOCAL => moreTasksToRunIn(pendingTasksForHost)
case TaskLocality.NO_PREF => pendingTasksWithNoPrefs.nonEmpty
case TaskLocality.RACK_LOCAL => moreTasksToRunIn(pendingTasksForRack)
}
if (!moreTasks) {
// This is a performance optimization: if there are no more tasks that can
// be scheduled at a particular locality level, there is no point in waiting
// for the locality wait timeout (SPARK-4939).
lastLaunchTime = curTime
logDebug(s"No tasks for locality level ${myLocalityLevels(currentLocalityIndex)}, " +
s"so moving to locality level ${myLocalityLevels(currentLocalityIndex + 1)}")
currentLocalityIndex += 1
} else if (curTime - lastLaunchTime >= localityWaits(currentLocalityIndex)) {
// Jump to the next locality level, and reset lastLaunchTime so that the next locality
// wait timer doesn't immediately expire
lastLaunchTime += localityWaits(currentLocalityIndex)
logDebug(s"Moving to ${myLocalityLevels(currentLocalityIndex + 1)} after waiting for " +
s"${localityWaits(currentLocalityIndex)}ms")
currentLocalityIndex += 1
} else {
return myLocalityLevels(currentLocalityIndex)
}
}
myLocalityLevels(currentLocalityIndex)
}
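Stripped of the bookkeeping, the loop above implements a simple policy: stay at the current level while it still has runnable tasks and the wait has not expired, otherwise move up one level. A toy, self-contained sketch of that policy (the names and the simplified lastLaunch handling are mine, not Spark's):
object DelaySchedulingSketch {
  // Same ordering as myLocalityLevels; ANY is last and never waits.
  val levels = Vector("PROCESS_LOCAL", "NODE_LOCAL", "RACK_LOCAL", "ANY")
  val waitMs = Vector(3000L, 3000L, 3000L) // per-level waits (spark.locality.wait.*)

  /** Return the index of the locality level we are allowed to schedule at. */
  def allowedLevel(startIndex: Int, lastLaunchMs: Long, nowMs: Long,
                   hasTasksAtLevel: Int => Boolean): Int = {
    var idx = startIndex
    while (idx < levels.length - 1) {
      if (!hasTasksAtLevel(idx)) idx += 1                    // nothing left here: skip ahead
      else if (nowMs - lastLaunchMs >= waitMs(idx)) idx += 1 // waited long enough: relax
      else return idx                                        // keep insisting on this level
    }
    idx
  }
}
// Example: tasks remain at PROCESS_LOCAL but only 1s has passed since the last launch,
// so only PROCESS_LOCAL tasks may be placed:
// DelaySchedulingSketch.allowedLevel(0, lastLaunchMs = 0L, nowMs = 1000L, _ => true) == 0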
Once the tasks have been assigned to their executors, launchTasks() sends each executor a message telling it to start them:
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
// serialize the TaskDescription
val serializedTask = TaskDescription.encode(task)
val executorData = executorDataMap(task.executorId) // look up the target executor
executorData.freeCores -= scheduler.CPUS_PER_TASK // deduct the cores this task will occupy
executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask))) // send a LaunchTask message to the executor, which starts the task on receipt
}
}
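On the receiving end, CoarseGrainedExecutorBackend handles this message in its receive method: the LaunchTask branch decodes the TaskDescription and hands it to the Executor's thread pool. Quoted from memory of the 2.x sources, so treat it as a sketch rather than an exact excerpt:
case LaunchTask(data) =>
  if (executor == null) {
    exitExecutor(1, "Received LaunchTask command but executor was null")
  } else {
    val taskDesc = TaskDescription.decode(data.value) // mirror of TaskDescription.encode above
    logInfo("Got assigned task " + taskDesc.taskId)
    executor.launchTask(this, taskDesc)               // runs the task on the executor's thread pool
  }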