Running the command bin/spark-submit
actually executes the org.apache.spark.deploy.SparkSubmit class. Inside it, doRunMain uses Class.forName to load the user-defined class and invokes its main method via reflection.
The user's main method constructs a SparkContext, whose primary constructor then runs the steps below.
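A minimal sketch of that reflective invocation (the class name and argument handling here are illustrative assumptions, not the actual SparkSubmit code):
// Hypothetical sketch: loading a user class by name and invoking its main
// method via reflection, similar in spirit to SparkSubmit's doRunMain.
object ReflectiveMainSketch {
  def main(args: Array[String]): Unit = {
    val userClassName = "com.example.MySparkApp"               // assumed user-defined class
    val userClass  = Class.forName(userClassName)              // load the class by name
    val mainMethod = userClass.getMethod("main", classOf[Array[String]])
    // Pass the argument array as a single Object, since main takes one Array[String]
    mainMethod.invoke(null, args.asInstanceOf[AnyRef])
  }
}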
1. Call createSparkEnv to create an ActorSystem, the Akka facility used for communication
//TODO This method creates an ActorSystem
private[spark] def createSparkEnv(
conf: SparkConf,
isLocal: Boolean,
listenerBus: LiveListenerBus): SparkEnv = {
SparkEnv.createDriverEnv(conf, isLocal, listenerBus)
}
private[spark] val env = createSparkEnv(conf, isLocal, listenerBus)
SparkEnv.set(env)
// Create the ActorSystem for Akka and get the port it binds to.
val (actorSystem, boundPort) = {
val actorSystemName = if (isDriver) driverActorSystemName else executorActorSystemName
//TODO Use the AkkaUtils helper class to create the ActorSystem
AkkaUtils.createActorSystem(actorSystemName, hostname, port, conf, securityManager)
}
2. Create the TaskScheduler -> match against the master URL of the submitted job (deploy mode) -> TaskSchedulerImpl -> SparkDeploySchedulerBackend (which involves two actors: ClientActor and DriverActor)
//TODO Create a TaskScheduler
private[spark] var (schedulerBackend, taskScheduler) =
SparkContext.createTaskScheduler(this, master)
val SPARK_REGEX = """spark://(.*)""".r
//TODO Spark standalone mode, matched by the spark:// URL prefix
case SPARK_REGEX(sparkUrl) =>
//TODO Create a TaskSchedulerImpl
val scheduler = new TaskSchedulerImpl(sc)
val masterUrls = sparkUrl.split(",").map("spark://" + _)
//TODO Create a SparkDeploySchedulerBackend
val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
//TODO Call initialize to wire the backend into the scheduler
scheduler.initialize(backend)
(backend, scheduler)
3. Create an actor through the ActorSystem; it handles the heartbeats between the Executors and the DriverActor
//TODO Create an actor through the ActorSystem to handle heartbeats between the Executors and the DriverActor
private val heartbeatReceiver = env.actorSystem.actorOf(
Props(new HeartbeatReceiver(taskScheduler)), "HeartbeatReceiver")
4. Create a DAGScheduler, used later to split the DAG into stages
Note: this here refers to the SparkContext
//TODO Create a DAGScheduler, used later to split the DAG into stages
dagScheduler = new DAGScheduler(this)
5. Start the taskScheduler
SparkContext:
//TODO Start the taskScheduler
taskScheduler.start()
TaskSchedulerImpl:
//TODO First call SparkDeploySchedulerBackend's start method
backend.start()
SparkDeploySchedulerBackend:
//TODO First call the parent class's start method to create the DriverActor
super.start()
CoarseGrainedSchedulerBackend:
// TODO Create the DriverActor through the ActorSystem
driverActor = actorSystem.actorOf(
Props(new DriverActor(properties)), name = CoarseGrainedSchedulerBackend.ACTOR_NAME)
}
Back in SparkDeploySchedulerBackend:
After the parent's start() method has returned:
//TODO Prepare some parameters; they will be wrapped in an object and sent to the Master
val driverUrl = AkkaUtils.address(
AkkaUtils.protocol(actorSystem),
SparkEnv.driverActorSystemName,
conf.get("spark.driver.host"),
conf.get("spark.driver.port"),
CoarseGrainedSchedulerBackend.ACTOR_NAME)
//TODO This parameter is the implementation class of the future Executor process
val command = Command("org.apache.spark.executor.CoarseGrainedExecutorBackend",
args, sc.executorEnvs, classPathEntries ++ testingClassPath, libraryPathEntries, javaOpts)
val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")
//TODO Wrap the parameters in an ApplicationDescription
val appDesc = new ApplicationDescription(sc.appName, maxCores, sc.executorMemory, command,
appUIAddress, sc.eventLogDir, sc.eventLogCodec)
//TODO Create an AppClient, passing the ApplicationDescription in through its primary constructor
client = new AppClient(sc.env.actorSystem, masters, appDesc, this, conf)
//TODO Then call AppClient's start method, which creates a ClientActor for communicating with the Master
client.start()
waitForRegistration()
AppClient:
def start() {
// Just launch an actor; it will call back into the listener.
//TODO Create the ClientActor: primary constructor -> preStart -> receive
actor = actorSystem.actorOf(Props(new ClientActor))
}
//TODO ClientActor lifecycle method
override def preStart() {
context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
try {
//TODO The ClientActor registers with the Master
registerWithMaster()
} catch {
case e: Exception =>
logWarning("Failed to connect to master", e)
markDisconnected()
context.stop(self)
}
}
After successful registration, the Master replies to the ClientActor with a RegisteredApplication message
//TODO Registration-success message sent by the Master to the ClientActor
case RegisteredApplication(appId_, masterUrl) =>
appId = appId_
registered = true
changeMaster(masterUrl)
listener.connected(appId)
Master:
//TODO Application registration message sent by the ClientActor
case RegisterApplication(description) => {
if (state == RecoveryState.STANDBY) {
// ignore, don't send response
} else {
logInfo("Registering app " + description.name)
//TODO First store the application's info in memory
val app = createApplication(description, sender)
registerApplication(app)
logInfo("Registered app " + description.name + " with ID " + app.id)
//TODO Save it with the persistence engine
persistenceEngine.addApplication(app)
//TODO The Master sends the ClientActor a registration-success message
sender ! RegisteredApplication(app.id, masterUrl)
//TODO Important: the Master starts scheduling resources, i.e. deciding which Workers the Executors are launched on
schedule()
}
}
The schedule() method allocates resources: it decides how many cores and how much memory each Executor is started with.
The Master has two scheduling strategies: spread out (distribute Executors across as many Workers as possible) and consolidate (pack them onto as few Workers as possible); a simplified sketch of the spread-out loop is shown below.
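A rough, hypothetical model of the spread-out strategy (FreeWorker and the numbers below are made-up stand-ins, not the real WorkerInfo or schedule() code): hand out one core at a time, round-robin over the usable workers, so an application's cores land on as many workers as possible.
object SpreadOutSketch {
  case class FreeWorker(id: String, coresFree: Int)   // simplified stand-in for WorkerInfo

  def main(args: Array[String]): Unit = {
    val usableWorkers  = Array(FreeWorker("w1", 4), FreeWorker("w2", 4), FreeWorker("w3", 2))
    val coresRequested = 6
    val assigned  = new Array[Int](usableWorkers.length)   // cores assigned per worker
    var coresLeft = math.min(coresRequested, usableWorkers.map(_.coresFree).sum)
    var pos = 0
    while (coresLeft > 0) {
      // Round-robin: give one core to the current worker if it still has a free core
      if (usableWorkers(pos).coresFree - assigned(pos) > 0) {
        coresLeft -= 1
        assigned(pos) += 1
      }
      pos = (pos + 1) % usableWorkers.length
    }
    // With spread-out, the 6 cores end up as 2/2/2 instead of 4/2/0
    usableWorkers.zip(assigned).foreach { case (w, c) => println(s"${w.id} -> $c cores") }
  }
}
With the consolidate strategy, the loop would instead exhaust one worker's free cores before moving on to the next.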
//TODO The Master sends a message telling a Worker to launch an Executor
launchExecutor(usableWorkers(pos), exec)
def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc) {
logInfo("Launching executor " + exec.fullId + " on worker " + worker.id)
//TODO Record the resources used on this Worker
worker.addExecutor(exec)
//TODO The Master sends the Worker a message, passing the parameters via a case class, telling it to launch an Executor
worker.actor ! LaunchExecutor(masterUrl,
exec.application.id, exec.id, exec.application.desc, exec.cores, exec.memory)
//TODO The Master notifies the ClientActor that the Executor has been launched
exec.application.driver ! ExecutorAdded(
exec.id, worker.id, worker.hostPort, exec.cores, exec.memory)
}
Worker:
//TODO Message the Master sends to the Worker, telling it to launch an Executor.
//TODO LaunchExecutor is a case class that carries the information of the Executor to be launched
case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) => ...........
//TODO Create an ExecutorRunner, pass the parameters into it, then use it to launch the Executor
val manager = new ExecutorRunner(
appId,
execId,
appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
cores_,
memory_,
self,
workerId,
host,
webUi.boundPort,
publicAddress,
sparkHome,
executorDir,
akkaUrl,
conf,
appLocalDirs, ExecutorState.LOADING)
//TODO Put the executorId -> ExecutorRunner mapping into a Map
executors(appId + "/" + execId) = manager
//TODO Call ExecutorRunner's start method to launch the Executor as a Java subprocess
manager.start()
coresUsed += cores_
memoryUsed += memory_
master ! ExecutorStateChanged(appId, execId, manager.state, None, None)
}
ExecutorRunner:
def start() {
//TODO First create a thread object; the thread then launches a Java subprocess
workerThread = new Thread("ExecutorRunner for " + fullId) {
override def run() { fetchAndRunExecutor() }
}
//TODO Call the thread's start method -> which invokes its run method
workerThread.start()
// Shutdown hook that kills actors on shutdown.
shutdownHook = new Thread() {
override def run() {
killProcess(Some("Worker shutting down"))
}
}
Runtime.getRuntime.addShutdownHook(shutdownHook)
}
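fetchAndRunExecutor then builds the Executor's command line and spawns it as a child JVM. A minimal sketch of that idea using java.lang.ProcessBuilder (the classpath, JVM options and directory below are made-up placeholders, not the real ExecutorRunner code):
import java.io.File

// Hypothetical sketch: launch the Executor backend as a child JVM,
// roughly what fetchAndRunExecutor does after assembling the real command.
object LaunchExecutorSketch {
  def main(args: Array[String]): Unit = {
    val mainClass = "org.apache.spark.executor.CoarseGrainedExecutorBackend"
    val command   = Seq("java", "-cp", "/opt/spark/lib/*", "-Xmx1g", mainClass)  // assumed cp/opts
    val builder   = new ProcessBuilder(command: _*)
    builder.directory(new File("/tmp/executor-dir"))   // assumed working directory
    builder.redirectErrorStream(true)                  // merge stderr into stdout
    val process  = builder.start()                     // spawn the child process
    val exitCode = process.waitFor()                   // block until the Executor exits
    println(s"Executor process exited with code $exitCode")
  }
}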
CoarseGrainedExecutorBackend:
//TODO CoarseGrainedExecutorBackend lifecycle method
override def preStart() {
logInfo("Connecting to driver: " + driverUrl)
//TODO Establish a connection to the Driver
driver = context.actorSelection(driverUrl)
//TODO The Executor sends the DriverActor a message to register itself
driver ! RegisterExecutor(executorId, hostPort, cores, extractLogUrls)
context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
}
DriverActor:
def receiveWithLogging = {
//TODO Message the Executor sends to the DriverActor
case RegisterExecutor(executorId, hostPort, cores, logUrls) =>
Utils.checkHostPort(hostPort, "Host port expected " + hostPort)
if (executorDataMap.contains(executorId)) {
sender ! RegisterExecutorFailed("Duplicate executor ID: " + executorId)
} else {
logInfo("Registered executor: " + sender + " with ID " + executorId)
//TODO The DriverActor tells the Executor that registration succeeded
sender ! RegisteredExecutor
//TODO Check whether there are tasks to submit (DriverActor -> Executor)
makeOffers()
def makeOffers() {
//TODO Call launchTasks to submit Tasks to the Executors
launchTasks(scheduler.resourceOffers(executorDataMap.map { case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
}.toSeq))
}
def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
// Get the serializer
val ser = SparkEnv.get.closureSerializer.newInstance()
//TODO Serialize the Task
val serializedTask = ser.serialize(task)
....................
//TODO Send the serialized Task to the Executor
executorData.executorActor ! LaunchTask(new SerializableBuffer(serializedTask))
CoarseGrainedExecutorBackend:
//TODO Message the DriverActor sends to the Executor confirming that registration succeeded
case RegisteredExecutor =>
logInfo("Successfully registered with driver")
val (hostname, _) = Utils.parseHostPort(hostPort)
//TODO Create an Executor instance, which executes the actual task logic
executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false)
//TODO Entry point of the Executor subprocess
def main(args: Array[String]) { ........
//TODO Call the run method
run(driverUrl, executorId, hostname, cores, appId, workerUrl, userClassPath)
private def run(
driverUrl: String,
executorId: String,
hostname: String,
cores: Int,
appId: String,
workerUrl: Option[String],
userClassPath: Seq[URL]) { .......
val (fetcher, _) = AkkaUtils.createActorSystem(
"driverPropsFetcher",
hostname,
port,
executorConf,
new SecurityManager(executorConf))
//TODO Establish a connection to the Driver
val driver = fetcher.actorSelection(driverUrl)
//TODO The actor that CoarseGrainedExecutorBackend actually communicates through
env.actorSystem.actorOf(
Props(classOf[CoarseGrainedExecutorBackend],
driverUrl, executorId, sparkHostPort, cores, userClassPath, env),
name = "Executor")
workerUrl.foreach { url =>
env.actorSystem.actorOf(Props(classOf[WorkerWatcher], url), name = "WorkerWatcher")
}
env.actorSystem.awaitTermination()
After CoarseGrainedExecutorBackend has started, it waits for messages from the DriverActor
//TODO Message the DriverActor sends to the Executor, telling it to launch a task
case LaunchTask(data) =>
if (executor == null) {
logError("Received LaunchTask command but executor was null")
System.exit(1)
} else {
// Get the serializer
val ser = env.closureSerializer.newInstance()
//TODO Deserialize the Task
val taskDesc = ser.deserialize[TaskDescription](data.value)
logInfo("Got assigned task " + taskDesc.taskId)
//TODO Hand the deserialized Task to the thread pool
executor.launchTask(this, taskId = taskDesc.taskId, attemptNumber = taskDesc.attemptNumber,
taskDesc.name, taskDesc.serializedTask)
}
Executor:
//TODO Launch the Task
def launchTask(
context: ExecutorBackend,
taskId: Long,
attemptNumber: Int,
taskName: String,
serializedTask: ByteBuffer) {
//TODO Create a TaskRunner and wrap the Task's information in it
val tr = new TaskRunner(context, taskId = taskId, attemptNumber = attemptNumber, taskName,
serializedTask)
runningTasks.put(taskId, tr)
//TODO Submit the TaskRunner to the thread pool
threadPool.execute(tr)
}
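The pattern here is simply a Runnable handed to a thread pool. A self-contained sketch of that pattern (TaskRunnerLike and the pool below are illustrative, not the real Executor classes):
import java.util.concurrent.Executors

// Hypothetical sketch: each task is wrapped in a Runnable and submitted to a
// cached thread pool, so tasks on one Executor can run concurrently.
object TaskRunnerSketch {
  class TaskRunnerLike(taskId: Long) extends Runnable {
    override def run(): Unit =
      println(s"running task $taskId on thread ${Thread.currentThread().getName}")
  }

  def main(args: Array[String]): Unit = {
    val threadPool = Executors.newCachedThreadPool()
    (1L to 3L).foreach(id => threadPool.execute(new TaskRunnerLike(id)))
    threadPool.shutdown()
  }
}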
//TODO Run the Task's actual logic
override def run() {
......................................
//TODO Deserialize
task = ser.deserialize[Task[Any]](taskBytes, Thread.currentThread.getContextClassLoader)
..........................
//TODO Call the Task's run method
val value = task.run(taskAttemptId = taskId, attemptNumber = attemptNumber)
Task:
def run(taskAttemptId: Long, attemptNumber: Int): T = {
.........................
//TODO Execute the Task
runTask(context)