Spark Streaming 之 JobScheduler

最新推荐文章于 2021-01-14 10:49:35 发布

原创最新推荐文章于 2021-01-14 10:49:35 发布 · 403 阅读

0 ·

CC 4.0 BY-SA版权

大数据专栏收录该内容

8 篇文章

订阅专栏

本文深入探讨Spark Streaming中JobScheduler的工作原理，包括事件处理、作业开始与完成的管理、错误报告处理，以及如何通过JobScheduler启动和管理作业执行。文章还详细介绍了JobScheduler中的关键方法如processEvent、handleJobStart、handleJobCompletion等，并解释了如何动态调整executor的数量。

1.processEvent方法

// Event hierarchy: every message handled by processEvent extends the sealed
// JobSchedulerEvent trait, so the set of event kinds is closed to this file.
private[scheduler] sealed trait JobSchedulerEvent
/** Signals that `job` began running at `startTime`; dispatched to handleJobStart. */
private[scheduler] case class JobStarted(job: Job, startTime: Long) extends JobSchedulerEvent
/** Signals that `job` finished at `completedTime`; dispatched to handleJobCompletion. */
private[scheduler] case class JobCompleted(job: Job, completedTime: Long) extends JobSchedulerEvent
/** Carries an error message and its cause; dispatched to handleError. */
private[scheduler] case class ErrorReported(msg: String, e: Throwable) extends JobSchedulerEvent

/**
 * Dispatches a single scheduler event to its handler.
 *
 * Anything thrown by a handler is caught here and forwarded to `reportError`
 * instead of propagating to the caller.
 *
 * @param event the event to process
 */
private def processEvent(event: JobSchedulerEvent): Unit = {
  try {
    event match {
      // A job began running: record its start time and notify listeners.
      case JobStarted(startedJob, startedAt) =>
        handleJobStart(startedJob, startedAt)
      // A job finished: record its end time and notify listeners.
      case JobCompleted(finishedJob, finishedAt) =>
        handleJobCompletion(finishedJob, finishedAt)
      // An error was reported. NOTE(review): the original author's note says
      // handleError wakes waiters via condition.signalAll; handleError is not
      // shown here, so confirm against its source.
      case ErrorReported(message, cause) =>
        handleError(message, cause)
    }
  } catch {
    // Deliberately broad: every Throwable raised by a handler is reported.
    case failure: Throwable =>
      reportError("Error in job scheduler", failure)
  }
}

1.1 handleJobStart方法

/**
 * Handles a JobStarted event: updates the owning job set, stamps the job's
 * start time, and posts the matching listener events.
 *
 * @param job       the job that started
 * @param startTime the start timestamp carried by the JobStarted event
 */
private def handleJobStart(job: Job, startTime: Long): Unit = {
  // Look up the job set this job belongs to, keyed by the job's batch time.
  val owningSet = jobSets.get(job.time)
  // Read hasStarted BEFORE notifying the job set below; presumably
  // handleJobStart flips it — TODO confirm in JobSet.
  val opensBatch = !owningSet.hasStarted
  owningSet.handleJobStart(job)
  // Only the first job of a job set announces that the batch has started.
  if (opensBatch) listenerBus.post(StreamingListenerBatchStarted(owningSet.toBatchInfo))

  // The start time is set before toOutputOperationInfo is evaluated.
  job.setStartTime(startTime)
  listenerBus.post(StreamingListenerOutputOperationStarted(job.toOutputOperationInfo))
  logInfo("Starting job " + job.id + " from job set of time " + owningSet.time)
}
1.2 handleJobCompletion方法
/**
 * Handles a JobCompleted event: updates the owning job set, stamps the job's
 * end time, posts listener events, and either reports the job's failure or,
 * once the whole job set has completed, removes the job set and logs delays.
 *
 * @param job           the job that finished
 * @param completedTime the completion timestamp carried by the JobCompleted event
 */
private def handleJobCompletion(job: Job, completedTime: Long): Unit = {
  val jobSet = jobSets.get(job.time)
  jobSet.handleJobCompletion(job)
  // The end time is set before toOutputOperationInfo is evaluated.
  job.setEndTime(completedTime)
  listenerBus.post(StreamingListenerOutputOperationCompleted(job.toOutputOperationInfo))
  logInfo("Finished job " + job.id + " from job set of time " + jobSet.time)
  if (jobSet.hasCompleted) {
    listenerBus.post(StreamingListenerBatchCompleted(jobSet.toBatchInfo))
  }
  job.result match {
    case Failure(e) =>
      // A failed job is reported; its job set is not removed on this path.
      reportError("Error running job " + job, e)
    case _ =>
      if (jobSet.hasCompleted) {
        // Every job of the set has finished: drop the set from the map.
        jobSets.remove(jobSet.time)
        // Notify the generator; the original note says this schedules
        // metadata clearing for the batch — TODO confirm in JobGenerator.
        jobGenerator.onBatchCompletion(jobSet.time)
        // Delays are divided by 1000.0 and printed with an "s" unit.
        logInfo("Total delay: %.3f s for time %s (execution: %.3f s)".format(
          jobSet.totalDelay / 1000.0, jobSet.time.toString,
          jobSet.processingDelay / 1000.0
        ))
      }
  }
}

2. start方法

/**
 * Starts the scheduler and the components it drives: the event loop, the
 * listener bus, the receiver tracker, the input-info tracker, the optional
 * executor allocation manager, and the job generator.
 *
 * Runs under `synchronized`; it returns immediately when `eventLoop` is
 * already set, so repeated calls do not start anything twice.
 */
def start(): Unit = synchronized {
  if (eventLoop != null) return // scheduler has already been started

  logDebug("Starting JobScheduler")
  // Anonymous EventLoop subclass: received events go to processEvent and
  // loop errors go to reportError.
  // NOTE(review): per the original write-up, EventLoop owns a background
  // thread that takes events off a blocking queue and calls onReceive;
  // EventLoop is not shown here — confirm against its source.
  eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
    override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)

    override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
  }
  // NOTE(review): the original write-up says start() runs an onStart hook
  // before launching the background thread — unverified here.
  eventLoop.start()

  // attach rate controllers of input streams to receive batch completion updates
  for {
    inputDStream <- ssc.graph.getInputStreams
    rateController <- inputDStream.rateController
  } {
    // NOTE(review): the original write-up says listeners are held in a
    // copy-on-write collection — confirm in the listener bus implementation.
    ssc.addStreamingListener(rateController)
  }

  listenerBus.start()
  receiverTracker = new ReceiverTracker(ssc)
  inputInfoTracker = new InputInfoTracker(ssc)

  // Use the scheduler backend as the allocation client only when it implements
  // ExecutorAllocationClient; otherwise pass null to createIfEnabled below.
  // The typed pattern already narrows `b`, so no cast is needed.
  val executorAllocClient: ExecutorAllocationClient = ssc.sparkContext.schedulerBackend match {
    case b: ExecutorAllocationClient => b
    case _ => null
  }

  // Notes carried over from the original write-up. ExecutorAllocationManager
  // is not shown in this file, so treat them as unverified — TODO confirm:
  //  - It manages the executors allocated to the StreamingContext, requesting
  //    or killing executors dynamically.
  //  - ratio = averageBatchProcTime / batchDurationMs, where
  //    averageBatchProcTime = batchProcTimeSum / batchProcTimeCount.
  //  - ratio >= scalingUpRatio   => request max(round(ratio), 1) new executors.
  //  - ratio <= scalingDownRatio => kill an executor.
  //  - spark.streaming.dynamicAllocation.scalingUpRatio   (default 0.9)
  //  - spark.streaming.dynamicAllocation.scalingDownRatio (default 0.3)
  //  - spark.streaming.dynamicAllocation.minExecutors
  //      (default max(1, receiverTracker.numReceivers))
  //  - spark.streaming.dynamicAllocation.maxExecutors (default Integer.MAX_VALUE)
  //  - spark.streaming.dynamicAllocation.scalingInterval (default 60 seconds):
  //      period of the recurring timer that calls manageAllocation().
  //  - To enable it, set spark.streaming.dynamicAllocation.enabled = true and
  //    leave spark.executor.instances unset.
  executorAllocationManager = ExecutorAllocationManager.createIfEnabled(
    executorAllocClient,
    receiverTracker,
    ssc.conf,
    ssc.graph.batchDuration.milliseconds,
    clock)
  executorAllocationManager.foreach(ssc.addStreamingListener)

  // NOTE(review): per the original write-up, receiverTracker.start() sets up
  // a "ReceiverTracker" RPC endpoint and, unless skipReceiverLaunch is set,
  // launches the receivers on the worker nodes — unverified here.
  receiverTracker.start()
  // NOTE(review): the original write-up says JobGenerator is driven by an
  // EventLoop in the same way — unverified here.
  jobGenerator.start()
  // A no-op when createIfEnabled returned no manager.
  executorAllocationManager.foreach(_.start())
  logInfo("Started JobScheduler")
}

3.submitJobSet方法

// listenerBus.post(StreamingListenerBatchSubmitted(jobSet.toBatchInfo))
// jobSets.put(jobSet.time, jobSet)
// jobSet.jobs.foreach(job => jobExecutor.execute(new JobHandler(job)))
// logInfo("Added jobs for time " + jobSet.time)

jobExecutor其实就是一个线程池而已

//private val jobExecutor =ThreadUtils.newDaemonFixedThreadPool(numConcurrentJobs, "streaming-job-executor")

// private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1)
// 即线程池大小由 spark.streaming.concurrentJobs 决定，默认并发作业数为 1

4.JobHandler这个线程

/**
 * Runnable that executes one streaming job and brackets it with JobStarted
 * and JobCompleted events posted to the scheduler's event loop.
 *
 * @param job the job to run
 */
private class JobHandler(job: Job) extends Runnable with Logging {
  import JobScheduler._

  /**
   * Installs the streaming context's saved local properties, tags the Spark
   * job with batch metadata, runs the job, and finally restores the local
   * properties that were in effect before.
   */
  def run(): Unit = {
    // Snapshot of the current local properties, restored in `finally`.
    val previousProps = ssc.sparkContext.getLocalProperties
    try {
      // Work on a clone so the saved properties are not mutated by this run.
      ssc.sparkContext.setLocalProperties(SerializationUtils.clone(ssc.savedProperties.get()))

      // Human-readable job description linking back to the batch page.
      val formattedTime = UIUtils.formatBatchTime(
        job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false)
      val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}"
      val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]"
      ssc.sc.setJobDescription(
        s"""Streaming job from <a href="$batchUrl">$batchLinkText</a>""")

      // Local property keys under which the batch time and output-op id are set.
      val BATCH_TIME_PROPERTY_KEY = "spark.streaming.internal.batchTime"
      val OUTPUT_OP_ID_PROPERTY_KEY = "spark.streaming.internal.outputOpId"
      ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString)
      ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString)
      // NOTE(review): the original note relates this flag to truncating
      // checkpoint lineage — confirm against RDD.CHECKPOINT_ALL_MARKED_ANCESTORS.
      ssc.sparkContext.setLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS, "true")

      // Read the field once into a local so the null check and the post use
      // the same reference. A null event loop means the JobScheduler has been
      // stopped, in which case the job is not run.
      val loopBeforeRun = eventLoop
      if (loopBeforeRun != null) {
        // Announce the start, then execute the job.
        loopBeforeRun.post(JobStarted(job, clock.getTimeMillis()))
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          job.run()
        }
        // Re-read the field after the job ran; post completion only if the
        // event loop is still set.
        val loopAfterRun = eventLoop
        if (loopAfterRun != null) {
          loopAfterRun.post(JobCompleted(job, clock.getTimeMillis()))
        }
      }
    } finally {
      ssc.sparkContext.setLocalProperties(previousProps)
    }
  }
}