1.start
StreamingContext is the entry point of Spark Streaming and provides a large number of methods for creating DStreams from different data sources.

def start(): Unit = synchronized {
  state match {
    // INITIALIZED: the context has been created but not yet started;
    // input streams, transformations and output operations can still be defined.
    case INITIALIZED =>
      // Record the call site of the user code. startSite is an AtomicReference,
      // so the update is thread-safe (CAS-based). CAS is vulnerable to the ABA
      // problem, but that cannot happen here: stopping and restarting always
      // produces a different context.
      startSite.set(DStream.getCreationSite())
      // A global lock object that guards activation: at any moment only one
      // StreamingContext may be going through startup.
      StreamingContext.ACTIVATION_LOCK.synchronized {
        // Throw immediately if another context is already running
        StreamingContext.assertNoOtherContextIsActive()
        try {
          validate()
          // Start the streaming machinery in a new thread
          ThreadUtils.runInNewThread("streaming-start") {
            // Override the call site used for operations and RDDs created here
            sparkContext.setCallSite(startSite.get)
            // Clear the thread's job group id and description
            // (spark.job.description, spark.jobGroup.id, spark.job.interruptOnCancel)
            sparkContext.clearJobGroup()
            // Do not interrupt the underlying thread when a job is killed
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            // Save a snapshot of the environment. localProperties is an
            // InheritableThreadLocal that overrides childValue to make a clone,
            // such that changes in the parent properties aren't reflected in
            // those of the children threads.
            savedProperties.set(SerializationUtils.clone(sparkContext.localProperties.get()))
            // Start the job scheduler
            scheduler.start()
          }
          // Mark the context as ACTIVE
          state = StreamingContextState.ACTIVE
        } catch {
          case NonFatal(e) =>
            logError("Error starting the context, marking it as stopped", e)
            scheduler.stop(false)
            state = StreamingContextState.STOPPED
            throw e
        }
        // Register this context as the active StreamingContext
        StreamingContext.setActiveContext(this)
      }
      logDebug("Adding shutdown hook") // force eager creation of logger
      // Register a shutdown hook with a priority. How? The core code:
      /** private class SparkShutdownHook(private val priority: Int, hook: () => Unit)
        *   extends Comparable[SparkShutdownHook] {
        *   override def compareTo(other: SparkShutdownHook): Int = {
        *     other.priority - priority
        *   }
        *   def run(): Unit = hook()
        * } */
      shutdownHookRef = ShutdownHookManager.addShutdownHook(
        StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
      // Registering Streaming Metrics at the start of the StreamingContext
      assert(env.metricsSystem != null)
      // Register the streaming metrics source
      env.metricsSystem.registerSource(streamingSource)
      uiTab.foreach(_.attach())
      logInfo("StreamingContext started")
    // ACTIVE: the context is running and has not been stopped; no new input
    // streams, transformations or output operations can be created.
    case ACTIVE =>
      logWarning("StreamingContext has already been started")
    case STOPPED =>
      throw new IllegalStateException("StreamingContext has already been stopped")
  }
}
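The savedProperties step above relies on sparkContext.localProperties being an InheritableThreadLocal whose childValue is overridden to clone the parent's value. Below is a minimal standalone sketch of that pattern using plain JDK classes (not Spark code; the property key is only for illustration):

import java.util.Properties

object InheritableCloneDemo {
  // Each child thread receives a clone taken at thread-construction time,
  // so later mutations by the parent are invisible to the child.
  private val props = new InheritableThreadLocal[Properties] {
    override def initialValue(): Properties = new Properties()
    override def childValue(parent: Properties): Properties =
      parent.clone().asInstanceOf[Properties]
  }

  def main(args: Array[String]): Unit = {
    props.get().setProperty("spark.job.description", "set-before-child")
    val child = new Thread(() =>
      // Prints "set-before-child": the clone was taken when the Thread was constructed
      println("child sees: " + props.get().getProperty("spark.job.description")))
    props.get().setProperty("spark.job.description", "set-after-child")
    child.start()
    child.join()
  }
}

Spark itself uses SerializationUtils.clone for a deep copy; a shallow Properties clone is enough to show the isolation the comment describes.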
2.stop
def stop(stopSparkContext: Boolean, stopGracefully: Boolean): Unit = {
  var shutdownHookRefToRemove: AnyRef = null
  if (LiveListenerBus.withinListenerThread.value) {
    throw new SparkException(
      s"Cannot stop StreamingContext within listener thread of ${LiveListenerBus.name}")
  }
  synchronized {
    // The state should always be Stopped after calling `stop()`, even if we haven't started yet
    state match {
      case INITIALIZED =>
        logWarning("StreamingContext has not been started yet")
        state = STOPPED
      case STOPPED =>
        logWarning("StreamingContext has already been stopped")
        state = STOPPED
      case ACTIVE =>
        Utils.tryLogNonFatalError {
          scheduler.stop(stopGracefully)
        }
        Utils.tryLogNonFatalError {
          env.metricsSystem.removeSource(streamingSource)
        }
        Utils.tryLogNonFatalError {
          uiTab.foreach(_.detach())
        }
        // Clear the active context
        StreamingContext.setActiveContext(null)
        Utils.tryLogNonFatalError {
          waiter.notifyStop()
        }
        if (shutdownHookRef != null) {
          shutdownHookRefToRemove = shutdownHookRef
          shutdownHookRef = null
        }
        logInfo("StreamingContext stopped successfully")
        state = STOPPED
    }
  }
  if (shutdownHookRefToRemove != null) {
    // Remove the hook from the priority queue:
    // private val hooks = new PriorityQueue[SparkShutdownHook]()
    ShutdownHookManager.removeShutdownHook(shutdownHookRefToRemove)
  }
  // Even if we have already stopped, we still need to attempt to stop the SparkContext because
  // a user might stop(stopSparkContext = false) and then call stop(stopSparkContext = true).
  // Finally stop the SparkContext if asked to
  if (stopSparkContext) sc.stop()
}
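Each cleanup step in the ACTIVE branch is wrapped in Utils.tryLogNonFatalError, so a failure in one step does not prevent the remaining steps from running. A minimal sketch of that helper (logging simplified; Spark's own version lives in its private Utils object):

import scala.util.control.NonFatal

def tryLogNonFatalError(block: => Unit): Unit =
  try {
    block
  } catch {
    // Swallow and log non-fatal errors so the best-effort shutdown can continue
    case NonFatal(e) => System.err.println(s"Non-fatal error during cleanup: $e")
  }

tryLogNonFatalError { sys.error("scheduler failed to stop") } // logged, not rethrown
tryLogNonFatalError { println("next cleanup step still runs") }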
3.textFileStream
def textFileStream(directory: String): DStream[String] = withNamedScope("text file stream") {
  fileStream[LongWritable, Text, TextInputFormat](directory).map(_._2.toString)
}
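So textFileStream is just sugar over fileStream with Hadoop's TextInputFormat: records arrive as (LongWritable byte offset, Text line) pairs, and only the line text is kept. A usage sketch (directory path and batch interval are illustrative):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val ssc = new StreamingContext(
  new SparkConf().setMaster("local[2]").setAppName("textFileStreamDemo"), Seconds(10))

val viaSugar = ssc.textFileStream("/tmp/stream-in")
// Equivalent spelled out: drop the byte offset, keep the line text
val viaFileStream = ssc
  .fileStream[LongWritable, Text, TextInputFormat]("/tmp/stream-in")
  .map(_._2.toString)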
4.stopOnShutdown
private def stopOnShutdown(): Unit = {
  val stopGracefully = conf.getBoolean("spark.streaming.stopGracefullyOnShutdown", false)
  logInfo(s"Invoking stop(stopGracefully=$stopGracefully) from shutdown hook")
  // Stop the StreamingContext itself, but deliberately leave the SparkContext
  // running (stopSparkContext = false); it is shut down by its own hook.
  stop(stopSparkContext = false, stopGracefully = stopGracefully)
}
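Graceful shutdown from the hook is off by default and is opted into through configuration, for example (app name illustrative):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("gracefulShutdownDemo")
  // Read by stopOnShutdown above; defaults to false
  .set("spark.streaming.stopGracefullyOnShutdown", "true")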
5.awaitTermination
def awaitTermination() {
  waiter.waitForStopOrError()
}

def waitForStopOrError(timeout: Long = -1): Boolean = {
  // A ReentrantLock plus Condition: the classic "block until some state is set" pattern
  lock.lock()
  try {
    if (timeout < 0) {
      // No timeout: wait until the context stops or an error is reported
      while (!stopped && error == null) {
        condition.await()
      }
    } else {
      var nanos = TimeUnit.MILLISECONDS.toNanos(timeout)
      while (!stopped && error == null && nanos > 0) {
        // awaitNanos(nanosTimeout) waits at most nanosTimeout nanoseconds for a signal.
        // If signal()/signalAll() arrives within the deadline it returns the remaining
        // time (nanosTimeout minus the time already waited); if another thread interrupts
        // this one it throws InterruptedException and clears the interrupt status; if no
        // notification arrives before the deadline it returns zero or a negative value.
        nanos = condition.awaitNanos(nanos)
      }
    }
    // If already had error, then throw it
    if (error != null) throw error
    // already stopped or timeout
    stopped
  } finally {
    lock.unlock()
  }
}
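The waiter is a small ReentrantLock/Condition state machine: callers block on the condition until notifyStop() or notifyError() flips the state (in Spark this is the private ContextWaiter class). A standalone sketch of the pattern, with assumed names:

import java.util.concurrent.TimeUnit
import java.util.concurrent.locks.ReentrantLock

class SimpleWaiter {
  private val lock = new ReentrantLock()
  private val condition = lock.newCondition()
  private var stopped = false
  private var error: Throwable = null

  def notifyStop(): Unit = {
    lock.lock()
    try { stopped = true; condition.signalAll() } finally lock.unlock()
  }

  def notifyError(e: Throwable): Unit = {
    lock.lock()
    try { error = e; condition.signalAll() } finally lock.unlock()
  }

  // Returns true once stopped, false on timeout; rethrows a reported error
  def waitForStopOrError(timeoutMs: Long = -1): Boolean = {
    lock.lock()
    try {
      if (timeoutMs < 0) {
        while (!stopped && error == null) condition.await()
      } else {
        var nanos = TimeUnit.MILLISECONDS.toNanos(timeoutMs)
        while (!stopped && error == null && nanos > 0) {
          nanos = condition.awaitNanos(nanos)
        }
      }
      if (error != null) throw error
      stopped
    } finally {
      lock.unlock()
    }
  }
}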