Spark Streaming: StreamingContext

This post walks through the core of Spark Streaming's StreamingContext: how the start method brings a context up, how the stop method tears it down, how textFileStream creates a file-based DStream, and how the StreamingContext lifecycle is managed overall. Reading the source reveals the key mechanics behind Spark Streaming's real-time processing.


1.start

StreamingContext is the entry-point class of Spark Streaming; it provides a large number of methods for creating DStreams from different data sources. Its start method looks like this:
def start(): Unit = synchronized {
  state match {
    // INITIALIZED: the context has been created but not yet started.
    // Input streams, transformations, and output operations may still be created.
    case INITIALIZED =>
      // Record the call site of the user code. startSite is an AtomicReference,
      // which gives lock-free, CAS-style thread safety. The classic ABA problem
      // cannot bite here: once a context is stopped, restarting means creating
      // a brand-new context, so the reference is never recycled.
      startSite.set(DStream.getCreationSite())
      // ACTIVATION_LOCK guards activation: only one thread at a time may
      // attempt to activate a context.
      StreamingContext.ACTIVATION_LOCK.synchronized {
        // Throw if another StreamingContext is already running.
        StreamingContext.assertNoOtherContextIsActive()
        try {
          validate()
          // Start the streaming machinery in a new thread.
          ThreadUtils.runInNewThread("streaming-start") {
            // Override the call site used for operations and RDDs.
            sparkContext.setCallSite(startSite.get)
            // Clear the thread's job group and description
            // (spark.job.description, spark.jobGroup.id, spark.job.interruptOnCancel).
            sparkContext.clearJobGroup()
            // Do not interrupt the job's executor threads on cancel.
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            // Snapshot the local properties. localProperties is an InheritableThreadLocal
            // whose childValue override makes a clone, so that changes in the parent's
            // properties are not reflected in those of the child threads.
            savedProperties.set(SerializationUtils.clone(sparkContext.localProperties.get()))
            // Start the JobScheduler.
            scheduler.start()
          }
          // Mark the context as ACTIVE.
          state = StreamingContextState.ACTIVE
        } catch {
          case NonFatal(e) =>
            logError("Error starting the context, marking it as stopped", e)
            scheduler.stop(false)
            state = StreamingContextState.STOPPED
            throw e
        }
        // Publish this context as the single active one.
        StreamingContext.setActiveContext(this)
      }
      logDebug("Adding shutdown hook") // force eager creation of logger
      // Register a prioritized shutdown hook. The core of SparkShutdownHook:
      //
      //   private class SparkShutdownHook(private val priority: Int, hook: () => Unit)
      //     extends Comparable[SparkShutdownHook] {
      //     override def compareTo(other: SparkShutdownHook): Int = {
      //       other.priority - priority
      //     }
      //     def run(): Unit = hook()
      //   }
      shutdownHookRef = ShutdownHookManager.addShutdownHook(
        StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
      // Registering Streaming Metrics at the start of the StreamingContext
      assert(env.metricsSystem != null)
      env.metricsSystem.registerSource(streamingSource)
      uiTab.foreach(_.attach())
      logInfo("StreamingContext started")
    // ACTIVE: the context is running and has not been stopped.
    // Input streams, transformations, and output operations can no longer be created.
    case ACTIVE =>
      logWarning("StreamingContext has already been started")
    case STOPPED =>
      throw new IllegalStateException("StreamingContext has already been stopped")
  }
}
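At its core, start is a guarded state transition: under a process-wide activation lock, exactly one context may move from INITIALIZED to ACTIVE, and every other caller either gets a warning or an exception. A minimal Java sketch of that pattern (class and field names are illustrative, not Spark's):

```java
import java.util.concurrent.atomic.AtomicReference;

// Sketch of StreamingContext's lifecycle guard: a per-instance state machine
// plus a process-wide "active context" slot. Names are illustrative.
public class LifecycleDemo {
    enum State { INITIALIZED, ACTIVE, STOPPED }

    // Process-wide slot holding the single active instance (like setActiveContext).
    static final AtomicReference<LifecycleDemo> ACTIVE_CONTEXT = new AtomicReference<>();
    static final Object ACTIVATION_LOCK = new Object();

    State state = State.INITIALIZED;

    synchronized void start() {
        switch (state) {
            case INITIALIZED:
                synchronized (ACTIVATION_LOCK) {
                    // assertNoOtherContextIsActive analogue
                    if (ACTIVE_CONTEXT.get() != null)
                        throw new IllegalStateException("another context is active");
                    state = State.ACTIVE;   // scheduler.start() would happen here
                    ACTIVE_CONTEXT.set(this);
                }
                break;
            case ACTIVE:
                // already started: just warn, like logWarning(...)
                break;
            case STOPPED:
                throw new IllegalStateException("context already stopped");
        }
    }

    synchronized void stop() {
        state = State.STOPPED;
        ACTIVE_CONTEXT.set(null);
    }
}
```

Note how both locks are needed: the instance monitor serializes start/stop on one context, while the activation lock serializes activation across all contexts in the JVM.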

 

2.stop

def stop(stopSparkContext: Boolean, stopGracefully: Boolean): Unit = {
  var shutdownHookRefToRemove: AnyRef = null
  if (LiveListenerBus.withinListenerThread.value) {
    throw new SparkException(
      s"Cannot stop StreamingContext within listener thread of ${LiveListenerBus.name}")
  }
  synchronized {
    // The state should always be Stopped after calling `stop()`, even if we haven't started yet
    state match {
      case INITIALIZED =>
        logWarning("StreamingContext has not been started yet")
        state = STOPPED
      case STOPPED =>
        logWarning("StreamingContext has already been stopped")
        state = STOPPED
      case ACTIVE =>
       
        Utils.tryLogNonFatalError {
          scheduler.stop(stopGracefully)
        }
        
        Utils.tryLogNonFatalError {
          env.metricsSystem.removeSource(streamingSource)
        }
        Utils.tryLogNonFatalError {
          uiTab.foreach(_.detach())
        }
        // Clear the single active-context slot.
        StreamingContext.setActiveContext(null)
        Utils.tryLogNonFatalError {
          waiter.notifyStop()
        }
        if (shutdownHookRef != null) {
          shutdownHookRefToRemove = shutdownHookRef
          shutdownHookRef = null
        }
        logInfo("StreamingContext stopped successfully")
        state = STOPPED
    }
  }
  if (shutdownHookRefToRemove != null) {
    // ShutdownHookManager keeps its hooks in
    //   private val hooks = new PriorityQueue[SparkShutdownHook]()
    // so this removes our hook from that priority queue.
    ShutdownHookManager.removeShutdownHook(shutdownHookRefToRemove)
  }
  // Even if we have already stopped, we still need to attempt to stop the SparkContext because
  // a user might stop(stopSparkContext = false) and then call stop(stopSparkContext = true).
  if (stopSparkContext) sc.stop()
}
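Notice that each teardown step is wrapped in Utils.tryLogNonFatalError, so one failing step (say, detaching the UI tab) cannot prevent the remaining steps from running, and the state still ends up STOPPED. A plain-Java sketch of that best-effort cleanup pattern (the helper name mirrors Spark's, but this is not Spark's code):

```java
import java.util.ArrayList;
import java.util.List;

// Sketch of the "try each cleanup step, log non-fatal errors, keep going"
// pattern that stop() uses via Utils.tryLogNonFatalError.
public class CleanupDemo {
    final List<String> log = new ArrayList<>();

    void tryLogNonFatalError(Runnable step) {
        try {
            step.run();
        } catch (RuntimeException e) {      // non-fatal: record it and continue
            log.add("error: " + e.getMessage());
        }
    }

    void stopAll() {
        tryLogNonFatalError(() -> log.add("scheduler stopped"));
        tryLogNonFatalError(() -> { throw new RuntimeException("metrics failed"); });
        tryLogNonFatalError(() -> log.add("ui detached")); // still runs after a failure
    }
}
```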

3.textFileStream

def textFileStream(directory: String): DStream[String] = withNamedScope("text file stream") {
  fileStream[LongWritable, Text, TextInputFormat](directory).map(_._2.toString)
}
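So textFileStream is just fileStream specialized to TextInputFormat, keeping only the value half of each (LongWritable, Text) record; the key is the line's byte offset in the file. The .map(_._2.toString) projection, as a plain-Java analogue over key-value pairs (illustrative only, no Hadoop types):

```java
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

// Plain-Java analogue of textFileStream's map(_._2.toString): each record from
// TextInputFormat is a (byte offset, line) pair, and only the line text is kept.
public class ProjectValues {
    static List<String> linesOf(List<Map.Entry<Long, String>> records) {
        return records.stream()
                .map(Map.Entry::getValue)   // drop the offset key, keep the text
                .collect(Collectors.toList());
    }
}
```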

4.stopOnShutdown

private def stopOnShutdown(): Unit = {
  val stopGracefully = conf.getBoolean("spark.streaming.stopGracefullyOnShutdown", false)
  logInfo(s"Invoking stop(stopGracefully=$stopGracefully) from shutdown hook")
  // Stop this StreamingContext (gracefully if configured), but not the SparkContext;
  // the SparkContext has its own shutdown handling.
  stop(stopSparkContext = false, stopGracefully = stopGracefully)
}
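This is the function registered as the shutdown hook in start. Recall from the SparkShutdownHook snippet that compareTo returns other.priority - priority, which inverts the natural ordering and turns ShutdownHookManager's PriorityQueue into a max-heap: higher-priority hooks run first, so the streaming hook can fire before lower-priority cleanup. A Java sketch of that ordering (priorities here are arbitrary example values, not Spark's constants):

```java
import java.util.PriorityQueue;

// Sketch of ShutdownHookManager's ordering: hooks compare as
// other.priority - priority, so the PriorityQueue polls the HIGHEST
// priority first (a max-heap), matching SparkShutdownHook.compareTo.
public class HookDemo {
    static class Hook implements Comparable<Hook> {
        final int priority;
        final Runnable body;
        Hook(int priority, Runnable body) { this.priority = priority; this.body = body; }
        @Override public int compareTo(Hook other) { return other.priority - priority; }
    }

    final PriorityQueue<Hook> hooks = new PriorityQueue<>();

    void add(int priority, Runnable body) { hooks.add(new Hook(priority, body)); }

    // Drain the queue as the JVM shutdown hook would: highest priority first.
    void runAll() {
        Hook h;
        while ((h = hooks.poll()) != null) h.body.run();
    }
}
```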

5.awaitTermination

def awaitTermination() {
  waiter.waitForStopOrError()
}
def waitForStopOrError(timeout: Long = -1): Boolean = {
  // A ReentrantLock plus a Condition: block until the context is stopped
  // or an error is reported, optionally with a timeout.
  lock.lock()
  try {
    if (timeout < 0) {
      while (!stopped && error == null) {
        condition.await()
      }
    } else {
      var nanos = TimeUnit.MILLISECONDS.toNanos(timeout)
      while (!stopped && error == null && nanos > 0) {
        // awaitNanos waits at most `nanos` nanoseconds for a signal. If signal()
        // or signalAll() arrives in time, it returns the remaining wait time;
        // if another thread interrupts this one, it throws InterruptedException
        // and clears the interrupt status; on timeout it returns zero or a
        // negative value.
        nanos = condition.awaitNanos(nanos)
      }
    }
    }
    // If already had error, then throw it
    if (error != null) throw error
    // already stopped or timeout
    stopped
  } finally {
    lock.unlock()
  }
}
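This is the standard ReentrantLock/Condition idiom: loop over the predicate to survive spurious wakeups, and on the timed path carry the remaining time forward because awaitNanos returns (roughly) the time left. A self-contained Java sketch of the same waiter, reduced to the stop flag only (no error channel):

```java
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

// Sketch of ContextWaiter's timed wait: loop on the predicate and keep feeding
// the remaining nanos back into awaitNanos until stopped or out of time.
public class Waiter {
    private final ReentrantLock lock = new ReentrantLock();
    private final Condition condition = lock.newCondition();
    private boolean stopped = false;

    public void notifyStop() {
        lock.lock();
        try {
            stopped = true;
            condition.signalAll();   // wake every waiter, like waiter.notifyStop()
        } finally {
            lock.unlock();
        }
    }

    // Returns true if stopped, false on timeout; a negative timeout waits forever.
    public boolean waitForStop(long timeoutMillis) throws InterruptedException {
        lock.lock();
        try {
            if (timeoutMillis < 0) {
                while (!stopped) condition.await();
            } else {
                long nanos = TimeUnit.MILLISECONDS.toNanos(timeoutMillis);
                while (!stopped && nanos > 0) {
                    nanos = condition.awaitNanos(nanos); // remaining time, <= 0 on timeout
                }
            }
            return stopped;
        } finally {
            lock.unlock();
        }
    }
}
```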

 
