1.pom.xml
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.compat.version>2.11</scala.compat.version>
<!-- Encoding used at compile time -->
<maven.compiler.encoding>UTF-8</maven.compiler.encoding>
<!-- Encoding used when copying files (resources) -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<scala.version>2.11.8</scala.version>
<spark.version>2.2.0</spark.version>
</properties>
<!-- Repository hosting the Scala Maven plugin -->
<pluginRepositories>
<pluginRepository>
<id>scala-tools.org</id>
<name>Scala-tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</pluginRepository>
</pluginRepositories>
<dependencies>
<!-- Scala standard library (the Scala SDK) -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<scope>provided</scope>
</dependency>
<!-- provided: excluded from the packaged artifact because the cluster already supplies this jar -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<!-- Locations of the main and test source directories -->
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<!--Scala plugin so that Maven can compile, test and run Scala code-->
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-target:jvm-1.8</arg>
</args>
</configuration>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</reporting>
2.updateStateByKey
package sparkStream
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
object UpdateStateByKeyOperation {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setAppName(this.getClass.getName)
.setMaster("local[2]")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc,Seconds(5))
ssc.checkpoint("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\checkPointDir")
val dStream1: ReceiverInputDStream[String] = ssc.socketTextStream("192.168.226.88",6666)
val dStream2: DStream[String] = dStream1.flatMap(_.split(" "))
val dStream3: DStream[(String, Int)] = dStream2.map((_,1))
//Count for the current batch only (stateless)
val dStream4: DStream[(String, Int)] = dStream3.reduceByKey((v1, v2) => v1 + v2)
//Running count across all batches (stateful)
val updateStateByKeyDstream1 = dStream3.updateStateByKey(updateFunc1)
/**
* def updateStateByKey[S: ClassTag](
* updateFunc: (Iterator[(K, Seq[V], Option[S])]) => Iterator[(K, S)],
* partitioner: Partitioner, //controls how the state RDDs are partitioned
* rememberPartitioner: Boolean //true: the generated state RDDs remember this partitioner
* ): DStream[(K, S)]
*/
val updateStateByKeyDstream2 = dStream3.updateStateByKey(updateFunc2,new HashPartitioner(sc.defaultMinPartitions),false)
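//Note: only updateStateByKeyDstream1 is printed below; dStream4 and updateStateByKeyDstream2
//have no output operation, so Spark never actually computes them.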
updateStateByKeyDstream1.print()
ssc.start()
ssc.awaitTermination()
}
/**
* @Author: qwerdf@QAQ
* @Description: updateFunc: (Seq[V], Option[S]) => Option[S]
* Seq[V]    --> values of the same key in the current batch, here Seq(1,1,1,...)
* Option[S] --> accumulated value of the same key from previous batches; on the first
*               batch there is no record yet, hence the Option wrapper
* returns Option[S] --> the current batch count added to the historical count for the key
* @Date: 2020/8/9
**/
val updateFunc1: (Seq[Int], Option[Int]) => Option[Int] = (seq,buffer) => {
//Number of occurrences of this word in the current batch
val sumed = seq.sum
//Add the accumulated count from previous batches (if any) to the current batch count
val buffered: Int = buffer.getOrElse(0) + sumed
//Wrap the Int result in an Option
Option(buffered)
}
/**
* @Author: qwerdf@QAQ
* @Description: updateFunc: (Iterator[(K, Seq[V], Option[S])]) => Iterator[(K, S)]
* K         --> the key
* Seq[V]    --> values of the same key in the current batch, here Seq(1,1,1,...)
* Option[S] --> accumulated value of the same key from previous batches; on the first
*               batch there is no record yet, hence the Option wrapper
* returns Iterator[(K, S)], one (key, updated state) pair per key
* @Date: 2020/8/9
**/
val updateFunc2: Iterator[(String, Seq[Int], Option[Long])] => Iterator[(String, Long)] = iter => {
val res: Iterator[(String, Long)] = iter.map {
tuple => {
val word = tuple._1 //the word, used as the key
val sumed = tuple._2.sum.toLong
val resultcount = tuple._3.getOrElse(0L) + sumed
(word, resultcount)
}
}
res
}
//Even with a single parameter, the parentheses around the typed parameter are required
val updateFunc2_1 = (iter: Iterator[(String, Seq[Int], Option[Long])]) => {
}
}
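A quick, non-streaming sanity check of the two update functions. This is only a sketch: the object name UpdateFuncSanityCheck and the sample inputs are made up here for illustration, and it assumes the UpdateStateByKeyOperation object above compiles as shown.
package sparkStream
object UpdateFuncSanityCheck {
def main(args: Array[String]): Unit = {
//Seq(1, 1, 1) stands for three occurrences of a word in the current batch,
//Some(5) stands for a count of 5 restored from the checkpointed state.
println(UpdateStateByKeyOperation.updateFunc1(Seq(1, 1, 1), Some(5))) //Some(8)
//The iterator-based variant handles every key of a partition in one call.
val updated = UpdateStateByKeyOperation.updateFunc2(
Iterator(("spark", Seq(1, 1), Some(3L)), ("flink", Seq(1), None)))
println(updated.toList) //List((spark,5), (flink,1))
}
}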
3.reduceByKeyAndWindow (source from Spark 1.6)
/**
* Return a new DStream by applying `reduceByKey` over a sliding window. This is similar to
* `DStream.reduceByKey()` but applies it over a sliding window. Hash partitioning is used to
* generate the RDDs with Spark's default number of partitions.
* @param reduceFunc associative reduce function
* @param windowDuration width of the window; must be a multiple of this DStream's
* batching interval
* @param slideDuration sliding interval of the window (i.e., the interval after which
* the new DStream will generate RDDs); must be a multiple of this
* DStream's batching interval
*/
def reduceByKeyAndWindow(
reduceFunc: (V, V) => V,
windowDuration: Duration,//window length
slideDuration: Duration //slide interval
): DStream[(K, V)] = ssc.withScope {
reduceByKeyAndWindow(reduceFunc, windowDuration, slideDuration, defaultPartitioner())
}
3.1 Simple Demo
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
object WindowOperation {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setAppName(this.getClass.getName)
.setMaster("local[2]")
val ssc = new StreamingContext(conf,Milliseconds(5000))
ssc.checkpoint("F:\\ideaProjects\\sparkStreaming-api\\src\\main\\resources\\checkpoint")
val dStream1: ReceiverInputDStream[String] = ssc.socketTextStream("192.168.226.88",6666)
//val dStream2: DStream[(String, Int)] = dStream1.flatMap(_.split(" ")).map((_,1))
//transform applies RDD-level operations to each batch's RDD
val dStream2: DStream[(String, Int)] = dStream1.transform(rdd => rdd.flatMap(_.split(" ")).map((_,1)))
//Arguments: reduceFunc is the aggregation function,
// Milliseconds(5000*3) is the window length,
// Milliseconds(5000*2) is the slide interval, i.e. how often a new result is emitted.
//Both durations must be integer multiples of the batch interval.
val dStream3 = dStream2.reduceByKeyAndWindow(reduceFunc,Milliseconds(5000*3),Milliseconds(5000*2))
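//Sketch only (not part of the original demo): reduceByKeyAndWindow also has an overload
//that takes an inverse function. It updates each window incrementally -- values of the
//batch sliding out of the window are subtracted instead of re-reducing the whole window --
//and it is the variant that actually requires the checkpoint directory configured above.
//The names invReduceFunc and dStream3Incremental are introduced here for illustration.
val invReduceFunc: (Int, Int) => Int = (value1, value2) => value1 - value2
val dStream3Incremental = dStream2.reduceByKeyAndWindow(reduceFunc, invReduceFunc, Milliseconds(5000*3), Milliseconds(5000*2))
//dStream3Incremental.print() //uncomment to compare with dStream3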
dStream3.print()
ssc.start()
ssc.awaitTermination()
}
//The two Int parameters are two values of the same key to be combined
val reduceFunc: (Int, Int) => Int = {
(value1,value2) => value1 + value2
}
}
4.window (more general)
package sparkStream
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object WindowOperation {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("local[2]")
.getOrCreate()
val sc = spark.sparkContext
val ssc = new StreamingContext(sc, Seconds(5))
val inputDstream: ReceiverInputDStream[String] = ssc.socketTextStream("192.168.226.88", 6666)
/**
* Window operation:
* def window(windowDuration: Duration, slideDuration: Duration): DStream[T] = ssc.withScope {
* new WindowedDStream(this, windowDuration, slideDuration)
* }
* Notes:
* windowDuration -- window length -- 15s here
* slideDuration  -- slide interval -- 10s here
* Both must be integer multiples of the batch interval. When the slide interval is shorter
* than the window length, windows overlap and the same data is counted more than once.
* Normally each batch interval produces one RDD, and the DStream exposes that sequence of
* RDDs over time. With a 15s window over a 5s batch interval, each windowed RDD is built
* from the 3 batch RDDs that fall inside the window, so a windowed DStream covers one or
* more underlying RDDs; foreachRDD() can be used to work on the RDD produced per interval.
**/
val windowDstream = inputDstream.window(Seconds(15), Seconds(10))
//Process the data collected in the 15s window
val tupleDstream: DStream[((String, String), Int)] = windowDstream.map {
line => {
val arr = line.split(" ")
val provence = arr(0)
val city = arr(1)
val area = arr(2) //parsed but not used below
((provence, city), 1)
}
}
val reduceByKeyDstream = tupleDstream.reduceByKey(_ + _)
reduceByKeyDstream.print()
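//Sketch only (not in the original code): foreachRDD exposes the RDD that the windowed
//stream produces for each 10s slide, so ordinary RDD operations can be applied, e.g.
//printing the three most frequent (province, city) pairs in the current window.
reduceByKeyDstream.foreachRDD { rdd =>
rdd.sortBy(_._2, ascending = false).take(3).foreach(println)
}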
ssc.start() //start the streaming computation (and the receiver)
ssc.awaitTermination() //block the driver until the streaming context is stopped
}
}