1.pom.xml
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.compat.version>2.11</scala.compat.version>
<!-- Encoding used at compile time -->
<maven.compiler.encoding>UTF-8</maven.compiler.encoding>
<!-- Encoding used when copying files (resources) -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<scala.version>2.11.8</scala.version>
<spark.version>2.2.0</spark.version>
</properties>
<!-- Repository hosting the Scala Maven plugin -->
<pluginRepositories>
<pluginRepository>
<id>scala-tools.org</id>
<name>Scala-tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</pluginRepository>
</pluginRepositories>
<dependencies>
<!-- Scala standard library (the Scala SDK) -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<scope>provided</scope>
</dependency>
<!-- provided: excluded from the packaged artifact because the cluster already supplies this jar -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<!-- Locations of the main and test source directories -->
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<!--Scala plugin so that Maven can compile, test and run Scala code-->
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-target:jvm-1.8</arg>
</args>
</configuration>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</reporting>
2.updateStateByKey
package sparkStream
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
object UpdateStateByKeyOperation {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setAppName(this.getClass.getName)
.setMaster("local[2]")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc,Seconds(5))
ssc.checkpoint("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\checkPointDir")
val dStream1: ReceiverInputDStream[String] = ssc.socketTextStream("192.168.226.88",6666)
val dStream2: DStream[String] = dStream1.flatMap(_.split(" "))
val dStream3: DStream[(String, Int)] = dStream2.map((_,1))
//Count for the current batch only (stateless)
val dStream4: DStream[(String, Int)] = dStream3.reduceByKey((v1, v2) => v1 + v2)
//Running count across all batches (stateful)
val updateStateByKeyDstream1 = dStream3.updateStateByKey(updateFunc1)
/**
* def updateStateByKey[S: ClassTag](
* updateFunc: (Iterator[(K, Seq[V], Option[S])]) => Iterator[(K, S)],
* partitioner: Partitioner, //controls how the state RDDs are partitioned
* rememberPartitioner: Boolean //true: the generated state RDDs remember this partitioner
* ): DStream[(K, S)]
*/
val updateStateByKeyDstream2 = dStream3.updateStateByKey(updateFunc2,new HashPartitioner(sc.defaultMinPartitions),false)
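//Note: only updateStateByKeyDstream1 is printed below; dStream4 and updateStateByKeyDstream2
//have no output operation, so Spark never actually computes them.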
updateStateByKeyDstream1.print()
ssc.start()
ssc.awaitTermination()
}
/**
* @Author: qwerdf@QAQ
* @Description: updateFunc: (Seq[V], Option[S]) => Option[S]
* Seq[V]    --> values of the same key in the current batch, here Seq(1,1,1,...)
* Option[S] --> accumulated value of the same key from previous batches; on the first
*               batch there is no record yet, hence the Option wrapper
* returns Option[S] --> the current batch count added to the historical count for the key
* @Date: 2020/8/9
**/
val updateFunc1: (Seq[Int], Option[Int]) => Option[Int] = (seq,buffer) => {
//Number of occurrences of this word in the current batch
val sumed = seq.sum
//Add the accumulated count from previous batches (if any) to the current batch count
val buffered: Int = buffer.getOrElse(0) + sumed
//Wrap the Int result in an Option
Option(buffered)
}
/**
* @Author: qwerdf@QAQ
* @Description: updateFunc: (Iterator[(K, Seq[V], Option[S])]) => Iterator[(K, S)]
* K         --> the key
* Seq[V]    --> values of the same key in the current batch, here Seq(1,1,1,...)
* Option[S] --> accumulated value of the same key from previous batches; on the first
*               batch there is no record yet, hence the Option wrapper
* returns Iterator[(K, S)], one (key, updated state) pair per key
* @Date: 2020/8/9
**/
val updateFunc2: Iterator[(String, Seq[Int], Option[Long])] => Iterator[(String, Long)] = iter => {
val res: Iterator[(String, Long)] = iter.map {
tuple => {
val word = tuple._1 //the word, used as the key
val sumed = tuple._2.sum.toLong
val resultcount = tuple._3.getOrElse(0L) + sumed
(word, resultcount)
}
}
res
}
//Even with a single parameter, the parentheses around the typed parameter are required
val updateFunc2_1 = (iter: Iterator[(String, Seq[Int], Option[Long])]) => {
}
}
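A quick, non-streaming sanity check of the two update functions. This is only a sketch: the object name UpdateFuncSanityCheck and the sample inputs are made up here for illustration, and it assumes the UpdateStateByKeyOperation object above compiles as shown.
package sparkStream
object UpdateFuncSanityCheck {
def main(args: Array[String]): Unit = {
//Seq(1, 1, 1) stands for three occurrences of a word in the current batch,
//Some(5) stands for a count of 5 restored from the checkpointed state.
println(UpdateStateByKeyOperation.updateFunc1(Seq(1, 1, 1), Some(5))) //Some(8)
//The iterator-based variant handles every key of a partition in one call.
val updated = UpdateStateByKeyOperation.updateFunc2(
Iterator(("spark", Seq(1, 1), Some(3L)), ("flink", Seq(1), None)))
println(updated.toList) //List((spark,5), (flink,1))
}
}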
3.reduceByKeyAndWindow (source from Spark 1.6)
/**
* Return a new DStream by applying `reduceByKey` over a sliding window. This is similar to
* `DStream.reduceByKey()` but applies it over a sliding window. Hash partitioning is used to
* generate the RDDs with Spark's default number of partitions.
* @param reduceFunc associative reduce function
* @param windowDuration width of the window; must be a multiple of this DStream's
* batching interval
* @param slideDuration sliding interval of the window (i.e., the interval after which
* the new DStream will generate RDDs); must be a multiple of this
* DStream's batching interval
*/
def reduceByKeyAndWindow(
reduceFunc: (V, V) => V,
windowDuration: Duration,//window length
slideDuration: Duration //slide interval
): DStream[(K, V)] = ssc.withScope {
reduceByKeyAndWindow(reduceFunc, windowDuration, slideDuration, defaultPartitioner())
}
3.1 Simple Demo
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
object WindowOperation {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setAppName(this.getClass.getName)
.setMaster("local[2]")
val ssc = new StreamingContext(conf,Milliseconds(5000))
ssc.checkpoint("F:\\ideaProjects\\sparkStreaming-api\\src\\main\\resources\\checkpoint")
val dStream1: ReceiverInputDStream[String] = ssc.socketTextStream("192.168.226.88",6666)
//val dStream2: DStream[(String, Int)] = dStream1.flatMap(_.split(" ")).map((_,1))
//transform applies RDD-level operations to each batch's RDD
val dStream2: DStream[(String, Int)] = dStream1.transform(rdd => rdd.flatMap(_.split(" ")).map((_,1)))
//Arguments: reduceFunc is the aggregation function,
// Milliseconds(5000*3) is the window length,
// Milliseconds(5000*2) is the slide interval, i.e. how often a new result is emitted.
//Both durations must be integer multiples of the batch interval.
val dStream3 = dStream2.reduceByKeyAndWindow(reduceFunc,Milliseconds(5000*3),Milliseconds(5000*2))
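//Sketch only (not part of the original demo): reduceByKeyAndWindow also has an overload
//that takes an inverse function. It updates each window incrementally -- values of the
//batch sliding out of the window are subtracted instead of re-reducing the whole window --
//and it is the variant that actually requires the checkpoint directory configured above.
//The names invReduceFunc and dStream3Incremental are introduced here for illustration.
val invReduceFunc: (Int, Int) => Int = (value1, value2) => value1 - value2
val dStream3Incremental = dStream2.reduceByKeyAndWindow(reduceFunc, invReduceFunc, Milliseconds(5000*3), Milliseconds(5000*2))
//dStream3Incremental.print() //uncomment to compare with dStream3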
dStream3.print()
ssc.start()
ssc.awaitTermination()
}
//The two Int parameters are two values of the same key to be combined
val reduceFunc: (Int, Int) => Int = {
(value1,value2) => value1 + value2
}
}
4.window (more general)
package sparkStream
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object WindowOperation {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("local[2]")
.getOrCreate()
val sc = spark.sparkContext
val ssc = new StreamingContext(sc, Seconds(5))
val inputDstream: ReceiverInputDStream[String] = ssc.socketTextStream("192.168.226.88", 6666)
/**
* Window operation:
* def window(windowDuration: Duration, slideDuration: Duration): DStream[T] = ssc.withScope {
* new WindowedDStream(this, windowDuration, slideDuration)
* }
* Notes:
* windowDuration -- window length -- 15s here
* slideDuration  -- slide interval -- 10s here
* Both must be integer multiples of the batch interval. When the slide interval is shorter
* than the window length, windows overlap and the same data is counted more than once.
* Normally each batch interval produces one RDD, and the DStream exposes that sequence of
* RDDs over time. With a 15s window over a 5s batch interval, each windowed RDD is built
* from the 3 batch RDDs that fall inside the window, so a windowed DStream covers one or
* more underlying RDDs; foreachRDD() can be used to work on the RDD produced per interval.
**/
val windowDstream = inputDstream.window(Seconds(15), Seconds(10))
//Process the data collected in the 15s window
val tupleDstream: DStream[((String, String), Int)] = windowDstream.map {
line => {
val arr = line.split(" ")
val provence = arr(0)
val city = arr(1)
val area = arr(2) //parsed but not used below
((provence, city), 1)
}
}
val reduceByKeyDstream = tupleDstream.reduceByKey(_ + _)
reduceByKeyDstream.print()
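//Sketch only (not in the original code): foreachRDD exposes the RDD that the windowed
//stream produces for each 10s slide, so ordinary RDD operations can be applied, e.g.
//printing the three most frequent (province, city) pairs in the current window.
reduceByKeyDstream.foreachRDD { rdd =>
rdd.sortBy(_._2, ascending = false).take(3).foreach(println)
}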
ssc.start() //start the streaming computation (and the receiver)
ssc.awaitTermination() //block the driver until the streaming context is stopped
}
}