A window operation works on a DStream by taking a window of configurable length and sliding it forward at a configurable rate; each time the window moves, the chosen operator is applied to the batch of data that currently falls inside the window.
Note that both the window length and the slide interval must be integer multiples of the batch interval (batch time).
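To make these timings concrete, here is a minimal sketch (not part of the original examples) that uses a hypothetical socket source on localhost:9999: with a 2-second batch interval, window(Seconds(8), Seconds(4)) emits a windowed result every 4 seconds covering the last 8 seconds of data, i.e. the last 4 batches.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
object windowTimingSketch {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("windowTiming")
    // Batch interval of 2 seconds: each batch covers 2 seconds of input
    val streamingContext = new StreamingContext(sparkConf, Seconds(2))
    // Hypothetical text source, used only to illustrate the timing
    val lines = streamingContext.socketTextStream("localhost", 9999)
    // Window length 8s (4 batches), slide interval 4s (2 batches);
    // both are integer multiples of the 2-second batch interval, as required
    lines.window(Seconds(8), Seconds(4)).print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}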
1.window(windowLength, slideInterval)
This operation is called on a DStream with a window-length argument and a slide-interval argument; it gathers the elements that fall inside the current window into a new DStream.
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object sparkWindowDemo {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setMaster("local[*]").setAppName("demo")
// Batch interval (batch time): data is collected every 2 seconds
val streamingContext = new StreamingContext(sparkConf,Seconds(2))
streamingContext.checkpoint("/in/checkPoint/")
val kafkaParams = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.184.40:9092"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "kafkaGroup")
)
val kafkaStream:InputDStream[ConsumerRecord[String,String]] =
KafkaUtils.createDirectStream(
streamingContext,
// Location strategy: distribute Kafka partitions evenly across available executors
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("sparkKafkaDemo"), kafkaParams)
)
// window() takes a window length and an optional slide interval;
// both must be integer multiples of the 2-second batch interval (sliding window).
// With one argument the window slides forward by one batch interval;
// with two arguments (length 8s, slide 4s here) consecutive windows overlap,
// so the same record can be counted in more than one window.
val numStream = kafkaStream.flatMap(_.value().toString.split("\\s+"))
.map((_, 1)).window(Seconds(8),Seconds(4))
numStream.print()
streamingContext.start()
streamingContext.awaitTermination()
}
}
2.countByWindow(windowLength,slideInterval)
Returns the number of elements that appear in the window. Note: a checkpoint directory must be set.
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object sparkWindow2 {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setMaster("local[*]").setAppName("demo")
// Batch interval: data is collected every 2 seconds
val streamingContext = new StreamingContext(sparkConf,Seconds(2))
streamingContext.checkpoint("/in/checkPoint/")
val kafkaParams = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.184.40:9092"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "kafkaGroup2")
)
val kafkaStream:InputDStream[ConsumerRecord[String,String]] =
KafkaUtils.createDirectStream(
streamingContext,
// Location strategy: distribute Kafka partitions evenly across available executors
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("sparkKafkaDemo"), kafkaParams)
)
// countByWindow returns the number of elements in the window (length 8s, sliding every 4s)
val numStream = kafkaStream.flatMap(_.value().toString.split("\\s+"))
.map((_, 1)).countByWindow(Seconds(8),Seconds(4))
numStream.print()
streamingContext.start()
streamingContext.awaitTermination()
}
}
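Why countByWindow needs a checkpoint directory: the count is maintained incrementally, adding the elements of the batches that slide into the window and subtracting those that slide out, and this incremental state has to be checkpointed. As a rough sketch of the same computation (an assumption about the implementation, not taken from the original example), the numStream definition above could be replaced by an explicit map to 1L followed by the inverse-reduce form of reduceByWindow:
// Roughly equivalent to countByWindow(Seconds(8), Seconds(4)) on the same kafkaStream:
// map every word to 1L, add counts entering the window, subtract counts leaving it
val countStream = kafkaStream.flatMap(_.value().toString.split("\\s+"))
  .map(_ => 1L)
  .reduceByWindow(_ + _, _ - _, Seconds(8), Seconds(4))
countStream.print()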
3.countByValueAndWindow
Counts how many times each distinct value appears in the window.
Note: a checkpoint directory must be set.
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object sparkWindowDemo3 {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setMaster("local[*]").setAppName("demo")
// Batch interval: data is collected every 2 seconds
val streamingContext = new StreamingContext(sparkConf,Seconds(2))
streamingContext.checkpoint("/in/checkPoint/")
val kafkaParams = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.184.40:9092"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "kafkaGroup3")
)
val kafkaStream:InputDStream[ConsumerRecord[String,String]] =
KafkaUtils.createDirectStream(
streamingContext,
// Location strategy: distribute Kafka partitions evenly across available executors
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("sparkKafkaDemo"), kafkaParams)
)
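// countByValueAndWindow: count how many times each distinct word appears
// in an 8-second window that slides forward every 4 seconds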
val numStream = kafkaStream.flatMap(_.value().toString.split("\\s+"))
.countByValueAndWindow(Seconds(8),Seconds(4))
numStream.print()
streamingContext.start()
streamingContext.awaitTermination()
}
}
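countByValueAndWindow yields (value, count) pairs. Conceptually (again an assumption about the implementation, not from the original example) it behaves like pairing every word with 1L and summing per key with the incremental form of reduceByKeyAndWindow, which is why checkpointing is required here as well; the numStream definition above could be written out as:
// Roughly equivalent to countByValueAndWindow(Seconds(8), Seconds(4)) on the same kafkaStream
val perValueCounts = kafkaStream.flatMap(_.value().toString.split("\\s+"))
  .map(word => (word, 1L))
  .reduceByKeyAndWindow((a: Long, b: Long) => a + b, (a: Long, b: Long) => a - b, Seconds(8), Seconds(4))
perValueCounts.print()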
4.reduceByWindow(func, windowLength,slideInterval)
On the calling DStream, the elements of the current window are first gathered into a new DStream, and the given function is then used to reduce those windowed elements.
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object sparkWindowDemo4 {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setMaster("local[*]").setAppName("demo")
// Batch interval: data is collected every 2 seconds
val streamingContext = new StreamingContext(sparkConf,Seconds(2))
streamingContext.checkpoint("/in/checkPoint/")
val kafkaParams = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.184.40:9092"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "kafkaGroup4")
)
val kafkaStream:InputDStream[ConsumerRecord[String,String]]
= KafkaUtils.createDirectStream(
streamingContext,
// Location strategy: distribute Kafka partitions evenly across available executors
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("sparkKafkaDemo"), kafkaParams)
)
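// reduceByWindow: concatenate all words in the 8-second window (sliding every 4 seconds) with ":"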
val numStream = kafkaStream.flatMap(_.value().toString.split("\\s+"))
.reduceByWindow(_+":"+_,Seconds(8),Seconds(4))
numStream.print()
streamingContext.start()
streamingContext.awaitTermination()
}
}
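reduceByWindow also has a four-argument variant that takes an inverse function and computes each window incrementally from the previous one instead of re-reducing the whole window; like the other incremental variants it requires checkpointing. String concatenation with ":" has no inverse, so the sketch below (an illustrative assumption, not from the original example) switches to summing word lengths; it could replace the numStream definition above:
// Incremental variant: total length of all words in the window, where the
// inverse function (_ - _) removes the contribution of batches leaving the window
val lengthSum = kafkaStream.flatMap(_.value().toString.split("\\s+"))
  .map(_.length.toLong)
  .reduceByWindow(_ + _, _ - _, Seconds(8), Seconds(4))
lengthSum.print()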
5.reduceByKeyAndWindow(func,windowLength, slideInterval, [numTasks])
reduceByKeyAndWindow computes over all of the data that falls inside the window of the calling (key, value) DStream, reducing the values of each key. The operation takes an optional argument for the number of parallel tasks. The example below uses the variant that also takes an inverse reduce function; it updates each window incrementally from the previous one and therefore requires checkpointing.
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object sparkTransformDemo {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setMaster("local[*]").setAppName("demo")
// Batch interval: data is collected every 2 seconds
val streamingContext = new StreamingContext(sparkConf,Seconds(2))
streamingContext.checkpoint("/in/checkPoint/")
val kafkaParams = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.184.40:9092"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG, "kafkaGroup5")
)
val kafkaStream:InputDStream[ConsumerRecord[String,String]]
= KafkaUtils.createDirectStream(
streamingContext,
// Location strategy: distribute Kafka partitions evenly across available executors
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(Set("sparkKafkaDemo"), kafkaParams)
)
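// reduceByKeyAndWindow with an inverse function: per key, add counts entering
// the window and subtract counts leaving it; this incremental form requires checkpointing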
val numStream: DStream[(String, Int)] = kafkaStream
.flatMap(_.value().toString.split("\\s+")).map((_, 1))
.reduceByKeyAndWindow((a: Int, b: Int) => a + b, (a: Int, b: Int) => a - b, Seconds(8), Seconds(4))
numStream.print()
streamingContext.start()
streamingContext.awaitTermination()
}
}
Output:
-------------------------------------------
Time: 1608704534000 ms
-------------------------------------------
(a,1)
-------------------------------------------
Time: 1608704538000 ms
-------------------------------------------
(a,2)
-------------------------------------------
Time: 1608704542000 ms
-------------------------------------------
(a,1)
(b,1)
-------------------------------------------
Time: 1608704546000 ms
-------------------------------------------
(a,0)
(b,2)
-------------------------------------------
Time: 1608704550000 ms
-------------------------------------------
(a,0)
(b,1)
-------------------------------------------
Time: 1608704554000 ms
-------------------------------------------
(a,0)
(b,0)
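In the output above, keys such as a and b stay visible with a count of 0 after all of their occurrences have slid out of the window: the inverse-reduce variant only subtracts counts, it does not drop keys. The overload of reduceByKeyAndWindow that additionally takes a filter function can be used to discard such expired keys; a sketch (the partition count of 2 is an arbitrary assumed value) that could replace the numStream definition in the example above:
// Keep only keys whose windowed count is still positive
val numStream: DStream[(String, Int)] = kafkaStream
  .flatMap(_.value().toString.split("\\s+")).map((_, 1))
  .reduceByKeyAndWindow(
    (a: Int, b: Int) => a + b,            // add counts entering the window
    (a: Int, b: Int) => a - b,            // subtract counts leaving the window
    Seconds(8), Seconds(4),
    2,                                    // number of partitions (assumed value)
    (kv: (String, Int)) => kv._2 > 0      // drop keys whose count fell to 0
  )
numStream.print()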