一、窗口函数 (Window functions — sliding word count with reduceByKeyAndWindow)
package window
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object WindowDemo {
  /**
   * Consumes whitespace-separated words from the "sparkKafka" topic and prints a
   * per-word count over a sliding window (window = 8s, slide = 4s, batch = 2s;
   * window and slide must both be multiples of the batch interval).
   *
   * Uses the incremental form of reduceByKeyAndWindow: the first function adds the
   * counts of batches entering the window, the second (the inverse function) must
   * SUBTRACT the counts of batches leaving it. The original passed `x + y` for the
   * inverse, which makes the window total grow forever — fixed to `x - y`.
   * This variant requires checkpointing, enabled below.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("Windows")
    val streamingContext = new StreamingContext(conf, Seconds(2))
    // Checkpointing is mandatory for the inverse-reduce window variant.
    streamingContext.checkpoint("checkpoint")

    val kafkaParams = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.XXX.100:9092",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.GROUP_ID_CONFIG -> "kafkaGroup4"
    )

    val kafkaStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      streamingContext,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("sparkKafka"), kafkaParams)
    )

    // value() is already a String (String deserializer) — no toString needed.
    val windowedCounts: DStream[(String, Int)] = kafkaStream
      .flatMap(_.value().split("\\s+"))
      .map(word => (word, 1))
      .reduceByKeyAndWindow(
        (x: Int, y: Int) => { println("one"); x + y }, // reduce: batches entering the window
        (x: Int, y: Int) => { println("two"); x - y }, // inverse: remove batches leaving the window
        Seconds(8),
        Seconds(4)
      )
    windowedCounts.print()

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
二、transform (tagging each batch's records with the batch timestamp)
package window
import java.text.SimpleDateFormat
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object WindowDemo2 {
  /**
   * Reads whitespace-separated words from the "sparkKafka" topic and, via
   * DStream.transform's (RDD, Time) overload, tags every word with the batch
   * timestamp formatted as "yyyy-MM-dd HH:mm:ss", then counts (word, time)
   * pairs within each 2-second batch and prints the result.
   *
   * The original also built a second transform whose DStream was never attached
   * to any output operation — dead code (DStreams are lazy; an un-output stream
   * computes nothing) — so it has been removed.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("Windows")
    val streamingContext = new StreamingContext(conf, Seconds(2))
    streamingContext.checkpoint("checkpoint")

    val kafkaParams = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.XXX.100:9092",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.GROUP_ID_CONFIG -> "kafkaGroup4"
    )

    val kafkaStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      streamingContext,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("sparkKafka"), kafkaParams)
    )

    val taggedCounts: DStream[((String, String), Int)] = kafkaStream.transform { (rdd, batchTime) =>
      // SimpleDateFormat is not thread-safe; creating it per batch on the
      // driver (this closure body runs on the driver) avoids sharing it.
      val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val time: String = sdf.format(batchTime.milliseconds)
      rdd
        .flatMap(record => record.value().split("\\s+").map(word => ((word, time), 1)))
        .reduceByKey(_ + _)
    }
    taggedCounts.print()

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}