Spark Streaming
An extension of the Spark Core API for processing streaming data.
Features: fault-tolerant, scalable, high-throughput, low-latency.
Micro-batch processing, with batch intervals of roughly 1-10 s; each micro-batch is an RDD.
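A minimal sketch of the micro-batch model (host, port, and object name here are placeholders, not from the notes): the received data is cut into 5-second batches, and foreachRDD exposes each batch as an ordinary RDD.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
object MicroBatchSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("microBatchSketch")
    // batch interval of 5 s: every 5 s the received data forms one micro-batch
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream("hadoop151", 9999) // hypothetical host/port
    // each micro-batch is an ordinary RDD, so the full RDD API is available
    lines.foreachRDD { rdd =>
      println(s"this batch is an RDD with ${rdd.getNumPartitions} partitions and ${rdd.count()} lines")
    }
    ssc.start()
    ssc.awaitTermination()
  }
}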
Processing Spark Streaming data with Spark SQL
package kgc

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkSqlStreamingDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("hdfsDemo")
    val ssc = new StreamingContext(conf, Seconds(5))
    // TODO: create the SparkSession
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._
    // TODO: create an input stream
    val input = ssc.socketTextStream("hadoop151", 5678)
    // TODO: transform the input stream
    val wordDStream = input.flatMap(_.split(" "))
    wordDStream.foreachRDD(
      rdd => {
        if (rdd.count() != 0) {
          val df1 = rdd.map(x => Word(x)).toDF()
          df1.createOrReplaceTempView("words")
          spark.sql(
            """
              |select word, count(*)
              |from words
              |group by word
            """.stripMargin
          ).show()
        }
      }
    )
    ssc.start()
    ssc.awaitTermination()
  }
}

case class Word(word: String)
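The same aggregation can also be written with the DataFrame API. A sketch of a variant of the demo above (object name is hypothetical; it reuses the Word case class), using the pattern of getting the SparkSession from the RDD's own configuration inside foreachRDD, as shown in the Spark Streaming programming guide:
package kgc

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkSqlStreamingDfDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("sqlStreamingDf")
    val ssc = new StreamingContext(conf, Seconds(5))
    val words = ssc.socketTextStream("hadoop151", 5678).flatMap(_.split(" "))
    words.foreachRDD { rdd =>
      if (!rdd.isEmpty()) {
        // get or create the SparkSession from the RDD's configuration
        val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
        import spark.implicits._
        val df = rdd.map(Word(_)).toDF()
        // DataFrame-API equivalent of: select word, count(*) from words group by word
        df.groupBy("word").count().show()
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}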
Flume sink to Spark Streaming
package kgc

import org.apache.spark.SparkConf
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkFllumePushDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("flumeDemo1").setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // TODO: push mode
    // val flumeStream = FlumeUtils.createStream(ssc, "hadoop151", 55555)
    // TODO: poll mode -- create the input stream
    val flumeStream = FlumeUtils.createPollingStream(ssc, "hadoop151", 55555)
    // TODO: process the RDDs of the input stream
    flumeStream.map(x => new String(x.event.getBody.array()).trim)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()
    ssc.start()
    ssc.awaitTermination()
  }
}
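For poll mode the application also needs the spark-streaming-flume integration on its classpath, in addition to the spark-streaming-flume-sink jar placed in Flume's lib directory (see the config below). A sketch of the dependencies in build.sbt form (the jar name in the submit command suggests a Maven assembly build, so this is only an illustration; sparkVersion is a placeholder):
// build.sbt sketch -- assumes Scala 2.11 and a Spark 2.x version
val sparkVersion = "2.x.x"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"            % sparkVersion,
  "org.apache.spark" %% "spark-streaming"       % sparkVersion,
  "org.apache.spark" %% "spark-streaming-flume" % sparkVersion
)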
Writing the Flume .conf file
agent.sources = s1
agent.channels = c1
agent.sinks = sk1

# Set the source type to netcat and attach it to channel c1
agent.sources.s1.type = netcat
agent.sources.s1.bind = hadoop151
agent.sources.s1.port = 44444
agent.sources.s1.channels = c1

# SparkSink: requires spark-streaming-flume-sink_2.11-x.x.x.jar in Flume's lib directory
agent.sinks.sk1.type = org.apache.spark.streaming.flume.sink.SparkSink
agent.sinks.sk1.hostname = hadoop151
agent.sinks.sk1.port = 55555
agent.sinks.sk1.channel = c1

# Channel settings
# memory channel
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1000
Startup
# submit the packaged Streaming jar
spark-submit \
--class kgc.SparkFllumePushDemo \
streamingDemo0819-1.0-SNAPSHOT-jar-with-dependencies.jar
# start Flume
flume-ng agent --name agent \
--conf-file ./sparkFlumeStreaming.conf -Dflume.root.logger=INFO,console &
# connect to the Flume netcat source (hadoop151:44444) to produce data
(base) [root@hadoop151 ~]# nc hadoop151 44444
Spark Streaming consuming Kafka data
package kgc

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream

object SparkKafkaDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("kafkaDemo")
    val ssc = new StreamingContext(conf, Seconds(5))
    // TODO: consume Kafka data with Spark Streaming
    // Kafka consumer configuration
    val kafkaParams = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop151:9092", // broker address
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer", // deserializers
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.GROUP_ID_CONFIG -> "kafkaGroup01" // consumer group
    )
    // create the input stream
    val message: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,                                 // the StreamingContext
      LocationStrategies.PreferConsistent, // location strategy: prefer consistent
      ConsumerStrategies.Subscribe(Set("testPartition2"), kafkaParams) // consumer strategy: subscribe(topics, config)
    )
    // process the stream
    message.map(x => x.value()).flatMap(_.split(" "))
      .map((_, 1)).reduceByKey(_ + _).print()
    ssc.start()
    ssc.awaitTermination()
  }
}
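By default the direct stream relies on Kafka's consumer-group offset handling. A sketch (reusing the message stream from the demo above, and assuming "enable.auto.commit" -> "false" has been added to kafkaParams) of committing offsets manually per batch with the kafka010 HasOffsetRanges / CanCommitOffsets APIs:
// sketch: manual offset commit per batch
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

message.foreachRDD { rdd =>
  // the offset ranges covered by this batch
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // process the batch first ...
  rdd.map(_.value()).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    .collect().foreach(println)
  // ... then commit the offsets back to Kafka asynchronously
  message.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}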
Spark Streaming window operations
package cn.kgc.day0819.test10

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreamingWindowDemo extends App {
  // TODO: create the StreamingContext
  val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("DEMO01")
  val ssc = new StreamingContext(conf, Seconds(5))
  // TODO: word count with Spark Streaming
  val inputDstream: ReceiverInputDStream[String] = ssc.socketTextStream("hadoop151", 9999)
  // TODO: transform the input stream
  // e.g. input line: hadoop spark kafka
  val wordDstream: DStream[String] = inputDstream.flatMap(_.split(" "))
  val wordAndOneDstream: DStream[(String, Int)] = wordDstream.map((_, 1))
  // window operation
  val windowRes = wordAndOneDstream.reduceByKeyAndWindow(
    (a: Int, b: Int) => a + b,
    Seconds(15), // window duration: must be a multiple of the 5 s batch interval above
    Seconds(10)  // slide duration: how often the window is evaluated, also a multiple of the batch interval
  )
  windowRes.print()
  // TODO: start() begins collecting and processing data
  ssc.start()
  // TODO: wait for termination
  ssc.awaitTermination()
}
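reduceByKeyAndWindow also has an overload that takes an inverse reduce function, which updates the previous window's result incrementally instead of recomputing the whole window; it requires checkpointing. A sketch, reusing ssc and wordAndOneDstream from the demo above (these lines would go before ssc.start(); the checkpoint path is hypothetical):
// sketch: incremental window aggregation with an inverse function
ssc.checkpoint("hdfs://hadoop151:9000/spark/checkpoint") // hypothetical checkpoint directory
val incWindowRes: DStream[(String, Int)] = wordAndOneDstream.reduceByKeyAndWindow(
  (a: Int, b: Int) => a + b, // add values entering the window
  (a: Int, b: Int) => a - b, // subtract values leaving the window
  Seconds(15),
  Seconds(10)
)
incWindowRes.print()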