1. Adding dependencies
Create a Maven project and add the dependencies:
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka_2.11</artifactId>
    <version>2.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-streams</artifactId>
    <version>2.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.4.5</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.4.5</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.4.5</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.6.6</version>
</dependency>
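The examples below are written in Scala, so the Maven project also needs a Scala compiler plugin in its build section. A minimal sketch (the plugin version here is an assumption, any recent release of scala-maven-plugin should work):
<build>
    <plugins>
        <!-- Compiles the Scala sources under src/main/scala and src/test/scala -->
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.4.6</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>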
2. Collecting from a socket port
package sparkStreaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object sparkStreamDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]").setAppName("demo")
    // Batch interval: each micro-batch collects 5 seconds of data
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))
    // Create the input DStream from a TCP socket
    val socketLineStream =
      streamingContext.socketTextStream("192.168.184.40", 7777)
    // Process the collected lines -- word count as an example; swap in your own
    // business logic as needed (see the windowed variant after this example)
    val sumStream =
      socketLineStream.flatMap(_.split("\\s+"))
        .map(x => (x, 1)).reduceByKey(_ + _)
    // Print each batch's counts
    sumStream.print()
    // Start the receiver
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
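The word-count step above is only a placeholder for business logic. As one common variant, here is a minimal self-contained sketch (same nc source assumed) that counts words over a sliding 30-second window recomputed every 10 seconds; both durations must be multiples of the 5-second batch interval.
package sparkStreaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object sparkStreamWindowDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]").setAppName("windowDemo")
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))
    val socketLineStream =
      streamingContext.socketTextStream("192.168.184.40", 7777)
    // Count words seen in the last 30 seconds, recomputed every 10 seconds
    val windowedSum = socketLineStream
      .flatMap(_.split("\\s+"))
      .map((_, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(30), Seconds(10))
    windowedSum.print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}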
Test:
Start nc listening on port 7777 on the source host (192.168.184.40), then run the code and type data into the nc session:
nc -lk 7777
3. Collecting from files
package sparkStreaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object sparkStreamFileDataSourceDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]").setAppName("filedemo")
    // 5-second batch interval
    val streamingContext = new StreamingContext(conf, Seconds(5))
    // textFileStream() returns a DStream[String] that monitors the directory for newly added files
    val fileDStream = streamingContext.textFileStream("G:/tmp/")
    // Streaming computation: word count
    val wordStream = fileDStream.flatMap(_.split("\\s+"))
    val mapStream = wordStream.map(x => (x, 1))
    val sumStream = mapStream.reduceByKey(_ + _)
    sumStream.print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
Test:
After starting the code, create the file somewhere else and then move it into the monitored directory; textFileStream only picks up files newly added to the directory, not changes to existing files.
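For example (file name and contents are placeholders; the path matches the G:/tmp/ folder used above), on Windows:
echo hello spark streaming > words.txt
move words.txt G:\tmp\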
4. Pulling from Kafka
package sparkStreaming

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object sparkStreamKafkaSource {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]").setAppName("kafkaDemo")
    val streamingContext = new StreamingContext(conf, Seconds(5))
    // Kafka consumer configuration
    val kafkaParams = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.184.40:9092",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.GROUP_ID_CONFIG -> "kafkaGroup1"
    )
    val kafkaStream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        streamingContext,
        // Location strategy: spread partitions evenly across the available executors
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](Set("sparkKafkaDemo"), kafkaParams)
      )
    val wordStream =
      kafkaStream.flatMap(_.value().split("\\s+"))
    val mapStream = wordStream.map((_, 1))
    val sumStream = mapStream.reduceByKey(_ + _)
    sumStream.print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
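With the direct stream above, offsets are tracked per consumer group. If you want to commit them back to Kafka yourself only after a batch has been processed, here is a minimal sketch, assuming the kafkaStream defined above and ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false" added to kafkaParams; this fragment would go before streamingContext.start():
// Sketch only, not part of the original example: commit offsets manually after processing
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

kafkaStream.foreachRDD { rdd =>
  // Offset ranges covered by this batch
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process the batch here ...
  // Asynchronously commit the offsets for the consumer group
  kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}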
Test:
Create the topic sparkKafkaDemo (single broker here, so a replication factor of 1):
kafka-topics.sh --zookeeper 192.168.184.40:2181 --create --topic sparkKafkaDemo --partitions 1 --replication-factor 1
Start a console producer:
kafka-console-producer.sh --topic sparkKafkaDemo --broker-list 192.168.184.40:9092
Run the code, then produce some data in the producer console; the word counts are printed every batch.