import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Consumes messages from one or more topics in Kafka.
 * <checkPointDir> is the Spark Streaming checkpoint directory.
 * <brokers> is the Kafka bootstrap server list, used only to fetch metadata.
 * <topics> is a comma-separated list of one or more Kafka topics to consume from.
 * <batchTime> is the Spark Streaming batch duration in seconds.
 */
object sparkStreaming_kafka_direct_10_HA {
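  // A possible way to launch this job; the jar name, checkpoint path, broker address,
  // topic names, and batch time below are illustrative placeholders, not values from
  // this listing:
  //   spark-submit --class sparkStreaming_kafka_direct_10_HA \
  //     streaming-example.jar /tmp/checkpoint kafka01:9092 topic1,topic2 5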
  def main(args: Array[String]) {
    System.setProperty("HADOOP_USER_NAME", "hadoop")
    val ssc = createContext(args)

    // Start the Streaming system.
    ssc.start()
    ssc.awaitTermination()
  }
  def createContext(args: Array[String]): StreamingContext = {
    val Array(checkPointDir, brokers, topics, batchTime) = args

    // Create the Streaming startup environment.
    // Cluster mode:
    //val sparkConf = new SparkConf().setAppName("sparkStreaming_kafka_direct_10_HA")
    // Local mode:
    val sparkConf = new SparkConf().setAppName("sparkStreaming_kafka_direct_10_HA").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(batchTime.toLong))
    ssc.sparkContext.setLogLevel("WARN")

    // Configure the checkpoint directory for Streaming.
    // This parameter is mandatory because stateful operations (updateStateByKey/window) are used.
    ssc.checkpoint(checkPointDir)
    // Get the list of Kafka topics to consume from.
    val topicArr = topics.split(",")
    val topicSet = topicArr.toSet
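    // The listing does not show how kafkaParams is built. A minimal sketch of the
    // consumer properties it would typically contain is given below; the group.id
    // value is an assumed placeholder, and <brokers> comes from the program arguments.
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[org.apache.kafka.common.serialization.StringDeserializer],
      "value.deserializer" -> classOf[org.apache.kafka.common.serialization.StringDeserializer],
      "group.id" -> "sparkStreaming_kafka_direct_10_HA_group",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )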
    val locationStrategy = LocationStrategies.PreferConsistent
    val consumerStrategy = ConsumerStrategies.Subscribe[String, String](topicArr, kafkaParams)
    // Create a direct Kafka stream with brokers and topics.
    // Receive data from Kafka and generate the corresponding DStream.
    val stream = KafkaUtils.createDirectStream[String, String](ssc, locationStrategy, consumerStrategy)
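    // updataFunc is referenced below but not defined in this listing. Assuming it keeps
    // a running count per word across batches, a minimal sketch could be:
    val updataFunc = (values: Seq[Int], state: Option[Int]) =>
      Some(values.sum + state.getOrElse(0))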
    //stream.map(record => (record.key, record.value))
    // Extract the message value and compute a stateful word count per batch.
    val kafkaStreams: DStream[String] = stream.map(_.value())
    val resultDStream: DStream[(String, Int)] = kafkaStreams
      .flatMap(_.split(" "))
      .map((_, 1))
      .updateStateByKey(updataFunc)