Streaming program remembering Kafka historical counts I

● This approach is not recommended: as explained below, the accumulated state does not survive a program restart.

package com.ws.streaming
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
  * Accumulates the historical word counts read from Kafka. This approach can easily lose
  * data: once the program is shut down and restarted, it does not pick up the historical
  * counts again but starts over from zero.
  */
object StateKafkaWordCount {
  /**
    * Update function for updateStateByKey. Each tuple in the input iterator holds:
    *   _1 : the word
    *   _2 : the counts collected for that word in the current batch
    *   _3 : the word's accumulated historical count (defaults to 0)
    * A sample invocation follows the listing below.
    */
  val myFunction = (it: Iterator[(String, Seq[Int], Option[Int])]) => {
    //it.map(p => (p._1, p._2.sum + p._3.getOrElse(0)))
    it.map {
      case (x, y, z) => (x, y.sum + z.getOrElse(0))
    }
  }

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("KafkaWordCount")

    val ssc = new StreamingContext(conf, Seconds(5))

    //If no checkpoint directory is set, updateStateByKey fails at startup. The accumulated
    //counts are stored in a local disk directory here; on a cluster, write to HDFS or similar
    ssc.checkpoint("./cache")

    //The old receiver-based consumer connects through ZooKeeper; the new consumer (0.10+) does not
    val zk = "hadoop-01:2181,hadoop-02:2181,hadoop-03:2181"
    val groupId = "g1"

    val topic = Map[String, Int]("ws" -> 1) //topic name -> number of receiver threads

    //Create the receiver-based Kafka DStream
    val kafkaDStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(ssc, zk, groupId, topic)

    //Each element of ReceiverInputDStream[(String, String)] is a tuple: the key is the
    //message key, the value is the actual payload written to Kafka
    val data: DStream[String] = kafkaDStream.map(_._2)

    //updateStateByKey(updateFunc, partitioner, rememberPartitioner)
    val result: DStream[(String, Int)] = data
      .flatMap(_.split(" "))
      .map((_, 1))
      .updateStateByKey(myFunction, new HashPartitioner(ssc.sparkContext.defaultMinPartitions), true)
    result.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
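To make the update function's semantics concrete, here is a minimal check of myFunction with made-up sample data (the words and counts are hypothetical, chosen only for illustration):

package com.ws.streaming

object MyFunctionDemo extends App {
  // "hadoop" appeared twice in this batch and once before; "spark" is new (no prior state).
  val batch = Iterator(
    ("hadoop", Seq(1, 1), Some(1)),
    ("spark", Seq(1), None)
  )
  StateKafkaWordCount.myFunction(batch).foreach(println)
  // prints:
  // (hadoop,3)  <- 1 + 1 from this batch plus the historical 1
  // (spark,1)
}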

● Producing data

[root@hadoop-03 kafka_2.11-0.8.2.2]# ./bin/kafka-console-producer.sh --broker-list hadoop-01:9092,hadoop-02:9092,hadoop-03:9092 -topic ws
[2018-09-15 18:09:01,454] WARN Property topic is not valid (kafka.utils.VerifiableProperties)
nice hadoop hive
hive hadoop spark

● Results

-------------------------------------------
Time: 1539137195000 ms
-------------------------------------------
(hive,1)
(hadoop,1)
(nice,1)

 WARN [Thread-14] - Expecting 1 replicas with only 0 peer/s.
 WARN [Thread-14] - Block input-0-1539137195600 replicated to only 0 peer(s) instead of 1 peers
-------------------------------------------
Time: 1539137200000 ms
-------------------------------------------
(hive,2)
(spark,1)
(hadoop,2)
(nice,1)
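As the header comment warns, restarting this program loses the accumulated counts, because a fresh StreamingContext ignores the state written to ./cache. A common mitigation, sketched here as an assumption (the article itself does not show it), is to build the context through StreamingContext.getOrCreate so that a restart recovers the DStream graph and state from the checkpoint directory:

package com.ws.streaming

import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object RecoverableStateKafkaWordCount {
  val checkpointDir = "./cache" // use an HDFS path on a cluster

  // Builds the full pipeline; getOrCreate calls this only when no checkpoint exists.
  def createContext(): StreamingContext = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("KafkaWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val zk = "hadoop-01:2181,hadoop-02:2181,hadoop-03:2181"
    KafkaUtils.createStream(ssc, zk, "g1", Map("ws" -> 1))
      .map(_._2)
      .flatMap(_.split(" "))
      .map((_, 1))
      .updateStateByKey(StateKafkaWordCount.myFunction,
        new HashPartitioner(ssc.sparkContext.defaultMinPartitions), true)
      .print()
    ssc
  }

  def main(args: Array[String]): Unit = {
    // Restore the context (and the word-count state) from the checkpoint if present.
    val ssc = StreamingContext.getOrCreate(checkpointDir, createContext _)
    ssc.start()
    ssc.awaitTermination()
  }
}

Note that this only protects planned restarts of the same compiled code; the receiver-based API shown here can still drop in-flight data, which is part of why the article flags it as not recommended.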