● This approach is not recommended
package com.ws.streaming
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Accumulates word counts from Kafka across batches. This approach can easily lose data:
 * once the program is shut down and restarted it does not pick up the historical counts,
 * but starts counting again from scratch.
 */
object StateKafkaWordCount {
  /**
   * The update function for updateStateByKey. Each tuple in the iterator holds:
   * element 1: the word
   * element 2: the counts recorded for that word in the current batch
   * element 3: the word's accumulated historical count, defaulting to 0 when absent
   */
  val myFunction = (it: Iterator[(String, Seq[Int], Option[Int])]) => {
    //equivalent one-liner: it.map(p => (p._1, p._2.sum + p._3.getOrElse(0)))
    it.map {
      case (word, batchCounts, historyCount) => (word, batchCounts.sum + historyCount.getOrElse(0))
    }
  }
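  //For example, if "hadoop" appears twice in the current batch (batchCounts = Seq(1, 1))
  //and has been seen three times before (historyCount = Some(3)), the function
  //emits ("hadoop", 2 + 3) = ("hadoop", 5).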
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("KafkaWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    //the job fails if no checkpoint directory is set; the accumulated counts are stored in a
    //local directory here, but on a cluster they must go to HDFS or similar shared storage
    ssc.checkpoint("./cache")
    //old consumers must connect to ZooKeeper; the new (0.10) consumer no longer needs it
    val zk = "hadoop-01:2181,hadoop-02:2181,hadoop-03:2181"
    val groupId = "g1"
    //topic name -> number of receiver threads consuming it
    val topic = Map[String, Int]("ws" -> 1)
    //create the Kafka DStream
    val kafkaDStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(ssc, zk, groupId, topic)
    //each element of ReceiverInputDStream[(String, String)] is a tuple: the key the message
    //was written with and the value, i.e. the actual payload
    val data: DStream[String] = kafkaDStream.map(_._2)
    val result: DStream[(String, Int)] = data.flatMap(_.split(" ")).map((_, 1))
      .updateStateByKey(myFunction, new HashPartitioner(ssc.sparkContext.defaultMinPartitions), true)
    result.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
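● For reference, the direct (receiver-less) API is the usual replacement for the createStream call above. The sketch below is illustrative only: it assumes the spark-streaming-kafka-0-10 dependency is on the classpath and reuses the ssc, broker hosts, group id, and topic from this example; it is not a drop-in rewrite of the program.
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

//the new (0.10) consumer talks to the brokers directly; no ZooKeeper address is needed
val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> "hadoop-01:9092,hadoop-02:9092,hadoop-03:9092",
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "group.id" -> "g1",
  "auto.offset.reset" -> "earliest",
  "enable.auto.commit" -> (false: java.lang.Boolean)
)
//no receiver: each batch reads its offset range straight from Kafka
val directStream = KafkaUtils.createDirectStream[String, String](
  ssc,
  PreferConsistent,
  Subscribe[String, String](Array("ws"), kafkaParams)
)
//a ConsumerRecord exposes the payload via value(), replacing the (key, value) tuple above
val data: DStream[String] = directStream.map(_.value())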
● Producing data
[root@hadoop-03 kafka_2.11-0.8.2.2]# ./bin/kafka-console-producer.sh --broker-list hadoop-01:9092,hadoop-02:9092,hadoop-03:9092 --topic ws
[2018-09-15 18:09:01,454] WARN Property topic is not valid (kafka.utils.VerifiableProperties)
nice hadoop hive
hive hadoop spark
● Result
-------------------------------------------
Time: 1539137195000 ms
-------------------------------------------
(hive,1)
(hadoop,1)
(nice,1)
WARN [Thread-14] - Expecting 1 replicas with only 0 peer/s.
WARN [Thread-14] - Block input-0-1539137195600 replicated to only 0 peer(s) instead of 1 peers
-------------------------------------------
Time: 1539137200000 ms
-------------------------------------------
(hive,2)
(spark,1)
(hadoop,2)
(nice,1)