To track how far a Spark Streaming job has consumed from Kafka, we need to persist offsets and use them to control where reading resumes. Offsets can be kept in several places:
1. In an external system (I use Redis)
2. In HBase
3. In ZooKeeper
4. Nowhere at all, when the historical data doesn't matter and offset management can be skipped
Reference: https://www.jianshu.com/p/ef3f15cf400d
We recently upgraded our product from Spark 1.6 to Spark 2.2, which meant the versions of the related packages had to be upgraded as well; for spark-streaming-kafka we use the 0-10 integration.
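For reference, a minimal sbt dependency sketch for this setup (the exact version numbers are an assumption based on the Spark 2.2 upgrade described above):

    libraryDependencies += "org.apache.spark" %% "spark-streaming" % "2.2.0" % "provided"
    libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.2.0"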
Below is a test sample, verified working:
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

def main(args: Array[String]): Unit = {
  val conf = new SparkConf() // app name / master are expected from spark-submit
  val ssc = new StreamingContext(conf, Seconds(1))
  val brokers: String = "spark002:9092,spark003:9092,spark004:9092"
  val sourceTopics = "data_to_kafka"
  val topics = Array(sourceTopics)
  val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> brokers,
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> "use_a_separate_group_id_for_each_stream",
    // where to start when no stored/committed offset exists
    "auto.offset.reset" -> "latest",
    // we manage offsets ourselves, so Kafka must not auto-commit
    "enable.auto.commit" -> (false: java.lang.Boolean)
  )
  val kafkaStream = KafkaUtils.createDirectStream[String, String](
    ssc,
    LocationStrategies.PreferConsistent,
    ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
  )
  println("streaming_start======")
  kafkaStream.foreachRDD { r =>
    println("size=" + r.count())
    r.foreach(s => println(s)) // note: prints on the executors, not the driver
  }
  ssc.start()
  ssc.awaitTermination()
}
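Note that with enable.auto.commit set to false, the snippet above never commits anything, so every restart falls back to auto.offset.reset. If you don't want an external store at all, the 0-10 integration can also commit offsets back to Kafka itself via the standard CanCommitOffsets API; a minimal sketch, shown only as a contrast to the Redis approach below:

    import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

    kafkaStream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // commit only after this batch's output has succeeded
      kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }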
For the Kafka offsets, I save them to Redis by hand after each task completes (there is no handling yet for offsets that have already expired), and read them back on every createDirectStream. Concretely, the offsets live in a Redis hash: the key is KafkaOffsetKey, the field is the task id, and the value is a JSON object mapping partition number to untilOffset. The implementation is as follows:
Creating the kafkaStream with the saved Kafka offsets:
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.dstream.InputDStream
import scala.util.parsing.json.JSON

// RedisConfig, Task, JRedis and KafkaOffsetKey are application-specific.
def getkafkaStream(redisConfig: RedisConfig, task: Task, sourceTopics: String,
                   ssc: StreamingContext, createDirectStreamProps: Map[String, Object])
    : InputDStream[ConsumerRecord[String, String]] = {
  val redis = JRedis.getResource(redisConfig)
  var fromOffsets: Map[TopicPartition, Long] = Map()
  var offsetHash = Map[String, String]()
  // offsets are stored per task id as a JSON object: partition -> untilOffset
  if (redis.hexists(KafkaOffsetKey, task.Base.ID)) {
    val result = redis.hget(KafkaOffsetKey, task.Base.ID)
    offsetHash = JSON.parseFull(result).get.asInstanceOf[Map[String, String]]
  }
  redis.close()
  if (offsetHash.nonEmpty) {
    offsetHash.foreach { x =>
      val tp = new TopicPartition(sourceTopics, x._1.toInt)
      fromOffsets += (tp -> x._2.toLong)
    }
    // resume exactly where the previous run stopped
    KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(sourceTopics), createDirectStreamProps, fromOffsets)
    )
  } else {
    // no saved offsets yet: fall back to auto.offset.reset
    KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(sourceTopics), createDirectStreamProps)
    )
  }
}
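One known gap, mentioned above: if the job is down long enough for Kafka's log retention to delete old segments, the saved offsets can be smaller than the earliest offset the broker still has, and the consumer will fail with an out-of-range error. A minimal sketch of a guard, assuming kafka-clients 0.10.1+ (beginningOffsets does not exist in 0.10.0); clampToEarliest is a hypothetical helper, not part of the code above:

    import scala.collection.JavaConverters._
    import org.apache.kafka.clients.consumer.KafkaConsumer

    // Hypothetical helper: clamp saved offsets to the earliest offsets the
    // broker still retains, so restarting after log retention has purged
    // segments does not throw OffsetOutOfRangeException.
    def clampToEarliest(kafkaParams: Map[String, Object],
                        fromOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = {
      val consumer = new KafkaConsumer[String, String](kafkaParams.asJava)
      try {
        val earliest = consumer.beginningOffsets(fromOffsets.keys.toSeq.asJava).asScala
        fromOffsets.map { case (tp, saved) =>
          tp -> math.max(saved, earliest.get(tp).map(_.longValue).getOrElse(0L))
        }
      } finally {
        consumer.close()
      }
    }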
Saving the Kafka offsets:
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka010.HasOffsetRanges
import scala.util.parsing.json.JSONObject

def saveOffset(redisConfig: RedisConfig, rdd: RDD[ConsumerRecord[String, String]], id: String): Unit = {
  // the Kafka offset ranges this RDD was read from
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  val redis = JRedis.getResource(redisConfig)
  var offsetHash = Map[String, String]()
  // merge into whatever was stored before, so partitions absent from
  // this batch keep their previously saved offsets
  if (redis.hexists(KafkaOffsetKey, id)) {
    val result = redis.hget(KafkaOffsetKey, id)
    offsetHash = JSON.parseFull(result).get.asInstanceOf[Map[String, String]]
  }
  for (o <- offsetRanges) {
    offsetHash += (o.partition.toString -> o.untilOffset.toString)
  }
  redis.hset(KafkaOffsetKey, id, JSONObject(offsetHash).toString())
  redis.close()
}
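Putting the two helpers together, a minimal driver sketch (redisConfig, task and createDirectStreamProps stand for the application-specific values used above):

    val kafkaStream = getkafkaStream(redisConfig, task, sourceTopics, ssc, createDirectStreamProps)
    kafkaStream.foreachRDD { rdd =>
      // do the batch's real work first ...
      println("size=" + rdd.count())
      // ... then persist how far we got, so a restart resumes from here
      saveOffset(redisConfig, rdd, task.Base.ID)
    }
    ssc.start()
    ssc.awaitTermination()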