Main idea
Collect the aggregated results to the driver and use a Redis Pipeline to write both the data and the offsets, similar to the earlier MySQL-based approach.
Note that the Redis pipeline (with MULTI/EXEC) only works against a standalone or master/replica Redis; Redis Cluster does not support it.
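The essence is that the per-batch results and the Kafka offsets are written in one Redis MULTI/EXEC transaction issued through a single pipeline, so either both land in Redis or neither does. A minimal sketch of that pattern with Jedis (the host localhost:6379 and the key/field names here are placeholders for illustration, not the ones used in the program below):

import redis.clients.jedis.Jedis

object PipelineTxSketch {
  def main(args: Array[String]): Unit = {
    // assumes a standalone Redis reachable at localhost:6379
    val jedis = new Jedis("localhost", 6379)
    val pipeline = jedis.pipelined()
    try {
      pipeline.multi()                            // open a transaction on the pipeline
      pipeline.hincrBy("wordcount", "hello", 3L)  // 1) write the computed results
      pipeline.hset("offsets", "topicA_0", "42")  // 2) write the Kafka offsets in the same transaction
      pipeline.exec()                             // commit both writes atomically
      pipeline.sync()                             // flush the pipelined commands to Redis
    } catch {
      case e: Exception =>
        pipeline.discard()                        // drop the queued commands on failure
        throw e
    } finally {
      jedis.close()
    }
  }
}

The complete program follows.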
package com.ws.sparkstreaming.kafkaredis
import com.ws.sparkstreaming.utils.{JedisPool, OffsetUtils}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.{Jedis, Pipeline}
object WordCountJoinKafkaRedisManagerOffset {
  def main(args: Array[String]): Unit = {
    val appname = "wc_redis"
    val group = "group1"
    val topic = Array("wc1").toIterable
    val conf = new SparkConf().setAppName(appname).setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("WARN")
    val kafkaParam = Map(
      "bootstrap.servers" -> "dream1:9092,dream2:9092,dream3:9092", // Kafka broker addresses
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer", // key/value deserializers
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "group.id" -> group, // consumer group
      "auto.offset.reset" -> "earliest", // where to start when no saved offset exists [latest, earliest]
      "enable.auto.commit" -> "false" // do not auto-commit offsets (the default is true)
    )
    // restore the offsets previously saved in Redis for this app and group
    val lastoffset = OffsetUtils.selectOffsetFromRedis(appname, group)
    val dstream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topic, kafkaParam, lastoffset))
    dstream.foreachRDD((kafkaRdd: RDD[ConsumerRecord[String, String]]) => {
      // grab the offset ranges of this batch before any transformation
      val ranges = kafkaRdd.asInstanceOf[HasOffsetRanges].offsetRanges
      println("------------------------------------" + System.currentTimeMillis() + "------------------------------------")
      if (!kafkaRdd.isEmpty()) {
        // transform the data: per-batch word count
        val res: RDD[(String, Int)] = kafkaRdd.map(_.value()).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
        // collect the results to the driver
        val wordAndNums = res.collect()
        var jedis: Jedis = null
        var pipeline: Pipeline = null
        try {
          jedis = JedisPool.getConnection()
          // select the Redis database
          jedis.select(1)
          // obtain a pipeline
          pipeline = jedis.pipelined()
          // open a transaction
          pipeline.multi()
          // accumulate the word counts in a hash named after the app
          wordAndNums.foreach(word => {
            pipeline.hincrBy(appname, word._1, word._2)
          })
          // write the offsets of this batch in the same transaction
          for (elem <- ranges) {
            pipeline.hset(appname + "_" + group, elem.topic + "_" + elem.partition, elem.untilOffset.toString)
          }
          // commit the transaction
          pipeline.exec()
          pipeline.sync()
        } catch {
          case exception: Exception => {
            // roll back the transaction, report the error and stop the application
            if (pipeline != null) {
              pipeline.discard()
            }
            exception.printStackTrace()
            ssc.stop()
          }
        } finally {
          if (pipeline != null) {
            pipeline.close()
          }
          if (jedis != null) {
            // return the connection to the pool
            jedis.close()
          }
        }
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
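Once the job has run for a few batches, the state it writes can be inspected by reading both hashes back. A small hypothetical helper, under the same assumptions as the program above (database 1, app name wc_redis, consumer group group1):

package com.ws.sparkstreaming.kafkaredis

import com.ws.sparkstreaming.utils.JedisPool

// hypothetical helper, not part of the job itself: dumps the state the job has written to Redis
object InspectRedisState {
  def main(args: Array[String]): Unit = {
    val jedis = JedisPool.getConnection()
    try {
      jedis.select(1)
      println("word counts: " + jedis.hgetAll("wc_redis"))    // accumulated counts, field = word
      println("offsets: " + jedis.hgetAll("wc_redis_group1")) // field = topic_partition, value = untilOffset
    } finally {
      jedis.close()
    }
  }
}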
Jedis connection pool
package com.ws.sparkstreaming.utils
import redis.clients.jedis.{JedisPool, JedisPoolConfig}
object JedisPool {
  val config = new JedisPoolConfig()
  config.setMaxTotal(20) // maximum number of connections in the pool
  config.setMaxIdle(10) // maximum number of idle connections kept around
  config.setTestOnBorrow(true) // validate a connection before handing it out
  // pool pointing at the Redis instance on dream1:6379, with a 10s timeout
  val pool = new JedisPool(config, "dream1", 6379, 10000)

  def getConnection() = {
    pool.getResource
  }
}
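Every caller of getConnection() is responsible for returning the connection with close(). An optional refinement, not part of the original code, is a small loan-pattern wrapper that always returns the connection to the pool:

package com.ws.sparkstreaming.utils

import redis.clients.jedis.Jedis

// hypothetical convenience wrapper around the JedisPool object above
object JedisLoan {
  def withJedis[T](db: Int)(body: Jedis => T): T = {
    val jedis = JedisPool.getConnection()
    try {
      jedis.select(db) // pick the Redis database before handing the connection to the caller
      body(jedis)
    } finally {
      jedis.close() // always return the connection to the pool
    }
  }
}

With such a helper, reading the saved offsets becomes a one-liner like JedisLoan.withJedis(1)(_.hgetAll(appname + "_" + group)).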
Reading offsets from Redis with Jedis
package com.ws.sparkstreaming.utils
import java.util
import org.apache.kafka.common.TopicPartition
import scala.collection.mutable
object OffsetUtils {
  // offsets are stored in a hash keyed by "<appname>_<groupId>";
  // each field is "<topic>_<partition>" and its value is the untilOffset of the last committed batch
  def selectOffsetFromRedis(appname: String, groupId: String): Map[TopicPartition, Long] = {
    val jedis = JedisPool.getConnection()
    jedis.select(1)
    val kvs: util.Map[String, String] = jedis.hgetAll(appname + "_" + groupId)
    jedis.close() // return the connection to the pool
    import scala.collection.JavaConversions._
    val result = new mutable.HashMap[TopicPartition, Long]()
    for (kv <- kvs) {
      val k = kv._1
      val offset = kv._2
      val topic = k.split("_")(0)
      val partition = k.split("_")(1).toInt
      val tp = new TopicPartition(topic, partition)
      result(tp) = offset.toLong
    }
    result.toMap
  }
}
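To sanity-check the recovery logic outside the streaming job, the method can be called on its own and the result printed. A minimal hypothetical sketch, using the app name and group from this example:

package com.ws.sparkstreaming.utils

// hypothetical quick test for OffsetUtils, not part of the original post
object OffsetUtilsCheck {
  def main(args: Array[String]): Unit = {
    val offsets = OffsetUtils.selectOffsetFromRedis("wc_redis", "group1")
    // prints entries such as wc1-0 -> 123, i.e. where the next batch will start reading
    offsets.foreach { case (tp, offset) => println(tp + " -> " + offset) }
  }
}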
This post showed how to consume data from Kafka with Spark Streaming and use Redis both for storing the results and for managing the consumer offsets, giving efficient real-time processing. The key points are the use of the Redis Pipeline and its limitations in a clustered Redis deployment.