package kafka
import java.sql.DriverManager
import com.typesafe.config.ConfigFactory
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import scalikejdbc.{DB, SQL}
import scala.collection.mutable.Map
object OffsetManager {
  val config = ConfigFactory.load()

  // Plain JDBC connection, used only by the hand-written JDBC variants kept in
  // the comments below for reference.
  def getConn = {
    DriverManager.getConnection(config.getString("db.url"),
      config.getString("db.user"),
      config.getString("db.password"))
  }
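
  // DB.readOnly / DB.localTx below go through ScalikeJDBC's global connection
  // pool, which must be initialized once before first use. A minimal sketch,
  // assuming the pool is not already set up elsewhere (e.g. via
  // scalikejdbc-config's DBs.setup()) and reusing the same config keys; the
  // JDBC driver is assumed to auto-register from the classpath.
  scalikejdbc.ConnectionPool.singleton(
    config.getString("db.url"),
    config.getString("db.user"),
    config.getString("db.password"))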
  /**
   * Fetch the saved offsets for the given consumer group and topic.
   */
  def apply(groupid: String, topic: String): collection.Map[TopicPartition, Long] = {
    /* val conn = getConn
    val pstmt = conn.prepareStatement("SELECT * FROM streaming_offset WHERE groupid=? AND topic=?")
    pstmt.setString(1, groupid)
    pstmt.setString(2, topic)
    val rs = pstmt.executeQuery()
    val offsetRange = Map[TopicPartition, Long]()
    while (rs.next()) {
      offsetRange += new TopicPartition(rs.getString("topic"), rs.getInt("partition")) -> rs.getLong("offset")
    }
    rs.close()
    pstmt.close()
    conn.close()
    offsetRange */
    // Same query rewritten with ScalikeJDBC.
    DB.readOnly { implicit session =>
      SQL("SELECT * FROM streaming_offset WHERE groupid=? AND topic=?").bind(groupid, topic).map(rs => {
        new TopicPartition(rs.string("topic"), rs.int("partition")) -> rs.long("offset")
      }).list().apply()
    }.toMap
  }
  /**
   * Save the offsets of the current batch.
   * @param groupid     consumer group id
   * @param offsetRange offset ranges of the current batch
   */
  def saveCurrentBatchOffset(groupid: String, offsetRange: Array[OffsetRange]) = {
    /* val conn = getConn
    val pstmt = conn.prepareStatement("replace into streaming_offset values(?,?,?,?)")
    for (o <- offsetRange) {
      pstmt.setString(1, o.topic)
      pstmt.setString(2, groupid)
      pstmt.setLong(3, o.untilOffset)
      pstmt.setInt(4, o.partition)
      pstmt.executeUpdate()
    }
    pstmt.close()
    conn.close() */
    // Same upsert rewritten with ScalikeJDBC; localTx wraps the whole batch in
    // one transaction, so either all partitions' offsets are saved or none are.
    DB.localTx { implicit session =>
      for (o <- offsetRange) {
        SQL("replace into streaming_offset values(?,?,?,?)").bind(o.topic, groupid, o.untilOffset, o.partition).update().apply()
      }
    }
  }
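
  /**
   * A sketch (not part of the original code) of the streaming_offset table the
   * two methods above assume. Column names and order are inferred from the SQL
   * used there; the composite primary key is an assumption -- REPLACE INTO only
   * behaves as an upsert when such a key exists.
   */
  def createOffsetTableIfAbsent(): Unit = {
    DB.autoCommit { implicit session =>
      SQL(
        """CREATE TABLE IF NOT EXISTS streaming_offset (
          |  topic       VARCHAR(128) NOT NULL,
          |  groupid     VARCHAR(128) NOT NULL,
          |  `offset`    BIGINT       NOT NULL,
          |  `partition` INT          NOT NULL,
          |  PRIMARY KEY (topic, groupid, `partition`)
          |)""".stripMargin).execute().apply()
    }
  }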
}
package kafka
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.Map
object SSCDirectKafka010_MySql_Offset {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SSCDirectKafka010_MySql_Offset").setMaster("local[*]")
    // Limit on how much data is pulled from Kafka: this is not the number of records
    // fetched per batch, but a per-partition rate, so each batch reads up to
    // maxRatePerPartition * number of partitions * batch interval (seconds) records
    // (e.g. with 3 partitions and the 3-second interval below: 2 * 3 * 3 = 18).
    conf.set("spark.streaming.kafka.maxRatePerPartition", "2")
    // Stop Spark Streaming gracefully; without this setting, stopping the service may lose data.
    conf.set("spark.streaming.stopGracefullyOnShutdown", "true")
    val ssc = new StreamingContext(conf, Seconds(3))
    // Consumer group id
    val groupId = "day_04"
    // Kafka consumer parameters
    val kafkaParams: Map[String, Object] = Map[String, Object](
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      // "auto.commit.interval.ms" -> "1000", // would commit offsets every second; the default is 5 seconds
      "enable.auto.commit" -> (false: java.lang.Boolean) // do not auto-commit offsets; they are saved to MySQL manually
    )
    // Topic to subscribe to
    val topic = "user"
    val topics = Array(topic)
    val offsetManage = OffsetManager(groupId, topic)
    val stream: InputDStream[ConsumerRecord[String, String]] = if (offsetManage.size > 0) {
      // Offsets were found, so this group has consumed before: resume from the saved offsets
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsetManage)
      )
    } else {
      // No saved offsets: this group has not consumed this topic yet, start according to auto.offset.reset
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    }
    stream.foreachRDD(rdd => {
      // Offset ranges of the current batch
      val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreach(println(_))
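      // The records are processed (here just printed) before the offsets are
      // written back: if a batch fails, its offsets never reach MySQL and the
      // data is re-read on restart, giving at-least-once delivery.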
      // Save this batch's offsets to MySQL
      OffsetManager.saveCurrentBatchOffset(groupId, offsetRanges)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}