import java.text.SimpleDateFormat
import java.util.Properties
import java.util.concurrent.Executors
import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import scala.collection.JavaConversions._
/**
 * Created by 写Scala的老刘 2019-05-13 09:43
 */
object GetDataByTimeStamp {
  def main(args: Array[String]): Unit = {
    val topic = "TestTopic"
    val consumerProp = new Properties()
    consumerProp.put("bootstrap.servers", "Ip:port")
    consumerProp.put("group.id", "test")
    consumerProp.put("enable.auto.commit", "false")
    consumerProp.put("auto.commit.interval.ms", "1000")
    // Kafka SASL authentication; if your cluster has no authentication configured, remove the next three lines
    System.setProperty("java.security.auth.login.config", "/home/hadoop/Test/config/kafka_client_jaas.conf")
    consumerProp.put("security.protocol", "SASL_PLAINTEXT")
    consumerProp.put("sasl.mechanism", "PLAIN")
    consumerProp.put("key.deserializer", classOf[StringDeserializer].getName)
    consumerProp.put("value.deserializer", classOf[StringDeserializer].getName)
    // The point in time to start consuming from
    val startTime = "2019-05-06 12:00:00"
    val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    // Epoch milliseconds (parsed in the JVM's default time zone), which is what offsetsForTimes expects
    val fetchTime = sdf.parse(startTime).getTime
    // Utils helper class, given below: resolves each partition's starting offset for that timestamp
    val startOffset = Utils.getStartOffset(topic, fetchTime, consumerProp)
    println(startOffset)
    // Consume with multiple threads: the topic has 3 partitions, so one thread per partition is enough
    val threadPool = Executors.newFixedThreadPool(3)
    for (i <- 0 to 2) {
      threadPool.execute(new Runnable {
        override def run(): Unit = {
          // KafkaConsumer is not thread-safe, so each thread creates its own
          // instance and pins it to a single partition
          val kafkaConsumer = new KafkaConsumer[String, String](consumerProp)
          val tp = new TopicPartition(topic, i)
          kafkaConsumer.assign(java.util.Arrays.asList(tp))
          // jump to the offset resolved for this partition's timestamp
          kafkaConsumer.seek(tp, startOffset(i))
          try {
            while (true) {
              val records = kafkaConsumer.poll(100)
              println("records:" + records.size)
              for (record <- records) {
                val data: String = record.value()
                println(s"partition:${record.partition()} offset:${record.offset()} value:$data")
              }
            }
          } finally {
            kafkaConsumer.close()
          }
        }
      })
    }
    // shutdown() only stops the pool from accepting new tasks;
    // the three polling loops keep running until the process is killed
    threadPool.shutdown()
  }
}
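The SASL setup above points java.security.auth.login.config at /home/hadoop/Test/config/kafka_client_jaas.conf. For reference, a minimal client JAAS file for SASL/PLAIN looks like the following (the username and password are placeholders, not values from the original post):

KafkaClient {
  org.apache.kafka.common.security.plain.PlainLoginModule required
  username="your-username"
  password="your-password";
};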
The Utils helper class:
import java.util
import java.util.{ArrayList, Properties}
import collection.mutable.Map
import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition
import scala.collection.JavaConversions._
object Utils {
  def getStartOffset(topic: String, fetchDataTime: Long, properties: Properties): Map[Int, Long] = {
    // all TopicPartitions of the topic
    val topicPartitions: ArrayList[TopicPartition] = new ArrayList[TopicPartition]
    // (topic, partition) -> the timestamp to search from
    val timestampToSearch: util.Map[TopicPartition, java.lang.Long] = new util.HashMap[TopicPartition, java.lang.Long]()
    // partition -> its resolved starting offset
    val partitionOffset = Map[Int, Long]()
    val consumer = new KafkaConsumer[String, String](properties)
    // Fetch the topic's partition metadata; returns a java.util.List[PartitionInfo], one entry per partition
    val partitionInfos = consumer.partitionsFor(topic)
    // Every partition searches for its offset from the same point in time, fetchDataTime
    for (partitionInfo <- partitionInfos) {
      val tp = new TopicPartition(partitionInfo.topic(), partitionInfo.partition())
      topicPartitions.add(tp)
      timestampToSearch.put(tp, fetchDataTime)
    }
    // Look up offsets by timestamp; returns a java.util.Map[TopicPartition, OffsetAndTimestamp]
    // holding, per partition, the earliest offset whose record timestamp is >= fetchDataTime
    val topicPartitionOffsetAndTimestampMap = consumer.offsetsForTimes(timestampToSearch)
    // collect partition -> offset
    for ((tp, offsetAndTimestamp) <- topicPartitionOffsetAndTimestampMap) {
      // offsetsForTimes maps a partition to null when none of its records has a
      // timestamp at or after fetchDataTime; skip those partitions
      if (offsetAndTimestamp != null) {
        partitionOffset.put(tp.partition(), offsetAndTimestamp.offset())
      }
    }
    consumer.close()
    partitionOffset
  }
}
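One caveat in getStartOffset: offsetsForTimes maps a partition to null when it has no record with a timestamp at or after fetchDataTime, so that partition is skipped and the startOffset(i) lookup in main would then fail for it. A possible fallback, not part of the original post and only a sketch, is to start such partitions from their end offset via the consumer's endOffsets call:

// Replacement for the null-skipping loop above (hypothetical fallback):
// partitions with no record at/after fetchDataTime start from their end offset
val endOffsets = consumer.endOffsets(topicPartitions)
for (tp <- topicPartitions) {
  val oat = topicPartitionOffsetAndTimestampMap.get(tp)
  val offset = if (oat != null) oat.offset() else endOffsets.get(tp).longValue()
  partitionOffset.put(tp.partition(), offset)
}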