Spark Streaming: reading JSON data from Kafka 0.10.x and writing it to HBase
The data arrives as JSON strings; with an implicit json4s Formats in scope each string is extracted into an object and then written to HBase.
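Each Kafka message is expected to be a flat JSON object whose fields match the LOG class used below. A hypothetical example message (all field values are made up for illustration):

{"method":"doGet","className":"com.xxx.web.LoginController","file":"LoginController.java","line":42,"time":1533540000,"thread":"http-nio-8080-exec-1","ip":"192.168.62.10","pool":"web","env":"dev"}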
package com.xxx.sparkStreaming
import java.util.Date
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.kafka010.{KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.json4s._
import org.json4s.jackson.JsonMethods._
import scala.collection.mutable
object Kafka2Hbase {
  // One log record; a case class so json4s can extract it straight from the parsed JSON
  case class LOG(method: String, className: String, file: String, line: Int, time: Int, thread: String, ip: String, pool: String, env: String)

  val zookeeperservers = "192.168.62.34:2181,192.168.62.35:2181,192.168.62.36:2181"

  // HBase client configuration: point it at the ZooKeeper quorum used by HBase
  val hbaseconf: Configuration = HBaseConfiguration.create()
  hbaseconf.set("hbase.zookeeper.quorum", zookeeperservers)
  hbaseconf.set("hbase.zookeeper.property.clientPort", "2181")
  def main(args: Array[String]): Unit = {
    // Note: the arguments are not actually used below; topic, brokers and group id are hardcoded
    if (args.length != 4) {
      System.err.println("Usage: KafkaStreamingApp <zkQuorum> <group> <topics> <numThreads>")
    }

    val topics = Array("logfile")

    val sparkConf = new SparkConf()
      .setAppName("KafkaReceiverWordCount")
      .setMaster("local[2]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Durations.seconds(5))
    // Kafka consumer parameters for the 0.10.x consumer API
    // (only bootstrap.servers, group.id and the two deserializers are needed here;
    // metadata.broker.list and key.serializer are not consumer settings)
    val kafkaParams = new mutable.HashMap[String, Object]()
    kafkaParams.put("bootstrap.servers", "192.168.62.75:9092")
    kafkaParams.put("group.id", "testGroup")
    kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    // Direct stream against Kafka 0.10; keep only the message value (the JSON string)
    val messages = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    ).map(record => record.value())
    messages.foreachRDD(rdd => {
      rdd.foreachPartition(partition => {
        // One HBase connection and table handle per partition
        val connection: Connection = ConnectionFactory.createConnection(hbaseconf)
        val table = connection.getTable(TableName.valueOf("logfile"))
        // Implicit Formats needed by json4s' extract
        implicit val formats: Formats = DefaultFormats
        partition.foreach(line => {
          // Parse the JSON string into a LOG object
          val logData: LOG = parse(line).extract[LOG]
          println(logData.method)
          // Row key: current timestamp + "_" + the record's own time field
          val put = new Put(Bytes.toBytes(String.valueOf(new Date().getTime) + "_" + logData.time))
          put.addColumn(Bytes.toBytes("ss"), Bytes.toBytes("method"), Bytes.toBytes(logData.method))
          put.addColumn(Bytes.toBytes("ss"), Bytes.toBytes("className"), Bytes.toBytes(logData.className))
          put.addColumn(Bytes.toBytes("ss"), Bytes.toBytes("file"), Bytes.toBytes(logData.file))
          put.addColumn(Bytes.toBytes("ss"), Bytes.toBytes("ip"), Bytes.toBytes(logData.ip))
          put.addColumn(Bytes.toBytes("ss"), Bytes.toBytes("line"), Bytes.toBytes(logData.line))
          put.addColumn(Bytes.toBytes("ss"), Bytes.toBytes("thread"), Bytes.toBytes(logData.thread))
          put.addColumn(Bytes.toBytes("ss"), Bytes.toBytes("pool"), Bytes.toBytes(logData.pool))
          put.addColumn(Bytes.toBytes("ss"), Bytes.toBytes("env"), Bytes.toBytes(logData.env))
          table.put(put)
        })
        table.close()
        connection.close()
      })
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
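Running this requires spark-streaming-kafka-0-10, hbase-client and json4s-jackson on the classpath, and the HBase table logfile with column family ss must already exist. Below is a minimal sketch for creating the table once via the HBase Admin API (the helper object name CreateLogfileTable is just for illustration; HTableDescriptor/HColumnDescriptor are the HBase 1.x-style descriptors, deprecated but still available in 2.x):

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory

object CreateLogfileTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "192.168.62.34:2181,192.168.62.35:2181,192.168.62.36:2181")
    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin
    val tableName = TableName.valueOf("logfile")
    // Create the table with the single column family "ss" if it does not exist yet
    if (!admin.tableExists(tableName)) {
      val descriptor = new HTableDescriptor(tableName)
      descriptor.addFamily(new HColumnDescriptor("ss"))
      admin.createTable(descriptor)
    }
    admin.close()
    connection.close()
  }
}

The same table can also be created from the HBase shell with: create 'logfile', 'ss'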