To read from Kafka with Spark Streaming, add the following Maven dependencies:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.10</artifactId>
    <version>1.6.1</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <!-- spark-streaming-kafka_2.10 1.6.1 is built against Kafka 0.8.2.1;
         pinning kafka-clients to a different version (e.g. 0.10.x)
         risks classpath conflicts at runtime -->
    <version>0.8.2.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka_2.10</artifactId>
    <version>1.6.1</version>
</dependency>
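If the project builds with sbt instead of Maven, a roughly equivalent build.sbt fragment is sketched below. The scalaVersion line is an assumption; it must be a 2.10.x release for the %% operator to resolve the _2.10 artifacts above.

// Assumption: any Scala 2.10.x release matches the _2.10 artifacts
scalaVersion := "2.10.6"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming"       % "1.6.1",
  "org.apache.spark" %% "spark-streaming-kafka" % "1.6.1",
  "org.apache.kafka" %  "kafka-clients"         % "0.8.2.1"
)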
The example below reads from Kafka with createDirectStream. This direct approach is considerably faster than the receiver-based createStream: each Kafka partition maps one-to-one to an RDD partition, and no receiver or write-ahead log is involved.
import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka._
object DirectKafka {
  def main(args: Array[String]): Unit = {
    val brokers = "192.168.2.5:9092"
    val topics = "topic-name"

    // spark.driver.cores only takes effect in cluster deploy mode;
    // the master URL is expected to be supplied via spark-submit.
    val sparkConf = new SparkConf()
      .set("spark.driver.cores", "8")
      .setAppName("DirectKafka")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Multiple topics can be given as a comma-separated list.
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers
    )

    // Direct stream: String keys decoded with StringDecoder,
    // raw Array[Byte] values decoded with DefaultDecoder;
    // map(_._2) keeps only the message values.
    val messages = KafkaUtils.createDirectStream[String, Array[Byte], StringDecoder, DefaultDecoder](
      ssc, kafkaParams, topicsSet).map(_._2)

    messages.foreachRDD(rdd => println("each RDD count : " + rdd.count()))

    ssc.start()
    ssc.awaitTermination()
  }
}
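A practical caveat with createDirectStream: it does not write consumed offsets to ZooKeeper, so ZooKeeper-based monitoring tools will not see the consumer's progress. If you need the offsets yourself, each batch's RDD can be cast to HasOffsetRanges, following the pattern in the official Spark Streaming + Kafka integration guide. The sketch below assumes the same ssc, kafkaParams, and topicsSet as in the example above; note that the cast must be applied to the stream returned by createDirectStream, before any transformation such as map.

import org.apache.spark.streaming.kafka.HasOffsetRanges

// Keep a handle on the un-transformed stream; only its RDDs carry offset metadata.
val directStream = KafkaUtils.createDirectStream[String, Array[Byte], StringDecoder, DefaultDecoder](
  ssc, kafkaParams, topicsSet)

directStream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  for (o <- offsetRanges) {
    println(s"topic=${o.topic} partition=${o.partition} " +
      s"from=${o.fromOffset} until=${o.untilOffset}")
  }
}

To throttle ingestion, the direct approach also honours spark.streaming.kafka.maxRatePerPartition, which caps the number of records read per Kafka partition per second.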