import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Consumes messages from one or more topics in Kafka.
 * <checkPointDir> is the Spark Streaming checkpoint directory.
 * <brokers> is the Kafka bootstrap server list, used only to fetch metadata.
 * <topics> is a comma-separated list of one or more Kafka topics to consume from.
 * <batchTime> is the Spark Streaming batch duration in seconds.
 */
object sparkStreaming_kafka_direct_10_HA {
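  // A possible way to launch this job; the jar name, checkpoint path, broker address,
  // topic names, and batch time below are illustrative placeholders, not values from
  // this listing:
  //   spark-submit --class sparkStreaming_kafka_direct_10_HA \
  //     streaming-example.jar /tmp/checkpoint kafka01:9092 topic1,topic2 5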
  def main(args: Array[String]) {
    System.setProperty("HADOOP_USER_NAME", "hadoop")
    val ssc = createContext(args)

    // Start the Streaming system.
    ssc.start()
    ssc.awaitTermination()
  }
  def createContext(args: Array[String]): StreamingContext = {
    val Array(checkPointDir, brokers, topics, batchTime) = args

    // Create the Streaming startup environment.
    // Cluster mode:
    //val sparkConf = new SparkConf().setAppName("sparkStreaming_kafka_direct_10_HA")
    // Local mode:
    val sparkConf = new SparkConf().setAppName("sparkStreaming_kafka_direct_10_HA").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(batchTime.toLong))
    ssc.sparkContext.setLogLevel("WARN")

    // Configure the checkpoint directory for Streaming.
    // This parameter is mandatory because stateful operations (updateStateByKey/window) are used.
    ssc.checkpoint(checkPointDir)
    // Get the list of Kafka topics to consume from.
    val topicArr = topics.split(",")
    val topicSet = topicArr.toSet
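    // The listing does not show how kafkaParams is built. A minimal sketch of the
    // consumer properties it would typically contain is given below; the group.id
    // value is an assumed placeholder, and <brokers> comes from the program arguments.
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[org.apache.kafka.common.serialization.StringDeserializer],
      "value.deserializer" -> classOf[org.apache.kafka.common.serialization.StringDeserializer],
      "group.id" -> "sparkStreaming_kafka_direct_10_HA_group",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )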
    val locationStrategy = LocationStrategies.PreferConsistent
    val consumerStrategy = ConsumerStrategies.Subscribe[String, String](topicArr, kafkaParams)
    // Create a direct Kafka stream with brokers and topics.
    // Receive data from Kafka and generate the corresponding DStream.
    val stream = KafkaUtils.createDirectStream[String, String](ssc, locationStrategy, consumerStrategy)
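    // updataFunc is referenced below but not defined in this listing. Assuming it keeps
    // a running count per word across batches, a minimal sketch could be:
    val updataFunc = (values: Seq[Int], state: Option[Int]) =>
      Some(values.sum + state.getOrElse(0))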
    //stream.map(record => (record.key, record.value))
    // Extract the message value and compute a stateful word count per batch.
    val kafkaStreams: DStream[String] = stream.map(_.value())
    val resultDStream: DStream[(String, Int)] = kafkaStreams
      .flatMap(_.split(" "))
      .map((_, 1))
      .updateStateByKey(updataFunc)