Connecting Spark Streaming to Kafka (receiver-based)

This post shows how to use Spark Streaming to pull data from Kafka with the receiver-based API and keep a real-time running count of the URLs users click. We configure the Spark environment and the Kafka connection info, create a DStream, split each record to extract the clicked URL, and accumulate per-URL click counts across batches.

package com.asiainfo.spark.streaming
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object UrlCount {
  // Merge each batch's new counts (y) into the running total (z) for every key (x)
  val updateFunc = (iterator: Iterator[(String, Seq[Int], Option[Int])]) => {
    iterator.flatMap { case (x, y, z) => Some(y.sum + z.getOrElse(0)).map(n => (x, n)) }
  }

  def main(args: Array[String]) {
    // Connection parameters; uncomment the line below to take them from the command line instead
    val zkQuorum = "192.168.111.3,192.168.111.5,192.168.111.6"
    val groupId = "spark-streaming"
    val topics = "test"
    val numThreads = "1"
    val hdfs = "hdfs://192.168.111.3:9000/kafka"
    //val Array(zkQuorum, groupId, topics, numThreads, hdfs) = args

    // Create the SparkConf and set the app name; the receiver permanently occupies one
    // thread, so local mode needs at least two ("local" alone leaves none for processing)
    val conf = new SparkConf().setAppName("UrlCount").setMaster("local[2]")
    // Create the StreamingContext with a 30-second batch interval
    val ssc = new StreamingContext(conf, Seconds(30))
    // Set the checkpoint directory (required by updateStateByKey)
    ssc.checkpoint(hdfs)
    // Map each topic to the number of receiver threads consuming it
    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    // Pull data from Kafka as a DStream of (key, message) pairs and keep only the message
    val lines = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap, StorageLevel.MEMORY_AND_DISK).map(_._2)
    // Split each record and extract the URL the user clicked (the second field)
    val urls = lines.map(x => (x.split(" ")(1), 1))
    // Count clicks per URL, carrying state across batches
    val result = urls.updateStateByKey(updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), true)
    // Print the result to the console
    result.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
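The parsing step, x.split(" ")(1), assumes each Kafka message is a space-separated line whose second field is the clicked URL. A quick REPL sketch of that extraction; the sample record below is made up:

val sample = "20160321101954 http://example.com/java/javaee.php"  // hypothetical input record
val pair = (sample.split(" ")(1), 1)
println(pair)  // (http://example.com/java/javaee.php,1)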
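Because updateFunc is a plain Scala function, its merge semantics (sum of the batch's new counts plus the previous total) can be checked in the REPL without a cluster; the key and counts below are made-up values:

val batch1 = Iterator(("http://example.com/a", Seq(1, 1), None: Option[Int]))
println(UrlCount.updateFunc(batch1).toList)  // List((http://example.com/a,2)): two new clicks, no prior state
val batch2 = Iterator(("http://example.com/a", Seq(1), Some(2)))
println(UrlCount.updateFunc(batch2).toList)  // List((http://example.com/a,3)): one new click on top of 2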

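To compile the job, the project needs the receiver-based Kafka integration artifact in addition to spark-streaming itself. A minimal build.sbt sketch; the Spark version 1.6.3 is an assumption and should be aligned with your cluster:

// build.sbt -- version 1.6.3 is an assumption; match your cluster's Spark and Scala versions
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "1.6.3" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka" % "1.6.3"
)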



