1.目的
对netcat输出的单词按时间5s分片,每隔10s计算过去60s内输出次数前三的单词
2.素材
启动linux上的netcat程序
nc -lk 9999
不断输入字符
bbb hhh fff mmm bbb bbb hhh
3.代码
/**
* Created by puwenchao on 2017-09-04.
*/
package Streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
object streaming_window {
  /**
   * Word-count-over-window demo: reads whitespace-separated words from a netcat
   * socket, batches them every 5 s, and every 10 s reports the top-3 words by
   * count over the trailing 60 s window.
   */
  def main(args: Array[String]): Unit = {
    // Quiet the noisy framework loggers so the per-window output is readable.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Streaming context with a 5 s batch interval. local[2]: one core for the
    // socket receiver, one for processing.
    val conf = new SparkConf().setAppName("streamingwcall").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Read lines from netcat (nc -lk 9999) and turn each word into (word, 1).
    val kv = ssc.socketTextStream("192.168.252.141", 9999, StorageLevel.MEMORY_ONLY)
      .flatMap(_.split(" "))
      .map((_, 1))

    // Window length 60 s, slide 10 s: every 10 s, sum counts over the last
    // 60 s of data (12 batch RDDs). Both durations are multiples of the 5 s
    // batch interval, as Spark requires.
    val cnt = kv.reduceByKeyAndWindow((v1: Int, v2: Int) => v1 + v2, Seconds(60), Seconds(10))

    // Pure transformation only: sort (word, count) pairs by count, descending.
    // No actions/side effects in transform — its closure runs at driver-side
    // job-generation time, so actions here would fire outside normal output
    // semantics and break checkpoint recovery.
    val finl = cnt.transform { wcRDD =>
      wcRDD.map { case (word, count) => (count, word) }
        .sortByKey(ascending = false)
        .map { case (count, word) => (word, count) }
    }

    // Output operation: print the top-3 words of each window on the driver.
    finl.foreachRDD { sorted =>
      println("-------print top 3 begin--------")
      sorted.take(3).foreach(println)
      println("-------print top 3 end--------")
    }
    // Also print the head of the full sorted stream (first 10 elements).
    finl.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
4.输出
-------------------------------------------
Time: 1504515180000 ms
-------------------------------------------
(bbb,3)
(hhh,2)
(fff,1)