一、Top 10 热门品类中,每个品类点击次数 Top 10 的 session
需求分析:
在已求出热度前10品类的基础上,统计每个品类中点击次数前10的session,以及这些session各自点击该品类的次数。
二、代码实现
伪代码:
1.获取前10品类
2.过滤掉非点击行为的记录,以及不属于前10品类的记录
3.每个品类每个session聚合点击次数
4.按品类分组,得到每个品类下各session及其点击次数
5.每个品类按点击次数降序取前10个session
代码如下(示例):
/**
 * For each of the 10 hottest categories, finds the 10 sessions that clicked
 * that category most often, together with each session's click count.
 *
 * Input lines are "_"-separated records. Only the fields read by this code are
 * interpreted: index 2 (session id), index 6 (clicked category id, "-1" when
 * the record is not a click) and indexes 8 and 10 (comma-separated category id
 * lists, "null" when absent).
 */
object CategroiesTop10ContainSessionTop10 {

  /** Number of categories kept, and number of sessions kept per category. */
  private val TopN = 10

  private val SessionIdIdx = 2
  private val ClickCategoryIdx = 6
  // NOTE(review): presumably the ordered / paid category id lists — confirm
  // against the layout of user_visit_action.txt, which is not visible here.
  private val OrderCategoriesIdx = 8
  private val PayCategoriesIdx = 10

  /**
   * Runs the job on a local Spark context and prints one line per category:
   * `(categoryId, List((sessionId, clickCount), ...))`.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the app name looks carried over from another example; it is
    // a runtime string, so it is left untouched here.
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("cachAndPersist")
    val sc = new SparkContext(sparkConf)
    try {
      // Load the raw action log.
      val orgRDD = sc.textFile("datas/user_visit_action.txt")

      // Ids of the top 10 hot categories.
      val top10: Array[String] = top10HotCateGroies(orgRDD)
      // Broadcast as a Set so the membership test in the filter is a hash lookup.
      val top10BroadCast = sc.broadcast(top10.toSet)

      // ((category, session), clicks): split each line once, keep only click
      // records of a top-10 category, then sum per (category, session).
      // Lines too short to carry a category field are skipped instead of
      // failing the whole job with an ArrayIndexOutOfBoundsException.
      val reduceRDD: RDD[((String, String), Int)] = orgRDD
        .map(_.split("_"))
        .filter { fields =>
          fields.length > ClickCategoryIdx &&
            fields(ClickCategoryIdx) != "-1" &&
            top10BroadCast.value.contains(fields(ClickCategoryIdx))
        }
        .map(fields => ((fields(ClickCategoryIdx), fields(SessionIdIdx)), 1))
        .reduceByKey(_ + _)

      // Re-key by category so all (session, clicks) pairs of a category meet.
      val groupByKeyRDD: RDD[(String, Iterable[(String, Int)])] = reduceRDD
        .map { case ((cate, session), sum) => (cate, (session, sum)) }
        .groupByKey()

      // Within each category keep the sessions with the highest click counts.
      val takeRDD: RDD[(String, List[(String, Int)])] = groupByKeyRDD.mapValues { sessions =>
        sessions.toList.sortBy(_._2)(Ordering.Int.reverse).take(TopN)
      }

      // The result holds at most TopN categories with TopN entries each, so it
      // is safe to bring it to the driver; printing there (rather than inside
      // RDD.foreach) guarantees the output appears on the driver's stdout.
      takeRDD.collect().foreach(println)
    } finally {
      // Release the context even when the job above throws.
      sc.stop()
    }
  }

  /**
   * Computes the ids of the 10 hottest categories.
   *
   * Every record contributes to exactly one of three counters per category id
   * (taken from field 6, else field 8, else field 10). Categories are ranked
   * by the resulting counter triple in descending order; triples compare by
   * first counter, then second, then third.
   *
   * @param orgRDD raw "_"-separated action lines
   * @return up to 10 category ids, hottest first
   */
  def top10HotCateGroies(orgRDD: RDD[String]): Array[String] = {
    val flatMapRDD: RDD[(String, (Int, Int, Int))] =
      orgRDD.flatMap(line => hotnessContributions(line.split("_")))

    val reduceRDD = flatMapRDD.reduceByKey {
      case ((c1, o1, p1), (c2, o2, p2)) => (c1 + c2, o1 + o2, p1 + p2)
    }

    reduceRDD.sortBy(_._2, ascending = false).take(TopN).map(_._1)
  }

  /**
   * Turns one split record into its `(categoryId, (a, b, c))` contributions.
   *
   * The first usable field wins: field 6 yields a single `(1, 0, 0)` entry,
   * otherwise field 8 yields one `(0, 1, 0)` entry per listed id, otherwise
   * field 10 yields one `(0, 0, 1)` entry per listed id. A field is unusable
   * when it equals its "absent" marker or when the record is too short to
   * contain it; a record with no usable field contributes nothing.
   */
  private def hotnessContributions(fields: Array[String]): List[(String, (Int, Int, Int))] = {
    // The field at `idx`, unless the record is too short or it holds `absent`.
    def present(idx: Int, absent: String): Option[String] =
      if (idx < fields.length && fields(idx) != absent) Some(fields(idx)) else None

    def perListedId(ids: String, counters: (Int, Int, Int)): List[(String, (Int, Int, Int))] =
      ids.split(",").toList.map(id => (id, counters))

    present(ClickCategoryIdx, "-1").map(id => List((id, (1, 0, 0))))
      .orElse(present(OrderCategoriesIdx, "null").map(perListedId(_, (0, 1, 0))))
      .orElse(present(PayCategoriesIdx, "null").map(perListedId(_, (0, 0, 1))))
      .getOrElse(Nil)
  }
}