package day02
import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable
/**
 * Computes the top-N most popular teachers for each subject.
 *
 * Optimization: a TreeSet bounded to N elements is used per partition,
 * so no partition ever holds more than N candidate records in memory.
 */
object SubjectAndTeacher04 {
  def main(args: Array[String]): Unit = {
    // Top-N cutoff: keep the N most popular teachers per subject.
    val topN=2
    // Run locally using all available cores.
    val conf =new SparkConf().setAppName("SubjectAndTeacher04").setMaster("local[*]")
    // Create the SparkContext.
    val sc =new SparkContext(conf)
    // Read the raw access-log lines from HDFS.
    val lines = sc.textFile("hdfs://hadoop01:9000/sparkTest")
    // Parse each line as a URL: the subject is the first host label
    // (text before the first "." in the host), the teacher is the URL path
    // with the leading "/" stripped. Emit ((subject, teacher), 1).
    // NOTE(review): `new URL(line)` throws MalformedURLException on bad input,
    // which would fail the task — confirm the input lines are always valid URLs.
    val subjectAndTeacher = lines.map(line => {
      val url = new URL(line)
      val subject = url.getHost.substring(0, url.getHost.indexOf("."))
      val teacher = url.getPath.substring(1)
      ((subject, teacher), 1)
    })
    // Collect the distinct subjects to the driver so the partitioner below
    // can dedicate a partition to each subject.
    val subjects: Array[String] = subjectAndTeacher.map(_._1._1).distinct().collect()
    // Custom partitioner keyed on subject (SubjectPartition is defined
    // elsewhere in this package).
    val subjectPartition = new SubjectPartition(subjects)
    // reduceByKey does map-side (local) combining before the shuffle and
    // global aggregation after it, while repartitioning by the custom partitioner.
    val reduced: RDD[((String, String), Int)] = subjectAndTeacher.reduceByKey(subjectPartition,_+_)
    // Each partition now holds one subject's data; keep only the top-N
    // entries per partition using a size-bounded TreeSet.
    // NOTE(review): foreachPartition runs on the executors; the visible code
    // builds `ts` but the output step is beyond this truncated view — verify
    // the results are actually emitted (printed/saved) downstream.
    reduced.foreachPartition(partition=>{
      // TreeSet ordered by a custom Ordering (SubjectOrdering, defined elsewhere).
      val ts = new mutable.TreeSet[((String,String),Int)]()(new SubjectOrdering())
      // Fold every record of this partition through the bounded set.
      partition.foreach(item=>{
        ts.add(item)
        // Once the set exceeds topN, evict one element so at most topN remain.
        // NOTE(review): this assumes SubjectOrdering places the least-popular
        // element last — confirm against SubjectOrdering's definition.
        if(ts.size>topN){
          ts.remove(ts.last)
          //ts = ts.dropRight(1)  // (earlier immutable-style attempt, kept for reference)
        }
      })
// NOTE(review): the remainder of this file was lost in extraction; the two
// lines below are blog-platform residue (a repeated title and a publish
// timestamp), not code. Kept, commented out, for provenance:
// 统计每个学科最受欢迎的老师前N名
// 最新推荐文章于 2021-11-18 21:13:59 发布