Spark Partitioning and Sorting in Practice

// Imports assumed by all of the examples below
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import scala.collection.mutable

Grouped top-N by subject with a custom partitioner and mapPartitions

object GroupTopNByPartition { // hypothetical object name; the original header is not shown
def main(args: Array[String]): Unit = {
// decide whether to run locally
val isLocal = args(0).toBoolean
val conf = new SparkConf().setAppName(this.getClass.getCanonicalName)
if (isLocal) {
conf.setMaster("local[*]")
}
val sc = new SparkContext(conf)
// specify where the data will be read from and create the RDD
val lines = sc.textFile(args(1))
val topN = args(2).toInt
// shape the data into ((subject, teacher), 1) and aggregate the counts
val reduced: RDD[((String, String), Int)] = lines.map(line => {
val fields = line.split("/")
val url = fields(2)
val teacher = fields(3)
val subject = url.split("[.]")(0)
((subject, teacher), 1)
}).reduceByKey(_ + _)
// work out the distinct subjects: after distinct() there is one entry per subject; collect() brings them back to the driver
val subjects: Array[String] = reduced.map(_._1._1).distinct().collect()
/*println(subjects.toBuffer) //ArrayBuffer(javaee, php, bigdata)
sc.stop()*/
// build the partitioner, one partition per subject
val partitioner: SubjectPartitioner = new SubjectPartitioner(subjects)
// with the partitioner ready, repartition the reduced RDD by subject
val partitioned: RDD[((String, String), Int)] = reduced.partitionBy(partitioner)
// process each partition separately and keep only the top N records per partition
val result1: RDD[((String, String), Int)] = partitioned.mapPartitions(it => {
it.toList.sortBy(t => -t._2).take(topN).iterator // takes an iterator in and hands an iterator back
})
println(result1.collect().toBuffer)
sc.stop()
}
}
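To make the field indices above concrete, here is a small parsing sketch. The input line is a hypothetical example (the post does not show the real data), chosen only so that split("/") yields the subject domain at index 2 and the teacher at index 3:

// hypothetical input line; the real log format is not shown in the post
val line = "http://bigdata.edu360.cn/laozhang"
val fields = line.split("/")       // Array("http:", "", "bigdata.edu360.cn", "laozhang")
val url = fields(2)                // "bigdata.edu360.cn"
val teacher = fields(3)            // "laozhang"
val subject = url.split("[.]")(0)  // "bigdata"
// the record becomes ((bigdata, laozhang), 1)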
// Custom partitioner: pass the subjects in as a constructor parameter so they are available later
class SubjectPartitioner(val subjects: Array[String]) extends Partitioner {
// the number of partitions must equal the number of subjects
// the partitioning rule is set up in the primary constructor: the HashMap maps subject name -> partition id
// the primary constructor runs exactly once, when the partitioner is instantiated with new
private val nameTonum: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
var i = 0
for (sub <- subjects) {
// in Scala this puts sub as the key and i as the value into the HashMap
nameTonum(sub) = i // record each subject as soon as we see it
i += 1
}
// at this point the map looks like (javaee, 0), (php, 1), (bigdata, 2)
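// Aside (optional alternative): the same subject -> partition-id map can be built without a
// mutable counter, e.g. mutable.HashMap(subjects.zipWithIndex: _*); the explicit loop above is
// kept to stay close to the original code.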
override def numPartitions: Int = subjects.length // 3 in this example
// called on the executor side, inside the task, before the shuffle write
override def getPartition(key: Any): Int = {
// asInstanceOf downcasts the reference to the concrete tuple type
val tuple: (String, String) = key.asInstanceOf[(String, String)]
val subject: String = tuple._1
nameTonum(subject) // look up the partition id for this subject
}
}

Taking the top N per partition with a bounded TreeSet

object GroupTopNWithTreeSet { // hypothetical object name; the original header is not shown
def main(args: Array[String]): Unit = {
// decide whether to run locally
val isLocal: Boolean = args(0).toBoolean
// build the configuration
val conf: SparkConf = new SparkConf().setAppName(this.getClass.getCanonicalName)
// run locally if requested
if (isLocal) {
conf.setMaster("local[*]")
}
// create the SparkContext
val sc: SparkContext = new SparkContext(conf)
// point at the input file and create the RDD
val lines: RDD[String] = sc.textFile(args(1))
// the top N to keep per subject
val topN: Int = args(2).toInt
// shape the data and aggregate the counts
val reduced: RDD[((String, String), Int)] = lines.map(it => {
val fields: Array[String] = it.split("/")
val teacher: String = fields(3)
val subjects: String = fields(2).split("[.]")(0)
((subjects, teacher), 1)
}).reduceByKey(_ + _)
/* val result: mutable.Buffer[((String, String), Int)] = reduced.collect().toBuffer
println(result)*/
// next, partition by subject
// first find the distinct subjects
val subject: Array[String] = reduced.map(_._1._1).distinct().collect()
// instantiate the custom partitioner
val partitioner: SubjectPartitioner2 = new SubjectPartitioner2(subject)
// repartition the reduced RDD with it
val partitioned: RDD[((String, String), Int)] = reduced.partitionBy(partitioner)
// use a sorted collection to keep a running top N per partition
// a TreeSet keeps its elements ordered according to an Ordering
// the Ordering can be supplied through an implicit, so the sort rule can be whatever we need
// iterate over each partition's records and feed them into the TreeSet
val result2: RDD[((String, String), Int)] = partitioned.mapPartitions(it => {
// define an implicit Ordering that sorts by count, descending
implicit val rules: Ordering[((String, String), Int)] = Ordering[Int].on[((String, String), Int)](t => -t._2)
// the TreeSet keeps itself sorted and picks up the implicit Ordering defined above
// (records with equal counts compare as equal under this rule, so the set keeps only one of them)
val sorted: mutable.TreeSet[((String, String), Int)] = new mutable.TreeSet[((String, String), Int)]()
it.foreach(record => {
// add the record to the TreeSet (equivalent to add)
sorted += record
// once the set grows beyond topN, evict the lowest-ranked element (the last one under this Ordering)
if (sorted.size > topN) {
sorted -= sorted.last
}
})
sorted.iterator // takes an iterator in and hands an iterator back
})
println(result2.collect().toBuffer)
}
}
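The bounded-TreeSet trick above also works outside Spark. A minimal self-contained sketch (plain Scala, with made-up teacher names and counts) shows the idea of keeping at most topN elements while streaming over the data:

import scala.collection.mutable

object TreeSetTopNDemo {
  def main(args: Array[String]): Unit = {
    val topN = 2
    // sort descending by count; note that elements this Ordering considers equal are
    // deduplicated by the TreeSet, so real keys should break ties as well
    val byCountDesc: Ordering[(String, Int)] = Ordering[Int].on[(String, Int)](t => -t._2)
    val sorted = new mutable.TreeSet[(String, Int)]()(byCountDesc)
    Seq(("laozhang", 9), ("laoli", 15), ("laowang", 6), ("laozhao", 21)).foreach { record =>
      sorted += record
      if (sorted.size > topN) sorted -= sorted.last // evict the current minimum
    }
    println(sorted.toList) // List((laozhao,21), (laoli,15))
  }
}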
// Custom partitioner: partitioning is by subject, so the subjects are passed in
class SubjectPartitioner2(val subject: Array[String]) extends Partitioner {
// the primary constructor runs once each time the partitioner is instantiated with new
private val nameToNum: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
// walk over the subjects and record each subject together with its partition id in the HashMap
var i = 0
for (sub <- subject) {
nameToNum(sub) = i // in Scala this puts (subject -> 0, 1, 2, ...) into the map
i += 1
}
// e.g. Array(("javaee", 0), ("php", 1), ("bigdata", 2))
override def numPartitions: Int = subject.length // one partition per subject
override def getPartition(key: Any): Int = {
// the key comes in as Any, but we know it is a (String, String) tuple,
// so it has to be cast
val tuple: (String, String) = key.asInstanceOf[(String, String)]
// extract the subject
val subject: String = tuple._1
// look up the partition id for this subject
nameToNum(subject)
}
}

Passing in a new comparison rule: repartitionAndSortWithinPartitions

object GroupTopNWithRepartitionAndSort { // hypothetical object name; the original header is not shown
def main(args: Array[String]): Unit = {
// decide whether to run locally
val isLocal: Boolean = args(0).toBoolean
// build the configuration
val conf: SparkConf = new SparkConf().setAppName(this.getClass.getCanonicalName)
// run locally if requested
if (isLocal) {
conf.setMaster("local[*]")
}
// create the SparkContext
val sc: SparkContext = new SparkContext(conf)
// point at the input file and create the RDD
val lines: RDD[String] = sc.textFile(args(1))
// the top N parameter (read here, though this variant does not apply it further)
val topN: Int = args(2).toInt
// shape the data and aggregate the counts
val reduced: RDD[((String, String), Int)] = lines.map(it => {
val fields: Array[String] = it.split("/")
val teacher: String = fields(3)
val subject: String = fields(2).split("[.]")(0)
((subject, teacher), 1)
}).reduceByKey(_ + _)
// collect the distinct subjects on the driver
val subjects: Array[String] = reduced.map(_._1._1).distinct().collect()
// instantiate the custom partitioner
val subPartitioner: SubjectPartitioner6 = new SubjectPartitioner6(subjects)
// repartitionAndSortWithinPartitions repartitions and sorts inside each partition in one shuffle
// the fields that take part in the comparison must live in the key, so move the count into the key
val keyed: RDD[((String, String, Int), Null)] = reduced.map(t => ((t._1._1, t._1._2, t._2), null))
// define a comparison rule on the new key: by count, descending
implicit val orderingRules: Ordering[(String, String, Int)] = new Ordering[(String, String, Int)] {
override def compare(x: (String, String, Int), y: (String, String, Int)): Int = {
-(x._3 - y._3) // descending by count
}
}
// note that tuples compare field by field by default: the first field is compared first, and only
// if it is equal does the comparison move on to the next field, and so on until an unequal field
// is found; the explicit Ordering above replaces that default behaviour
// for simple keys the same rule can be written with on(), which also ends up calling compare:
// implicit val rules = Ordering[Int].on[(String, String, Int)](t => -t._3)
// repartitionAndSortWithinPartitions both shuffles and sorts within each partition
val result: RDD[((String, String, Int), Null)] = keyed.repartitionAndSortWithinPartitions(subPartitioner)
println(result.collect().toBuffer)
}
}
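The two ways of writing the comparison rule, the explicit new Ordering above and the commented-out Ordering[Int].on form, are equivalent. A small stand-alone sketch with made-up values:

object OrderingDemo {
  def main(args: Array[String]): Unit = {
    val data = List(("bigdata", "laozhang", 9), ("javaee", "laoli", 15), ("php", "laowang", 6))
    // explicit Ordering: implement compare yourself
    val explicitRule: Ordering[(String, String, Int)] = new Ordering[(String, String, Int)] {
      override def compare(x: (String, String, Int), y: (String, String, Int)): Int = y._3 - x._3
    }
    // the same rule written with on(), which delegates to Ordering[Int].compare
    val onRule: Ordering[(String, String, Int)] = Ordering[Int].on[(String, String, Int)](t => -t._3)
    println(data.sorted(explicitRule)) // List((javaee,laoli,15), (bigdata,laozhang,9), (php,laowang,6))
    println(data.sorted(onRule))       // same result
  }
}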
// Custom partitioner: partitioning is by subject, so the subjects are passed in (here the keys are (subject, teacher, count) tuples)
class SubjectPartitioner6(val subject: Array[String]) extends Partitioner {
// the primary constructor runs once each time the partitioner is instantiated with new
private val nameToNum: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
// walk over the subjects and record each subject together with its partition id in the HashMap
var i = 0
for (sub <- subject) {
nameToNum(sub) = i // puts (subject -> 0, 1, 2, ...) into the map
i += 1
}
// e.g. Array(("javaee", 0), ("php", 1), ("bigdata", 2))
override def numPartitions: Int = subject.length // one partition per subject
override def getPartition(key: Any): Int = {
// the key comes in as Any; here it is the (subject, teacher, count) tuple,
// so it has to be cast
val tuple: (String, String, Int) = key.asInstanceOf[(String, String, Int)]
// extract the subject
val subject: String = tuple._1
// look up the partition id for this subject
nameToNum(subject)
}
}

Implementing repartitionAndSortWithinPartitions with a ShuffledRDD
object GroupTopNWithShuffledRDD { // hypothetical object name; the original header is not shown
def main(args: Array[String]): Unit = {
// decide whether to run locally
val isLocal: Boolean = args(0).toBoolean
// build the configuration
val conf: SparkConf = new SparkConf().setAppName(this.getClass.getCanonicalName)
// run locally if requested
if (isLocal) {
conf.setMaster("local[*]")
}
// create the SparkContext
val sc: SparkContext = new SparkContext(conf)
// point at the input file and create the RDD
val lines: RDD[String] = sc.textFile(args(1))
// the top N parameter (read here, though this variant does not apply it further)
val topN: Int = args(2).toInt
// shape the data and aggregate the counts
val reduced: RDD[((String, String), Int)] = lines.map(line => {
val fields: Array[String] = line.split("/")
val teacher: String = fields(3)
val subject: String = fields(2).split("[.]")(0)
((subject, teacher), 1)
}).reduceByKey(_ + _)
// collect the distinct subjects on the driver
val subjects: Array[String] = reduced.map(_._1._1).distinct().collect()
// move the count into the key so it can take part in the sort
val mapped: RDD[((String, String, Int), Null)] = reduced.map(t => {
((t._1._1, t._1._2, t._2), null)
})
// build the custom partitioner
val partitioner: SubjectPartitioner7 = new SubjectPartitioner7(subjects)
// define a comparison rule; new Ordering means implementing its compare method
val rulesOrdering: Ordering[(String, String, Int)] = new Ordering[(String, String, Int)] {
override def compare(x: (String, String, Int), y: (String, String, Int)): Int = {
-(x._3 - y._3) // descending by count
}
}
// a new ShuffledRDD reproduces what repartitionAndSortWithinPartitions does
// the key is a Tuple3 and the value is null; combining null with null is still null
// remember that ShuffledRDD takes the key type, the value type and the combiner type;
// on its own it only moves the data into the right partitions, it does not sort,
// so the key ordering has to be set explicitly
val shuffled: ShuffledRDD[(String, String, Int), Null, Null] =
new ShuffledRDD[(String, String, Int), Null, Null](mapped, partitioner)
shuffled.setKeyOrdering(rulesOrdering)
// run the job: the data is shuffled by the partitioner and sorted within each partition
println(shuffled.collect().toBuffer)
}
}
// Custom partitioner: whatever we partition by (the subjects) is what gets passed in
class SubjectPartitioner7(val subjects: Array[String]) extends Partitioner {
// the primary constructor defines the partitioning rule and runs once per new
// a HashMap stores each subject together with its partition id
private val rules: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
var i = 0
for (sub <- subjects) {
rules(sub) = i
i += 1
}
override def numPartitions: Int = subjects.length
override def getPartition(key: Any): Int = {
val tuple: (String, String, Int) = key.asInstanceOf[(String, String, Int)]
val subject: String = tuple._1
rules(subject)
}
}
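For reference, this is essentially what Spark's own repartitionAndSortWithinPartitions does inside OrderedRDDFunctions; a simplified sketch of the shape, not the literal source:

// simplified sketch of the Spark implementation, not the exact source
def repartitionAndSortWithinPartitions(partitioner: Partitioner): RDD[(K, V)] =
  new ShuffledRDD[K, V, V](self, partitioner).setKeyOrdering(ordering)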