This operator repartitions the parent RDD; only the target number of partitions needs to be passed in.
The source code shows that repartition is simply a call to
coalesce(numPartitions, shuffle = true).
Source snippet:
def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope {
  coalesce(numPartitions, shuffle = true)
}
Because shuffle = true, a shuffle is always produced during repartitioning, regardless of whether the child RDD's partition count is greater than, equal to, or less than the parent RDD's. The code below verifies this behavior.
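Why does shuffle = true always shuffle? A paraphrased sketch of the idea behind coalesce's shuffle path (not a verbatim copy of the Spark source): each element is tagged with a synthetic key that walks round-robin through the target partitions from a per-input-partition random starting offset, and Spark then hash-partitions on that key through a ShuffledRDD:

import scala.util.Random

// Sketch of the key assignment used by coalesce(numPartitions, shuffle = true):
// each input partition picks a random starting target (seeded by its own index)
// and then assigns its elements round-robin across the target partitions.
def distributePartition[T](index: Int, items: Iterator[T],
                           numPartitions: Int): Iterator[(Int, T)] = {
  var position = new Random(index).nextInt(numPartitions)
  items.map { t =>
    position += 1
    (position % numPartitions, t) // synthetic key = target partition
  }
}

The random starting offset also explains why the placement differs between runs, and why a target partition can come up empty when the partition count grows, as the outputs below show.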
1. Java API version
Code example:
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("test");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    JavaRDD<String> rdd1 = jsc.parallelize(Arrays.asList(
            "spark1", "spark2", "spark3",
            "spark4", "spark5", "spark6",
            "spark7", "spark8", "spark9"),
            3);
    // Tag every element with its parent partition index so the
    // redistribution is visible after repartitioning.
    JavaRDD<String> rdd2 = rdd1.mapPartitionsWithIndex(
            (Function2<Integer, Iterator<String>, Iterator<String>>) (index, iterator) -> {
                List<String> list = new ArrayList<>();
                while (iterator.hasNext()) {
                    String next = iterator.next();
                    System.out.println("parent rdd1 partition index = [" + index + "], value = " + next);
                    list.add("parent rdd1 partition index = [" + index + "], value = " + next);
                }
                return list.iterator();
            }, true);
    // Repartition rdd2 to fewer partitions than the parent RDD.
    JavaRDD<String> rdd3 = rdd2.repartition(2);
    JavaRDD<String> rdd4 = rdd3.mapPartitionsWithIndex(
            (Function2<Integer, Iterator<String>, Iterator<String>>) (index, iterator) -> {
                System.out.println("current partition index = [" + index + "]");
                // Buffer the elements: the while loop consumes the iterator,
                // so returning it directly would yield an empty partition.
                List<String> list = new ArrayList<>();
                while (iterator.hasNext()) {
                    String next = iterator.next();
                    System.out.println("child rdd3 partition index = [" + index + "], value = " + next);
                    list.add(next);
                }
                return list.iterator();
            }, true);
    rdd4.collect();
    jsc.stop();
}
Run output:
parent rdd1 partition index = [0], value = spark1
parent rdd1 partition index = [0], value = spark2
parent rdd1 partition index = [0], value = spark3
parent rdd1 partition index = [1], value = spark4
parent rdd1 partition index = [1], value = spark5
parent rdd1 partition index = [1], value = spark6
parent rdd1 partition index = [2], value = spark7
parent rdd1 partition index = [2], value = spark8
parent rdd1 partition index = [2], value = spark9
current partition index = [0]
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark1
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark3
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark4
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark6
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark7
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark9
current partition index = [1]
child rdd3 partition index = [1], value = parent rdd1 partition index = [0], value = spark2
child rdd3 partition index = [1], value = parent rdd1 partition index = [1], value = spark5
child rdd3 partition index = [1], value = parent rdd1 partition index = [2], value = spark8
Conclusion: data from the same parent RDD partition is spread across different child RDD partitions, so the repartitioning clearly produced a shuffle.
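Beyond eyeballing the printed output, the shuffle can also be confirmed from the RDD lineage. A minimal sketch in the Scala API (JavaRDD exposes the same toDebugString method; the exact RDD names and ids vary by Spark version):

// Printing the lineage of the repartitioned RDD should reveal a ShuffledRDD,
// i.e. a stage boundary, between the parent and child partitions.
println(rdd3.toDebugString)
// Expected shape (abbreviated; the "+-" marks the shuffle boundary):
// (2) MapPartitionsRDD[...]
//  |  CoalescedRDD[...]
//  |  ShuffledRDD[...]
//  +-(3) MapPartitionsRDD[...]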
#================================================================================
//Repartition rdd2 to the same number of partitions as the parent RDD; run output:
current partition index = [0]
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark3
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark6
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark8
current partition index = [1]
child rdd3 partition index = [1], value = parent rdd1 partition index = [0], value = spark1
child rdd3 partition index = [1], value = parent rdd1 partition index = [1], value = spark4
child rdd3 partition index = [1], value = parent rdd1 partition index = [2], value = spark9
current partition index = [2]
child rdd3 partition index = [2], value = parent rdd1 partition index = [0], value = spark2
child rdd3 partition index = [2], value = parent rdd1 partition index = [1], value = spark5
child rdd3 partition index = [2], value = parent rdd1 partition index = [2], value = spark7
#================================================================================
//Repartition rdd2 to 4 partitions, more than the parent RDD; output from one of several runs:
current partition index = [2]   # this partition came up empty
current partition index = [0]
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark2
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark5
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark8
current partition index = [1]
child rdd3 partition index = [1], value = parent rdd1 partition index = [0], value = spark3
child rdd3 partition index = [1], value = parent rdd1 partition index = [1], value = spark6
child rdd3 partition index = [1], value = parent rdd1 partition index = [2], value = spark9
current partition index = [3]
child rdd3 partition index = [3], value = parent rdd1 partition index = [0], value = spark1
child rdd3 partition index = [3], value = parent rdd1 partition index = [1], value = spark4
child rdd3 partition index = [3], value = parent rdd1 partition index = [2], value = spark7
#================================================================================
//Repartition rdd2 to 5 partitions, more than the parent RDD; output from one of several runs:
current partition index = [0]
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark8
current partition index = [1]
child rdd3 partition index = [1], value = parent rdd1 partition index = [0], value = spark1
child rdd3 partition index = [1], value = parent rdd1 partition index = [1], value = spark4
child rdd3 partition index = [1], value = parent rdd1 partition index = [2], value = spark9
current partition index = [2]
child rdd3 partition index = [2], value = parent rdd1 partition index = [0], value = spark2
child rdd3 partition index = [2], value = parent rdd1 partition index = [1], value = spark5
current partition index = [3]
child rdd3 partition index = [3], value = parent rdd1 partition index = [0], value = spark3
child rdd3 partition index = [3], value = parent rdd1 partition index = [1], value = spark6
current partition index = [4]
child rdd3 partition index = [4], value = parent rdd1 partition index = [2], value = spark7
2. Scala API version
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

def main(args: Array[String]): Unit = {
  val sparkSession = SparkSession.builder
    .master("local")
    .appName("appName")
    .getOrCreate()
  val sc = sparkSession.sparkContext
  val rdd1: RDD[String] = sc.parallelize(List(
    "spark1", "spark2", "spark3",
    "spark4", "spark5", "spark6",
    "spark7", "spark8", "spark9"),
    3)
  // Tag every element with its parent partition index so the
  // redistribution is visible after repartitioning.
  val rdd2: RDD[String] = rdd1.mapPartitionsWithIndex {
    (index, iter) => {
      var result = List[String]()
      while (iter.hasNext) {
        val str: String = iter.next()
        println("parent rdd1 partition index = [" + index + "], value = " + str)
        result = result :+ ("parent rdd1 partition index = [" + index + "], value = " + str)
      }
      result.iterator
    }
  }
  val rdd3: RDD[String] = rdd2.repartition(2)
  val rdd4: RDD[String] = rdd3.mapPartitionsWithIndex {
    (index, iter) => {
      println("current partition index = [" + index + "]")
      // Buffer the elements: the while loop consumes the iterator,
      // so returning it directly would yield an empty partition.
      var result = List[String]()
      while (iter.hasNext) {
        val str: String = iter.next()
        println("child rdd3 partition index = [" + index + "], value = " + str)
        result = result :+ str
      }
      result.iterator
    }
  }
  rdd4.collect()
  sc.stop()
}
Run output:
parent rdd1 partition index = [0], value = spark1
parent rdd1 partition index = [0], value = spark2
parent rdd1 partition index = [0], value = spark3
parent rdd1 partition index = [1], value = spark4
parent rdd1 partition index = [1], value = spark5
parent rdd1 partition index = [1], value = spark6
parent rdd1 partition index = [2], value = spark7
parent rdd1 partition index = [2], value = spark8
parent rdd1 partition index = [2], value = spark9
current partition index = [0]
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark1
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark3
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark4
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark6
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark7
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark9
current partition index = [1]
child rdd3 partition index = [1], value = parent rdd1 partition index = [0], value = spark2
child rdd3 partition index = [1], value = parent rdd1 partition index = [1], value = spark5
child rdd3 partition index = [1], value = parent rdd1 partition index = [2], value = spark8
#================================================================================
//Repartitioning rdd2 to more partitions than, or the same number as, the parent RDD produces analogous results.
Conclusion: no matter whether the child RDD's new partition count is greater than, equal to, or less than the parent RDD's, the repartitioning always produces a shuffle.
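By contrast, coalesce with its default shuffle = false merges whole parent partitions rather than redistributing individual records, so reducing the partition count this way triggers no shuffle. A minimal sketch continuing the Scala example above (the exact grouping of parent partitions into child partitions may vary):

// coalesce(2) without shuffle: each child partition is a union of whole
// parent partitions, so elements tagged with the same parent index stay together.
val rdd3NoShuffle: RDD[String] = rdd2.coalesce(2)
rdd3NoShuffle
  .mapPartitionsWithIndex { (index, iter) =>
    iter.map(str => s"child partition [$index] <- $str")
  }
  .collect()
  .foreach(println)

This is why coalesce is the usual choice for merely shrinking the partition count, while repartition (i.e. coalesce with shuffle = true) is needed to grow it or to rebalance skewed data.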