This operator repartitions the parent RDD; only the target number of partitions needs to be passed in.
The source code shows that repartition is simply a call to
coalesce(numPartitions, shuffle = true).
Source snippet:
def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope {
  coalesce(numPartitions, shuffle = true)
}
Because shuffle = true, a shuffle is always produced during repartitioning, regardless of whether the child RDD's partition count is greater than, equal to, or less than the parent RDD's. The code below verifies this behavior.
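Why does shuffle = true always shuffle? A paraphrased sketch of the idea behind coalesce's shuffle path (not a verbatim copy of the Spark source): each element is tagged with a synthetic key that walks round-robin through the target partitions from a per-input-partition random starting offset, and Spark then hash-partitions on that key through a ShuffledRDD:

import scala.util.Random

// Sketch of the key assignment used by coalesce(numPartitions, shuffle = true):
// each input partition picks a random starting target (seeded by its own index)
// and then assigns its elements round-robin across the target partitions.
def distributePartition[T](index: Int, items: Iterator[T],
                           numPartitions: Int): Iterator[(Int, T)] = {
  var position = new Random(index).nextInt(numPartitions)
  items.map { t =>
    position += 1
    (position % numPartitions, t) // synthetic key = target partition
  }
}

The random starting offset also explains why the placement differs between runs, and why a target partition can come up empty when the partition count grows, as the outputs below show.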
1. Java API version
Code example:
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("test");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    JavaRDD<String> rdd1 = jsc.parallelize(Arrays.asList(
            "spark1", "spark2", "spark3",
            "spark4", "spark5", "spark6",
            "spark7", "spark8", "spark9"),
            3);
    // Tag every element with its parent partition index so the
    // redistribution is visible after repartitioning.
    JavaRDD<String> rdd2 = rdd1.mapPartitionsWithIndex(
            (Function2<Integer, Iterator<String>, Iterator<String>>) (index, iterator) -> {
                List<String> list = new ArrayList<>();
                while (iterator.hasNext()) {
                    String next = iterator.next();
                    System.out.println("parent rdd1 partition index = [" + index + "], value = " + next);
                    list.add("parent rdd1 partition index = [" + index + "], value = " + next);
                }
                return list.iterator();
            }, true);
    // Repartition rdd2 to fewer partitions than the parent RDD.
    JavaRDD<String> rdd3 = rdd2.repartition(2);
    JavaRDD<String> rdd4 = rdd3.mapPartitionsWithIndex(
            (Function2<Integer, Iterator<String>, Iterator<String>>) (index, iterator) -> {
                System.out.println("current partition index = [" + index + "]");
                // Buffer the elements: the while loop consumes the iterator,
                // so returning it directly would yield an empty partition.
                List<String> list = new ArrayList<>();
                while (iterator.hasNext()) {
                    String next = iterator.next();
                    System.out.println("child rdd3 partition index = [" + index + "], value = " + next);
                    list.add(next);
                }
                return list.iterator();
            }, true);
    rdd4.collect();
    jsc.stop();
}
Run output:
parent rdd1 partition index = [0], value = spark1
parent rdd1 partition index = [0], value = spark2
parent rdd1 partition index = [0], value = spark3
parent rdd1 partition index = [1], value = spark4
parent rdd1 partition index = [1], value = spark5
parent rdd1 partition index = [1], value = spark6
parent rdd1 partition index = [2], value = spark7
parent rdd1 partition index = [2], value = spark8
parent rdd1 partition index = [2], value = spark9
current partition index = [0]
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark1
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark3
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark4
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark6
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark7
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark9
current partition index = [1]
child rdd3 partition index = [1], value = parent rdd1 partition index = [0], value = spark2
child rdd3 partition index = [1], value = parent rdd1 partition index = [1], value = spark5
child rdd3 partition index = [1], value = parent rdd1 partition index = [2], value = spark8
Conclusion: data from the same parent RDD partition is spread across different child RDD partitions, so the repartitioning clearly produced a shuffle.
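Beyond eyeballing the printed output, the shuffle can also be confirmed from the RDD lineage. A minimal sketch in the Scala API (JavaRDD exposes the same toDebugString method; the exact RDD names and ids vary by Spark version):

// Printing the lineage of the repartitioned RDD should reveal a ShuffledRDD,
// i.e. a stage boundary, between the parent and child partitions.
println(rdd3.toDebugString)
// Expected shape (abbreviated; the "+-" marks the shuffle boundary):
// (2) MapPartitionsRDD[...]
//  |  CoalescedRDD[...]
//  |  ShuffledRDD[...]
//  +-(3) MapPartitionsRDD[...]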
#================================================================================
//Repartition rdd2 to the same number of partitions as the parent RDD; run output:
current partition index = [0]
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark3
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark6
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark8
current partition index = [1]
child rdd3 partition index = [1], value = parent rdd1 partition index = [0], value = spark1
child rdd3 partition index = [1], value = parent rdd1 partition index = [1], value = spark4
child rdd3 partition index = [1], value = parent rdd1 partition index = [2], value = spark9
current partition index = [2]
child rdd3 partition index = [2], value = parent rdd1 partition index = [0], value = spark2
child rdd3 partition index = [2], value = parent rdd1 partition index = [1], value = spark5
child rdd3 partition index = [2], value = parent rdd1 partition index = [2], value = spark7
#================================================================================
//Repartition rdd2 to 4 partitions, more than the parent RDD; output from one of several runs:
current partition index = [2]   # this partition came up empty
current partition index = [0]
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark2
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark5
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark8
current partition index = [1]
child rdd3 partition index = [1], value = parent rdd1 partition index = [0], value = spark3
child rdd3 partition index = [1], value = parent rdd1 partition index = [1], value = spark6
child rdd3 partition index = [1], value = parent rdd1 partition index = [2], value = spark9
current partition index = [3]
child rdd3 partition index = [3], value = parent rdd1 partition index = [0], value = spark1
child rdd3 partition index = [3], value = parent rdd1 partition index = [1], value = spark4
child rdd3 partition index = [3], value = parent rdd1 partition index = [2], value = spark7
#================================================================================
//Repartition rdd2 to 5 partitions, more than the parent RDD; output from one of several runs:
current partition index = [0]
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark8
current partition index = [1]
child rdd3 partition index = [1], value = parent rdd1 partition index = [0], value = spark1
child rdd3 partition index = [1], value = parent rdd1 partition index = [1], value = spark4
child rdd3 partition index = [1], value = parent rdd1 partition index = [2], value = spark9
current partition index = [2]
child rdd3 partition index = [2], value = parent rdd1 partition index = [0], value = spark2
child rdd3 partition index = [2], value = parent rdd1 partition index = [1], value = spark5
current partition index = [3]
child rdd3 partition index = [3], value = parent rdd1 partition index = [0], value = spark3
child rdd3 partition index = [3], value = parent rdd1 partition index = [1], value = spark6
current partition index = [4]
child rdd3 partition index = [4], value = parent rdd1 partition index = [2], value = spark7
2. Scala API version
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

def main(args: Array[String]): Unit = {
  val sparkSession = SparkSession.builder
    .master("local")
    .appName("appName")
    .getOrCreate()
  val sc = sparkSession.sparkContext
  val rdd1: RDD[String] = sc.parallelize(List(
    "spark1", "spark2", "spark3",
    "spark4", "spark5", "spark6",
    "spark7", "spark8", "spark9"),
    3)
  // Tag every element with its parent partition index so the
  // redistribution is visible after repartitioning.
  val rdd2: RDD[String] = rdd1.mapPartitionsWithIndex {
    (index, iter) => {
      var result = List[String]()
      while (iter.hasNext) {
        val str: String = iter.next()
        println("parent rdd1 partition index = [" + index + "], value = " + str)
        result = result :+ ("parent rdd1 partition index = [" + index + "], value = " + str)
      }
      result.iterator
    }
  }
  val rdd3: RDD[String] = rdd2.repartition(2)
  val rdd4: RDD[String] = rdd3.mapPartitionsWithIndex {
    (index, iter) => {
      println("current partition index = [" + index + "]")
      // Buffer the elements: the while loop consumes the iterator,
      // so returning it directly would yield an empty partition.
      var result = List[String]()
      while (iter.hasNext) {
        val str: String = iter.next()
        println("child rdd3 partition index = [" + index + "], value = " + str)
        result = result :+ str
      }
      result.iterator
    }
  }
  rdd4.collect()
  sc.stop()
}
Run output:
parent rdd1 partition index = [0], value = spark1
parent rdd1 partition index = [0], value = spark2
parent rdd1 partition index = [0], value = spark3
parent rdd1 partition index = [1], value = spark4
parent rdd1 partition index = [1], value = spark5
parent rdd1 partition index = [1], value = spark6
parent rdd1 partition index = [2], value = spark7
parent rdd1 partition index = [2], value = spark8
parent rdd1 partition index = [2], value = spark9
current partition index = [0]
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark1
child rdd3 partition index = [0], value = parent rdd1 partition index = [0], value = spark3
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark4
child rdd3 partition index = [0], value = parent rdd1 partition index = [1], value = spark6
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark7
child rdd3 partition index = [0], value = parent rdd1 partition index = [2], value = spark9
current partition index = [1]
child rdd3 partition index = [1], value = parent rdd1 partition index = [0], value = spark2
child rdd3 partition index = [1], value = parent rdd1 partition index = [1], value = spark5
child rdd3 partition index = [1], value = parent rdd1 partition index = [2], value = spark8
#================================================================================
//Repartitioning rdd2 to more partitions than, or the same number as, the parent RDD produces analogous results.
Conclusion: no matter whether the child RDD's new partition count is greater than, equal to, or less than the parent RDD's, the repartitioning always produces a shuffle.
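By contrast, coalesce with its default shuffle = false merges whole parent partitions rather than redistributing individual records, so reducing the partition count this way triggers no shuffle. A minimal sketch continuing the Scala example above (the exact grouping of parent partitions into child partitions may vary):

// coalesce(2) without shuffle: each child partition is a union of whole
// parent partitions, so elements tagged with the same parent index stay together.
val rdd3NoShuffle: RDD[String] = rdd2.coalesce(2)
rdd3NoShuffle
  .mapPartitionsWithIndex { (index, iter) =>
    iter.map(str => s"child partition [$index] <- $str")
  }
  .collect()
  .foreach(println)

This is why coalesce is the usual choice for merely shrinking the partition count, while repartition (i.e. coalesce with shuffle = true) is needed to grow it or to rebalance skewed data.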