randomSplit():
def randomSplit(weights: Array[Double], seed: Long = Utils.random.nextLong): Array[RDD[T]]
This function splits one RDD into multiple RDDs according to the given weights.
The weights parameter is an Array[Double]; the second parameter is the random seed and can usually be left at its default.
The result of randomSplit is an array of RDDs.
The weights should sum to 1; if they do not, Spark normalizes them.
scala> val data = sc.makeRDD(1 to 10, 10)
data: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[10] at makeRDD at <console>:24

scala> data.collect
res4: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

scala> val split_data = data.randomSplit(Array(0.1, 0.2, 0.3, 0.4))
split_data: Array[org.apache.spark.rdd.RDD[Int]] = Array(MapPartitionsRDD[11] at randomSplit at <console>:26, MapPartitionsRDD[12] at randomSplit at <console>:26, MapPartitionsRDD[13] at randomSplit at <console>:26, MapPartitionsRDD[14] at randomSplit at <console>:26)

scala> split_data.size
res5: Int = 4

scala> split_data(0).collect
res6: Array[Int] = Array(3)

scala> split_data(1).collect
res7: Array[Int] = Array()

scala> split_data(2).collect
res8: Array[Int] = Array(9)

scala> split_data(3).collect
res9: Array[Int] = Array(1, 2, 4, 5, 6, 7, 8, 10)

scala> val splitRDD = data.randomSplit(Array(0.5, 0.5))
splitRDD: Array[org.apache.spark.rdd.RDD[Int]] = Array(MapPartitionsRDD[15] at randomSplit at <console>:26, MapPartitionsRDD[16] at randomSplit at <console>:26)

scala> splitRDD.size
res10: Int = 2

scala> splitRDD(0).collect
res11: Array[Int] = Array(3, 4, 5, 7, 9)

scala> splitRDD(1).collect
res12: Array[Int] = Array(1, 2, 6, 8, 10)
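A common use of randomSplit is carving a dataset into training and test sets. The sketch below is illustrative, assuming a SparkContext named sc is available as in the session above; passing an explicit seed makes the split reproducible across runs, and the names trainRDD and testRDD are just examples.

// Minimal sketch: reproducible 80/20 split, assuming `sc` exists
val data = sc.makeRDD(1 to 100, 4)

// A fixed seed gives the same split every time the job runs
val Array(trainRDD, testRDD) = data.randomSplit(Array(0.8, 0.2), seed = 42L)

println(s"train count = ${trainRDD.count()}, test count = ${testRDD.count()}")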
glom()
def glom(): RDD[Array[T]]
glom collects the elements of type T in each partition of the RDD into an Array[T], so each partition yields a single array element.
scala> val glomRDD = sc.makeRDD(1 to 10, 4)
glomRDD: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[17] at makeRDD at <console>:24

scala> glomRDD.partitions.size
res13: Int = 4

scala> glomRDD.glom().collect
res14: Array[Array[Int]] = Array(Array(1, 2), Array(3, 4, 5), Array(6, 7), Array(8, 9, 10))
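Because glom turns each partition into a single array, it is handy for per-partition inspection. The sketch below is a minimal example under the same assumption of an existing sc; it computes a per-partition maximum, and the name partMax is illustrative.

// Minimal sketch: per-partition maximum via glom, assuming `sc` exists
val nums = sc.makeRDD(1 to 10, 4)

// Each partition becomes one Array[Int]; empty partitions yield empty arrays,
// so filter them out before calling max
val partMax = nums.glom()
  .filter(_.nonEmpty)
  .map(_.max)
  .collect()

println(partMax.mkString(", "))   // with the partitioning above: 2, 5, 7, 10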