// Package and imports assumed so this standalone excerpt is self-contained; in the Spark
// codebase these definitions live in separate files (Partition is in org.apache.spark).
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{SparkContext, SparkException, TaskContext}
import org.apache.spark.storage.RDDBlockId

/**
 * An identifier for a partition in an RDD.
 */
trait Partition extends Serializable {
  /**
   * Get the partition's index within its parent RDD.
   */
  def index: Int

  // A better default implementation of hashCode: use the partition index
  override def hashCode(): Int = index

  override def equals(other: Any): Boolean = super.equals(other)
}
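
// Illustrative sketch (not part of Spark): a concrete Partition for a hypothetical
// range-based RDD. Only `index` is required by the trait; `start` and `end` are assumed
// extra fields that a real partition might carry to describe the data it covers.
private[spark] class ExampleRangePartition(
    override val index: Int,
    val start: Long,
    val end: Long)
  extends Partition
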
/**
 * An RDD partition used to recover checkpointed data.
 */
private[spark] class CheckpointRDDPartition(val index: Int) extends Partition

/**
 * An RDD that recovers checkpointed data from storage.
 */
private[spark] abstract class CheckpointRDD[T: ClassTag](sc: SparkContext)
  extends RDD[T](sc, Nil) {

  // CheckpointRDD should not be checkpointed again
  override def doCheckpoint(): Unit = { }
  override def checkpoint(): Unit = { }
  override def localCheckpoint(): this.type = this

  // Note: There is a bug in MiMa that complains about `AbstractMethodProblem`s in the
  // base [[org.apache.spark.rdd.RDD]] class if we do not override the following methods.
  // scalastyle:off
  protected override def getPartitions: Array[Partition] = ???
  override def compute(p: Partition, tc: TaskContext): Iterator[T] = ???
  // scalastyle:on
}

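
// Usage sketch (an assumption, not part of the original file): how reliable checkpointing
// is driven from user code. The checkpoint directory path and the `CheckpointUsageExample`
// name are illustrative; the methods used (`setCheckpointDir`, `checkpoint`, `count`,
// `isCheckpointed`) are the public Spark API.
private[spark] object CheckpointUsageExample {
  def run(sc: SparkContext): Unit = {
    sc.setCheckpointDir("/tmp/spark-checkpoints")   // where checkpoint files are written
    val words = sc.parallelize(Seq("a", "b", "c"))
    words.checkpoint()   // mark for checkpointing; nothing is written yet
    words.count()        // the first action materializes the RDD and persists the checkpoint
    assert(words.isCheckpointed)
  }
}
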
/**
* A dummy CheckpointRDD that exists to provide informative error messages during failures.
*
* This is simply a placeholder because the original checkpointed RDD is expected to be
* fully cached. Only if an executor fails or if the user explicitly unpersists the original
* RDD will Spark ever attempt to compute this CheckpointRDD. When this happens, however,
* we must provide an informative error message.
*
* @param sc the active SparkContext
* @param rddId the ID of the checkpointed RDD
* @param numPartitions the number of partitions in the checkpointed RDD
*/
private[spark] class LocalCheckpointRDD[T: ClassTag](
    sc: SparkContext,
    rddId: Int,
    numPartitions: Int)
  extends CheckpointRDD[T](sc) {

  def this(rdd: RDD[T]) = this(rdd.context, rdd.id, rdd.partitions.length)

  protected override def getPartitions: Array[Partition] = {
    (0 until numPartitions).toArray.map { i => new CheckpointRDDPartition(i) }
  }

  /**
   * Throw an exception indicating that the relevant block is not found.
   *
   * This should only be called if the original RDD is explicitly unpersisted or if an
   * executor is lost. Under normal circumstances, however, the original RDD (our child)
   * is expected to be fully cached, so all partitions should already be computed and
   * available in block storage.
   */
  override def compute(partition: Partition, context: TaskContext): Iterator[T] = {
    throw new SparkException(
      s"Checkpoint block ${RDDBlockId(rddId, partition.index)} not found! Either the executor " +
      s"that originally checkpointed this partition is no longer alive, or the original RDD is " +
      s"unpersisted. If this problem persists, you may consider using `rdd.checkpoint()` " +
      s"instead, which is slower than local checkpointing but more fault-tolerant.")
  }
}
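
// Usage sketch (an assumption, not part of the original file): local checkpointing truncates
// the lineage and relies on the executors' cached blocks instead of reliable storage. If a
// cached block is later lost (executor failure or an explicit unpersist), recomputation falls
// through to LocalCheckpointRDD.compute above and the SparkException is thrown.
private[spark] object LocalCheckpointUsageExample {
  def run(sc: SparkContext): Unit = {
    val nums = sc.parallelize(1 to 1000).map(_ * 2)
    nums.localCheckpoint()   // truncate lineage; data will live only in executor block storage
    nums.count()             // first action caches the partitions and finalizes the checkpoint
    // From here on, losing an executor that holds cached blocks of `nums` surfaces the
    // "Checkpoint block ... not found!" error from compute() above.
  }
}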