
Commit b14cd4b

fix rdd repartition
1 parent f825847 commit b14cd4b

File tree

6 files changed: +68 -14 lines changed


core/src/main/scala/org/apache/spark/MapOutputTracker.scala

Lines changed: 11 additions & 0 deletions
@@ -434,6 +434,17 @@ private[spark] class MapOutputTrackerMaster(
     }
   }
 
+  /** Unregister all map output information of the given shuffle. */
+  def unregisterAllMapOutput(shuffleId: Int) {
+    shuffleStatuses.get(shuffleId) match {
+      case Some(shuffleStatus) =>
+        shuffleStatus.removeOutputsByFilter(x => true)
+        incrementEpoch()
+      case None =>
+        throw new SparkException("unregisterAllMapOutput called for nonexistent shuffle ID")
+    }
+  }
+
   /** Unregister shuffle data */
   def unregisterShuffle(shuffleId: Int) {
     shuffleStatuses.remove(shuffleId).foreach { shuffleStatus =>
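
For context, a minimal sketch of how the two unregister paths differ on the driver. MapOutputTrackerMaster, BlockManagerId, and the two unregister methods are real Spark names; the helper function wrapping them is a hypothetical illustration, not commit code.

import org.apache.spark.MapOutputTrackerMaster
import org.apache.spark.storage.BlockManagerId

// Hypothetical helper: on a fetch failure for `shuffleId`, either drop the one
// failed map output, or wipe the whole shuffle so every map task reruns.
def dropOutputs(
    tracker: MapOutputTrackerMaster,
    shuffleId: Int,
    mapId: Int,
    bmAddress: BlockManagerId,
    recomputeAll: Boolean): Unit = {
  if (recomputeAll) {
    // Removes every registered map output and bumps the epoch, so executors
    // refetch shuffle locations and all map partitions are recomputed.
    tracker.unregisterAllMapOutput(shuffleId)
  } else {
    // Only the single failed map output is recomputed on retry.
    tracker.unregisterMapOutput(shuffleId, mapId, bmAddress)
  }
}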

core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 12 additions & 0 deletions
@@ -559,4 +559,16 @@ package object config {
     .intConf
     .checkValue(v => v > 0, "The value should be a positive integer.")
     .createWithDefault(2000)
+
+  private[spark] val RECOMPUTE_ALL_PARTITIONS_ON_REPARTITION_FAILURE =
+    ConfigBuilder("spark.shuffle.recomputeAllPartitionsOnRepartitionFailure")
+      .internal()
+      .doc("When performing a repartition on an RDD, there may be a data correctness issue " +
+        "if only a subset of the partitions is recomputed on fetch failure and the input " +
+        "data sequence is not deterministic. Turn on this config to always recompute all " +
+        "the partitions before the repartition shuffle on fetch failure, so that we always " +
+        "get a correct result. Note that turning on this config may increase the risk of " +
+        "job failure due to reaching the max consecutive stage failure limit.")
+      .booleanConf
+      .createWithDefault(true)
 }
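
As a usage sketch, the flag can be read from a plain SparkConf on the driver; inside Spark the typed entry RECOMPUTE_ALL_PARTITIONS_ON_REPARTITION_FAILURE serves the same purpose. The opt-out shown here is an illustrative assumption, not something the commit does.

import org.apache.spark.SparkConf

val conf = new SparkConf()
// Defaults to true; jobs that prefer cheaper partial recovery over the
// round-robin correctness guarantee can opt out explicitly.
conf.set("spark.shuffle.recomputeAllPartitionsOnRepartitionFailure", "false")
val recomputeAll =
  conf.getBoolean("spark.shuffle.recomputeAllPartitionsOnRepartitionFailure", true)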

core/src/main/scala/org/apache/spark/rdd/MapPartitionsRDD.scala

Lines changed: 4 additions & 1 deletion
@@ -27,7 +27,8 @@ import org.apache.spark.{Partition, TaskContext}
 private[spark] class MapPartitionsRDD[U: ClassTag, T: ClassTag](
     var prev: RDD[T],
     f: (TaskContext, Int, Iterator[T]) => Iterator[U],  // (TaskContext, partition index, iterator)
-    preservesPartitioning: Boolean = false)
+    preservesPartitioning: Boolean = false,
+    recomputeOnFailure: Boolean = false)
   extends RDD[U](prev) {
 
   override val partitioner = if (preservesPartitioning) firstParent[T].partitioner else None

@@ -41,4 +42,6 @@ private[spark] class MapPartitionsRDD[U: ClassTag, T: ClassTag](
     super.clearDependencies()
     prev = null
   }
+
+  override def recomputeAllPartitionsOnFailure(): Boolean = recomputeOnFailure
 }

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 28 additions & 4 deletions
@@ -452,6 +452,9 @@ abstract class RDD[T: ClassTag](
       /** Distributes elements evenly across output partitions, starting from a random partition. */
       val distributePartition = (index: Int, items: Iterator[T]) => {
         var position = new Random(hashing.byteswap32(index)).nextInt(numPartitions)
+        // TODO: Insert a local sort before the shuffle to make the input data sequence
+        // deterministic, so that the config
+        // "spark.shuffle.recomputeAllPartitionsOnRepartitionFailure" can be disabled.
         items.map { t =>
           // Note that the hash code of the key will just be the key itself. The HashPartitioner
           // will mod it with the number of total partitions.

@@ -461,9 +464,12 @@ abstract class RDD[T: ClassTag](
       } : Iterator[(Int, T)]
 
       // include a shuffle step so that our upstream tasks are still distributed
+      val recomputeOnFailure =
+        conf.getBoolean("spark.shuffle.recomputeAllPartitionsOnRepartitionFailure", true)
       new CoalescedRDD(
-        new ShuffledRDD[Int, T, T](mapPartitionsWithIndex(distributePartition),
-          new HashPartitioner(numPartitions)),
+        new ShuffledRDD[Int, T, T](
+          mapPartitionsWithIndex(distributePartition, recomputeOnFailure),
+          new HashPartitioner(numPartitions)),
         numPartitions,
         partitionCoalescer).values
     } else {
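
To see why the round-robin distribution is order-sensitive, here is a self-contained sketch; plain Scala, no Spark. roundRobin is a made-up stand-in for distributePartition that applies the modulo directly instead of delegating to HashPartitioner. The partition a record lands in depends only on its position in the iterator, so a recomputed map task that emits the same records in a different order sends individual records to different reducers.

import scala.util.Random
import scala.util.hashing

// Stand-in for distributePartition: assign each record the next partition in
// round-robin order, starting from a position seeded by the partition index.
def roundRobin[T](index: Int, items: Iterator[T], numPartitions: Int): Iterator[(Int, T)] = {
  var position = new Random(hashing.byteswap32(index)).nextInt(numPartitions)
  items.map { t =>
    position += 1
    (position % numPartitions, t)
  }
}

// The same multiset of records in two different orders: "a" and "b" swap
// assigned partitions while "c" keeps its slot. After a partial recompute,
// this kind of drift duplicates some records downstream and loses others.
val first  = roundRobin(0, Iterator("a", "b", "c"), numPartitions = 2).toList
val second = roundRobin(0, Iterator("b", "a", "c"), numPartitions = 2).toList
println(first)  // the records with their assigned partitions
println(second) // "a" and "b" have traded partitions relative to `first`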
@@ -837,15 +843,21 @@ abstract class RDD[T: ClassTag](
    *
    * `preservesPartitioning` indicates whether the input function preserves the partitioner, which
    * should be `false` unless this is a pair RDD and the input function doesn't modify the keys.
+   *
+   * `recomputeOnFailure` indicates whether to recompute all the partitions on failure
+   * recovery, which should be `false` unless the output is repartitioned and is either
+   * not sorted or not sortable.
    */
   def mapPartitionsWithIndex[U: ClassTag](
       f: (Int, Iterator[T]) => Iterator[U],
-      preservesPartitioning: Boolean = false): RDD[U] = withScope {
+      preservesPartitioning: Boolean = false,
+      recomputeOnFailure: Boolean = false): RDD[U] = withScope {
     val cleanedF = sc.clean(f)
     new MapPartitionsRDD(
       this,
       (context: TaskContext, index: Int, iter: Iterator[T]) => cleanedF(index, iter),
-      preservesPartitioning)
+      preservesPartitioning,
+      recomputeOnFailure)
   }
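
A hedged usage sketch of the widened signature. The local-mode boilerplate and the tagging function are illustrative assumptions; only the parameter names come from the diff above.

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("sketch"))
val rdd = sc.parallelize(1 to 100, 4)

// A transformation whose output order is position-dependent: opt in to full
// recomputation so a downstream fetch failure never sees a reshuffled input.
val tagged = rdd.mapPartitionsWithIndex(
  (index, iter) => iter.map(x => (index, x)),
  preservesPartitioning = false,
  recomputeOnFailure = true)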
@@ -1839,6 +1851,18 @@ abstract class RDD[T: ClassTag](
   def toJavaRDD() : JavaRDD[T] = {
     new JavaRDD(this)(elementClassTag)
   }
+
+  /**
+   * Whether the RDD is required to recompute all partitions on failure. Repartition of an RDD
+   * proceeds in a round-robin manner, so there may be a data correctness issue if only a
+   * subset of the partitions is recomputed on failure and the input data sequence is not
+   * deterministic. Please refer to SPARK-23207 and SPARK-23243 for related discussion.
+   *
+   * All partitions must be recomputed on failure if a repartition operation is called on this
+   * RDD and the result sequence of this RDD is not deterministic (or the data type of the
+   * output of this RDD is not sortable).
+   */
+  private[spark] def recomputeAllPartitionsOnFailure(): Boolean = false
 }

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 9 additions & 5 deletions
@@ -1323,17 +1323,17 @@ class DAGScheduler(
       }
 
     case FetchFailed(bmAddress, shuffleId, mapId, _, failureMessage) =>
-      val failedStage = stageIdToStage(task.stageId)
+      val failedStage = stage
       val mapStage = shuffleIdToMapStage(shuffleId)
 
       if (failedStage.latestInfo.attemptNumber != task.stageAttemptId) {
         logInfo(s"Ignoring fetch failure from $task as it's from $failedStage attempt" +
           s" ${task.stageAttemptId} and there is a more recent attempt for that stage " +
           s"(attempt ${failedStage.latestInfo.attemptNumber}) running")
       } else {
-        failedStage.fetchFailedAttemptIds.add(task.stageAttemptId)
+        failedStage.failedAttemptIds.add(task.stageAttemptId)
         val shouldAbortStage =
-          failedStage.fetchFailedAttemptIds.size >= maxConsecutiveStageAttempts ||
+          failedStage.failedAttemptIds.size >= maxConsecutiveStageAttempts ||
             disallowStageRetryForTest
 
         // It is likely that we receive multiple FetchFailed for a single stage (because we have

@@ -1386,8 +1386,12 @@ class DAGScheduler(
           )
         }
       }
-      // Mark the map whose fetch failed as broken in the map stage
-      if (mapId != -1) {
+
+      if (mapStage.rdd.recomputeAllPartitionsOnFailure()) {
+        // Mark every map output as broken in the map stage, to ensure that all the
+        // partitions are recomputed on the resubmitted stage attempt.
+        mapOutputTracker.unregisterAllMapOutput(shuffleId)
+      } else if (mapId != -1) {
         mapOutputTracker.unregisterMapOutput(shuffleId, mapId, bmAddress)
       }
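
Why clearing every output forces a full rerun: a resubmitted map stage re-executes exactly the partitions that have no registered output. A toy model of that bookkeeping, purely illustrative; none of these names are Spark APIs.

import scala.collection.mutable

// Toy model of the tracker bookkeeping, not Spark code.
class ToyTracker(numMaps: Int) {
  private val outputs = mutable.Map[Int, String]() // mapId -> executor host

  def register(mapId: Int, host: String): Unit = outputs(mapId) = host
  def unregisterMapOutput(mapId: Int): Unit = outputs -= mapId // partial recovery
  def unregisterAllMapOutput(): Unit = outputs.clear()         // full recovery

  // A resubmitted map stage reruns exactly the partitions with no output.
  def missingPartitions: Seq[Int] = (0 until numMaps).filterNot(outputs.contains)
}

val t = new ToyTracker(numMaps = 3)
(0 until 3).foreach(i => t.register(i, s"exec-$i"))
t.unregisterMapOutput(1)   // missingPartitions == Seq(1): rerun only partition 1
t.unregisterAllMapOutput() // missingPartitions == Seq(0, 1, 2): rerun everything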

core/src/main/scala/org/apache/spark/scheduler/Stage.scala

Lines changed: 4 additions & 4 deletions
@@ -82,15 +82,15 @@ private[scheduler] abstract class Stage(
   private var _latestInfo: StageInfo = StageInfo.fromStage(this, nextAttemptId)
 
   /**
-   * Set of stage attempt IDs that have failed with a FetchFailure. We keep track of these
-   * failures in order to avoid endless retries if a stage keeps failing with a FetchFailure.
+   * Set of stage attempt IDs that have failed. We keep track of these failures in order to avoid
+   * endless retries if a stage keeps failing.
    * We keep track of each attempt ID that has failed to avoid recording duplicate failures if
    * multiple tasks from the same stage attempt fail (SPARK-5945).
    */
-  val fetchFailedAttemptIds = new HashSet[Int]
+  val failedAttemptIds = new HashSet[Int]
 
   private[scheduler] def clearFailures() : Unit = {
-    fetchFailedAttemptIds.clear()
+    failedAttemptIds.clear()
   }
 
   /** Creates a new attempt for this stage by creating a new StageInfo with a new attempt ID. */
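
The rename matters because the abort bookkeeping now counts any failed attempt toward the retry limit, not only fetch failures. A condensed sketch of the accounting: maxConsecutiveStageAttempts mirrors spark.stage.maxConsecutiveAttempts, whose default in Spark is 4; the function itself is illustrative.

import scala.collection.mutable.HashSet

val failedAttemptIds = new HashSet[Int]
val maxConsecutiveStageAttempts = 4 // default of spark.stage.maxConsecutiveAttempts

// Returns true when the stage (and hence the job) should be aborted. Adding
// the same attempt ID twice is a no-op, so duplicate task failures from one
// attempt are not double-counted (SPARK-5945).
def recordFailure(stageAttemptId: Int): Boolean = {
  failedAttemptIds.add(stageAttemptId)
  failedAttemptIds.size >= maxConsecutiveStageAttempts
}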
