
Commit 124568b

address comments
1 parent c74320d commit 124568b

File tree

23 files changed: +86 additions, −53 deletions


core/src/main/scala/org/apache/spark/Accumulable.scala

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ class Accumulable[R, T] private (
 
   def this(initialValue: R, param: AccumulableParam[R, T]) = this(initialValue, param, None)
 
+  val zero = param.zero(initialValue)
   private[spark] val newAcc = new LegacyAccumulatorWrapper(initialValue, param)
   newAcc.metadata = AccumulatorMetadata(id, name, countFailedValues)
   // Register the new accumulator in ctor, to follow the previous behaviour.
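For context on the restored field: `zero` is produced by the legacy `AccumulableParam.zero(initialValue)` hook. A minimal sketch of such a param, with a made-up `StringSetParam` that is not part of this change:

import org.apache.spark.AccumulableParam

// Illustrative only: `zero` reduces whatever initial value the user supplied
// to the identity element, which is what `val zero = param.zero(initialValue)`
// ends up holding (here, the empty set).
object StringSetParam extends AccumulableParam[Set[String], String] {
  def addAccumulator(acc: Set[String], elem: String): Set[String] = acc + elem
  def addInPlace(acc1: Set[String], acc2: Set[String]): Set[String] = acc1 ++ acc2
  def zero(initialValue: Set[String]): Set[String] = Set.empty[String]
}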

core/src/main/scala/org/apache/spark/NewAccumulator.scala

Lines changed: 50 additions & 15 deletions
@@ -34,18 +34,7 @@ private[spark] case class AccumulatorMetadata(
 
 /**
  * The base class for accumulators, that can accumulate inputs of type `IN`, and produce output of
- * type `OUT`. Implementations must define following methods:
- *  - isZero: tell if this accumulator is zero value or not. e.g. for a counter accumulator,
- *    0 is zero value; for a list accumulator, Nil is zero value.
- *  - copyAndReset: create a new copy of this accumulator, which is zero value. i.e. call `isZero`
- *    on the copy must return true.
- *  - add: defines how to accumulate the inputs. e.g. it can be a simple `+=` for counter
- *    accumulator
- *  - merge: defines how to merge another accumulator of same type.
- *  - localValue: defines how to produce the output by the current state of this accumulator.
- *
- * The implementations decide how to store intermediate values, e.g. a long field for a counter
- * accumulator, a double and a long field for a average accumulator(storing the sum and count).
+ * type `OUT`.
  */
 abstract class NewAccumulator[IN, OUT] extends Serializable {
   private[spark] var metadata: AccumulatorMetadata = _
@@ -63,6 +52,10 @@ abstract class NewAccumulator[IN, OUT] extends Serializable {
     sc.cleaner.foreach(_.registerAccumulatorForCleanup(this))
   }
 
+  /**
+   * Returns true if this accumulator has been registered. Note that all accumulators must be
+   * registered before use, or an exception will be thrown.
+   */
   final def isRegistered: Boolean =
     metadata != null && AccumulatorContext.originals.containsKey(metadata.id)
@@ -72,38 +65,69 @@ abstract class NewAccumulator[IN, OUT] extends Serializable {
     }
   }
 
+  /**
+   * Returns the id of this accumulator; can only be called after registration.
+   */
   final def id: Long = {
     assertMetadataNotNull()
     metadata.id
   }
 
+  /**
+   * Returns the name of this accumulator; can only be called after registration.
+   */
   final def name: Option[String] = {
     assertMetadataNotNull()
     metadata.name
   }
 
-  final def countFailedValues: Boolean = {
+  /**
+   * Whether to accumulate values from failed tasks. This is set to true for system and time
+   * metrics like serialization time or bytes spilled, and false for things with absolute values
+   * like number of input rows. This should be used for internal metrics only.
+   */
+  private[spark] final def countFailedValues: Boolean = {
     assertMetadataNotNull()
     metadata.countFailedValues
   }
 
+  /**
+   * Creates an [[AccumulableInfo]] representation of this [[NewAccumulator]] with the provided
+   * values.
+   */
   private[spark] def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
     val isInternal = name.exists(_.startsWith(InternalAccumulator.METRICS_PREFIX))
     new AccumulableInfo(id, name, update, value, isInternal, countFailedValues)
   }
 
   final private[spark] def isAtDriverSide: Boolean = atDriverSide
 
+  /**
+   * Tells if this accumulator is zero value or not, e.g. for a counter accumulator, 0 is the
+   * zero value; for a list accumulator, Nil is the zero value.
+   */
   def isZero(): Boolean
 
+  /**
+   * Creates a new copy of this accumulator, which is zero value, i.e. calling `isZero` on the
+   * copy must return true.
+   */
   def copyAndReset(): NewAccumulator[IN, OUT]
 
+  /**
+   * Takes the inputs and accumulates, e.g. it can be a simple `+=` for a counter accumulator.
+   */
   def add(v: IN): Unit
 
-  def +=(v: IN): Unit = add(v)
-
+  /**
+   * Merges another same-type accumulator into this one and updates its state, i.e. this should
+   * be merge-in-place.
+   */
   def merge(other: NewAccumulator[IN, OUT]): Unit
 
+  /**
+   * Accesses this accumulator's current value; only allowed on the driver.
+   */
   final def value: OUT = {
     if (atDriverSide) {
       localValue
@@ -112,6 +136,12 @@ abstract class NewAccumulator[IN, OUT] extends Serializable {
     }
   }
 
+  /**
+   * Defines the current value of this accumulator.
+   *
+   * This is NOT the global value of the accumulator. To get the global value after a
+   * completed operation on the dataset, call `value`.
+   */
   def localValue: OUT
 
   // Called by Java when serializing an object
@@ -328,6 +358,11 @@ class ListAccumulator[T] extends NewAccumulator[T, java.util.List[T]] {
   }
 
   override def localValue: java.util.List[T] = java.util.Collections.unmodifiableList(_list)
+
+  private[spark] def setValue(newValue: java.util.List[T]): Unit = {
+    _list.clear()
+    _list.addAll(newValue)
+  }
 }
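The per-method docs above now carry the contract that used to live in the class comment. A minimal sketch of an implementation written against that contract; the `MaxAccumulator` name and max-tracking semantics are illustrative only, not part of this commit:

// Sketch only: tracks the maximum Long seen. `copyAndReset` must return a
// copy for which `isZero` holds, and `merge` must fold the other accumulator
// into this one in place.
class MaxAccumulator extends NewAccumulator[Long, Long] {
  private var _max: Long = Long.MinValue

  override def isZero(): Boolean = _max == Long.MinValue
  override def copyAndReset(): MaxAccumulator = new MaxAccumulator
  override def add(v: Long): Unit = { _max = math.max(_max, v) }
  override def merge(other: NewAccumulator[Long, Long]): Unit = other match {
    case o: MaxAccumulator => _max = math.max(_max, o._max)
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${getClass.getName} with ${other.getClass.getName}")
  }
  override def localValue: Long = _max
}

Note that the convenience `+=` alias is removed in this file, so callers (see the test updates below) now go through `add` directly.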

core/src/test/scala/org/apache/spark/InternalAccumulatorSuite.scala

Lines changed: 5 additions & 5 deletions
@@ -51,7 +51,7 @@ class InternalAccumulatorSuite extends SparkFunSuite with LocalSparkContext {
     sc.addSparkListener(listener)
     // Have each task add 1 to the internal accumulator
     val rdd = sc.parallelize(1 to 100, numPartitions).mapPartitions { iter =>
-      TaskContext.get().taskMetrics().testAccum.get += 1L
+      TaskContext.get().taskMetrics().testAccum.get.add(1)
       iter
     }
     // Register asserts in job completion callback to avoid flakiness
@@ -87,17 +87,17 @@ class InternalAccumulatorSuite extends SparkFunSuite with LocalSparkContext {
     val rdd = sc.parallelize(1 to 100, numPartitions)
       .map { i => (i, i) }
       .mapPartitions { iter =>
-        TaskContext.get().taskMetrics().testAccum.get += 1L
+        TaskContext.get().taskMetrics().testAccum.get.add(1)
         iter
       }
       .reduceByKey { case (x, y) => x + y }
       .mapPartitions { iter =>
-        TaskContext.get().taskMetrics().testAccum.get += 10L
+        TaskContext.get().taskMetrics().testAccum.get.add(10)
         iter
       }
       .repartition(numPartitions * 2)
       .mapPartitions { iter =>
-        TaskContext.get().taskMetrics().testAccum.get += 100L
+        TaskContext.get().taskMetrics().testAccum.get.add(100)
         iter
       }
     // Register asserts in job completion callback to avoid flakiness
@@ -127,7 +127,7 @@ class InternalAccumulatorSuite extends SparkFunSuite with LocalSparkContext {
     // This should retry both stages in the scheduler. Note that we only want to fail the
     // first stage attempt because we want the stage to eventually succeed.
     val x = sc.parallelize(1 to 100, numPartitions)
-      .mapPartitions { iter => TaskContext.get().taskMetrics().testAccum.get += 1L; iter }
+      .mapPartitions { iter => TaskContext.get().taskMetrics().testAccum.get.add(1); iter }
       .groupBy(identity)
     val sid = x.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]].shuffleHandle.shuffleId
     val rdd = x.mapPartitionsWithIndex { case (i, iter) =>
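The suite drives this contract through real tasks; the round trip it relies on can be sketched without a cluster (REPL-style, reusing the hypothetical `MaxAccumulator` from the earlier sketch): the driver hands each task a zero-valued copy via `copyAndReset`, tasks call `add`, and the driver merges the copies back in.

// Rough simulation of the accumulate-then-merge round trip (no SparkContext):
val driverAcc = new MaxAccumulator
val taskCopies = Seq.fill(3)(driverAcc.copyAndReset())              // one zero-valued copy per task
taskCopies.zipWithIndex.foreach { case (acc, i) => acc.add(i * 10L) } // tasks call add()
taskCopies.foreach(driverAcc.merge)                                  // driver folds results back in
assert(driverAcc.localValue == 20L)                                  // max of 0, 10, 20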

core/src/test/scala/org/apache/spark/executor/TaskMetricsSuite.scala

Lines changed: 3 additions & 5 deletions
@@ -20,13 +20,11 @@ package org.apache.spark.executor
 import org.scalatest.Assertions
 
 import org.apache.spark._
-import org.apache.spark.scheduler.AccumulableInfo
-import org.apache.spark.storage.{BlockId, BlockStatus, StorageLevel, TestBlockId}
+import org.apache.spark.storage.{BlockStatus, StorageLevel, TestBlockId}
 
 
 class TaskMetricsSuite extends SparkFunSuite {
   import StorageLevel._
-  import TaskMetricsSuite._
 
   test("mutating values") {
     val tm = new TaskMetrics
@@ -202,8 +200,8 @@
     tm.registerAccumulator(acc2)
     tm.registerAccumulator(acc3)
     tm.registerAccumulator(acc4)
-    acc1 += 1L
-    acc2 += 2L
+    acc1.add(1)
+    acc2.add(2)
     val newUpdates = tm.accumulators()
       .map(a => (a.id, a.asInstanceOf[NewAccumulator[Any, Any]])).toMap
     assert(newUpdates.contains(acc1.id))

project/MimaExcludes.scala

Lines changed: 0 additions & 1 deletion
@@ -676,7 +676,6 @@ object MimaExcludes {
       ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.util.collection.Spillable")
     ) ++ Seq(
       // SPARK-14654: New accumulator API
-      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.Accumulable.zero"),
       ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ExceptionFailure$"),
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ExceptionFailure.apply"),
      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ExceptionFailure.metrics"),

sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala

Lines changed: 3 additions & 3 deletions
@@ -106,7 +106,7 @@ private[sql] case class RDDScanExec(
     override val nodeName: String) extends LeafExecNode {
 
   private[sql] override lazy val metrics = Map(
-    "numOutputRows" -> SQLMetrics.createSumMetric(sparkContext, "number of output rows"))
+    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))
 
   protected override def doExecute(): RDD[InternalRow] = {
     val numOutputRows = longMetric("numOutputRows")
@@ -147,7 +147,7 @@ private[sql] case class RowDataSourceScanExec(
   extends DataSourceScanExec with CodegenSupport {
 
   private[sql] override lazy val metrics =
-    Map("numOutputRows" -> SQLMetrics.createSumMetric(sparkContext, "number of output rows"))
+    Map("numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))
 
   val outputUnsafeRows = relation match {
     case r: HadoopFsRelation if r.fileFormat.isInstanceOf[ParquetSource] =>
@@ -216,7 +216,7 @@ private[sql] case class BatchedDataSourceScanExec(
   extends DataSourceScanExec with CodegenSupport {
 
   private[sql] override lazy val metrics =
-    Map("numOutputRows" -> SQLMetrics.createSumMetric(sparkContext, "number of output rows"),
+    Map("numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
       "scanTime" -> SQLMetrics.createTimingMetric(sparkContext, "scan time"))
 
   protected override def doExecute(): RDD[InternalRow] = {
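Across this and the next four files the change is the same mechanical rename: `SQLMetrics.createSumMetric` becomes `SQLMetrics.createMetric`, while the declare-then-`longMetric` pattern in the operators is untouched. As a reminder of that pattern, here is a sketch assembled from the lines above; it is a fragment of an operator body rather than a standalone program, and it assumes the returned `SQLMetric` exposes the same `add` method as the accumulators earlier in this commit:

  // Fragment (inside a physical operator such as RDDScanExec); `rdd` is the
  // operator's input RDD[InternalRow]. Simplified: the real doExecute goes
  // through Spark's internal mapPartitions variants.
  private[sql] override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")  // look up the declared SQLMetric
    rdd.mapPartitions { iter =>
      iter.map { row =>
        numOutputRows.add(1)  // assumed: SQLMetric.add, per the new accumulator API
        row
      }
    }
  }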

sql/core/src/main/scala/org/apache/spark/sql/execution/ExpandExec.scala

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ case class ExpandExec(
   extends UnaryExecNode with CodegenSupport {
 
   private[sql] override lazy val metrics = Map(
-    "numOutputRows" -> SQLMetrics.createSumMetric(sparkContext, "number of output rows"))
+    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))
 
   // The GroupExpressions can output data with arbitrary partitioning, so set it
   // as UNKNOWN partitioning

sql/core/src/main/scala/org/apache/spark/sql/execution/GenerateExec.scala

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ case class GenerateExec(
   extends UnaryExecNode {
 
   private[sql] override lazy val metrics = Map(
-    "numOutputRows" -> SQLMetrics.createSumMetric(sparkContext, "number of output rows"))
+    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))
 
   override def producedAttributes: AttributeSet = AttributeSet(output)

sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ private[sql] case class LocalTableScanExec(
     rows: Seq[InternalRow]) extends LeafExecNode {
 
   private[sql] override lazy val metrics = Map(
-    "numOutputRows" -> SQLMetrics.createSumMetric(sparkContext, "number of output rows"))
+    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))
 
   private val unsafeRows: Array[InternalRow] = {
     val proj = UnsafeProjection.create(output, output)

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregateExec.scala

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ case class SortBasedAggregateExec(
     AttributeSet(aggregateBufferAttributes)
 
   override private[sql] lazy val metrics = Map(
-    "numOutputRows" -> SQLMetrics.createSumMetric(sparkContext, "number of output rows"))
+    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))
 
   override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute)
