Commit 751ded0

Author: Salil Surendran (authored and committed)
[SPARK-18120][SQL] Call QueryExecutionListener callback methods for DataFrameWriter methods
QueryExecutionListener has two methods, onSuccess() and onFailure(), each of which takes a QueryExecution object as a parameter and is called when a query is executed. They are invoked for several Dataset methods such as take, head, first, and collect, but not for any of the DataFrameWriter methods such as saveAsTable and save. This commit fixes that by calling these two methods from the DataFrameWriter output methods. It also adds a new property, "spark.sql.queryExecutionListeners", which can be used to specify QueryExecutionListener instances that should be attached to the SparkSession when the Spark application starts. Testing was done with unit tests.
1 parent 640f942 commit 751ded0
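
For context, a listener along these lines would now also be notified for DataFrameWriter output methods. This is a minimal sketch, not part of the commit: the class name and println reporting are illustrative, while the four-argument callback signatures and OutputParams come from this change. A zero-argument constructor is required if the class is registered through the new property.

    import org.apache.spark.sql.execution.QueryExecution
    import org.apache.spark.sql.util.{OutputParams, QueryExecutionListener}

    // Hypothetical listener; only the callback signatures are defined by this commit.
    class WriteAuditListener extends QueryExecutionListener {
      override def onSuccess(
          funcName: String,
          qe: QueryExecution,
          durationNs: Long,
          outputParams: Option[OutputParams]): Unit = {
        // outputParams is populated for DataFrameWriter methods and None for read-style actions.
        val dest = outputParams.flatMap(_.destination).getOrElse("n/a")
        println(s"$funcName succeeded in ${durationNs / 1e6} ms (destination: $dest)")
      }

      override def onFailure(
          funcName: String,
          qe: QueryExecution,
          exception: Exception,
          outputParams: Option[OutputParams]): Unit = {
        println(s"$funcName failed: ${exception.getMessage}")
      }
    }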

File tree

8 files changed (+423, -30 lines)


docs/sql-programming-guide.md

Lines changed: 13 additions & 2 deletions
@@ -1256,8 +1256,9 @@ Configuration of in-memory caching can be done using the `setConf` method on `Sp
 
 ## Other Configuration Options
 
-The following options can also be used to tune the performance of query execution. It is possible
-that these options will be deprecated in future release as more optimizations are performed automatically.
+The following options can also be used to tune the performance of query execution and to attach
+query execution listeners. It is possible that these options will be deprecated in future release as
+more optimizations are performed automatically.
 
 <table class="table">
 <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
@@ -1304,6 +1305,16 @@ that these options will be deprecated in future release as more optimizations ar
     Configures the number of partitions to use when shuffling data for joins or aggregations.
   </td>
 </tr>
+<tr>
+  <td><code>spark.sql.queryExecutionListeners</code></td>
+  <td></td>
+  <td>
+    A comma-separated list of classes that implement QueryExecutionListener. When creating a
+    SparkSession, instances of these listeners will be added to it. These classes need to have a
+    zero-argument constructor. If the specified class can't be found, or the specified class doesn't
+    have a valid constructor, the SparkSession creation will fail with an exception.
+  </td>
+</tr>
 </table>
 
 # Distributed SQL Engine
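
For reference, the new property would be set like any other Spark configuration before the session is created. A hedged sketch follows; the listener class name is a placeholder, and the class must have a zero-argument constructor.

    import org.apache.spark.SparkConf
    import org.apache.spark.sql.SparkSession

    // Placeholder listener class; it must implement QueryExecutionListener and provide a
    // zero-argument constructor, otherwise SparkSession creation fails with an exception.
    val conf = new SparkConf()
      .setAppName("query-listener-demo")
      .set("spark.sql.queryExecutionListeners", "com.example.WriteAuditListener")

    val spark = SparkSession.builder().config(conf).getOrCreate()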

project/MimaExcludes.scala

Lines changed: 7 additions & 1 deletion
@@ -128,7 +128,13 @@ object MimaExcludes {
       ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryException.startOffset"),
       ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryException.endOffset"),
       ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryException.this"),
-      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryException.query")
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryException.query"),
+
+      // [SPARK-18120][SQL] Call QueryExecutionListener callback methods for DataFrameWriter methods
+      ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.util.QueryExecutionListener.onSuccess"),
+      ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.util.QueryExecutionListener.onFailure"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.util.QueryExecutionListener.onSuccess"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.util.QueryExecutionListener.onFailure")
     )
 }

sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala

Lines changed: 81 additions & 6 deletions
@@ -26,10 +26,12 @@ import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, UnresolvedRelation}
 import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogRelation, CatalogTable, CatalogTableType}
 import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
+import org.apache.spark.sql.execution.QueryExecution
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation}
 import org.apache.spark.sql.sources.BaseRelation
 import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.util.{OutputParams, QueryExecutionListener}
 
 /**
  * Interface used to write a [[Dataset]] to external storage systems (e.g. file systems,
@@ -189,6 +191,33 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
     this
   }
 
+  /**
+   * Executes the query and calls the {@link org.apache.spark.sql.util.QueryExecutionListener}
+   * methods.
+   *
+   * @param funcName an identifier for the method executing the query
+   * @param qe the [[QueryExecution]] object associated with the query
+   * @param outputParams the output parameters useful for query analysis
+   * @param action the function that executes the query, after which the listener methods get
+   *               called
+   */
+  private def executeAndCallQEListener(
+      funcName: String,
+      qe: QueryExecution,
+      outputParams: OutputParams)(action: => Unit) = {
+    try {
+      val start = System.nanoTime()
+      action
+      val end = System.nanoTime()
+      df.sparkSession.listenerManager.onSuccess(funcName, qe, end - start, Some(outputParams))
+    } catch {
+      case e: Exception =>
+        df.sparkSession.listenerManager.onFailure(funcName, qe, e, Some(outputParams))
+        throw e
+    }
+  }
+
   /**
    * Saves the content of the `DataFrame` at the specified path.
    *
@@ -218,7 +247,17 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
       bucketSpec = getBucketSpec,
       options = extraOptions.toMap)
 
-    dataSource.write(mode, df)
+    val destination = source match {
+      case "jdbc" => extraOptions.get("dbtable")
+      case _ => extraOptions.get("path")
+    }
+
+    executeAndCallQEListener(
+      "save",
+      df.queryExecution,
+      OutputParams(source, destination, extraOptions.toMap)) {
+      dataSource.write(mode, df)
+    }
   }
 
   /**
@@ -244,6 +283,11 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
    *
    * Because it inserts data to an existing table, format or options will be ignored.
    *
+   * Calls the callback methods of [[QueryExecutionListener]] after query execution, with an
+   * [[OutputParams]] whose datasourceType is set to the string passed to the
+   * [[DataFrameWriter#format]] method and whose destination is set to the name of the table
+   * into which the data is being inserted.
+   *
    * @since 1.4.0
    */
   def insertInto(tableName: String): Unit = {
@@ -261,13 +305,19 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
       )
     }
 
-    df.sparkSession.sessionState.executePlan(
+    val qe = df.sparkSession.sessionState.executePlan(
       InsertIntoTable(
         table = UnresolvedRelation(tableIdent),
         partition = Map.empty[String, Option[String]],
         child = df.logicalPlan,
         overwrite = mode == SaveMode.Overwrite,
-        ifNotExists = false)).toRdd
+        ifNotExists = false))
+    executeAndCallQEListener(
+      "insertInto",
+      qe,
+      new OutputParams(source, Some(tableIdent.unquotedString), extraOptions.toMap)) {
+      qe.toRdd
+    }
   }
 
   private def normalizedParCols: Option[Seq[String]] = partitioningColumns.map { cols =>
@@ -324,7 +374,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
 
   private def assertNotPartitioned(operation: String): Unit = {
     if (partitioningColumns.isDefined) {
-      throw new AnalysisException( s"'$operation' does not support partitioning")
+      throw new AnalysisException(s"'$operation' does not support partitioning")
     }
   }
 
@@ -359,6 +409,10 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
    * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL
    * specific format.
    *
+   * Calls the callback methods of [[QueryExecutionListener]] after query execution, with an
+   * [[OutputParams]] object whose datasourceType is set to the string passed to
+   * [[DataFrameWriter#format]] and whose destination is set to the name of the table being
+   * written to.
    * @since 1.4.0
    */
   def saveAsTable(tableName: String): Unit = {
@@ -428,8 +482,14 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
       partitionColumnNames = partitioningColumns.getOrElse(Nil),
       bucketSpec = getBucketSpec
     )
-    df.sparkSession.sessionState.executePlan(
-      CreateTable(tableDesc, mode, Some(df.logicalPlan))).toRdd
+    val qe = df.sparkSession.sessionState.executePlan(
+      CreateTable(tableDesc, mode, Some(df.logicalPlan)))
+    executeAndCallQEListener(
+      "saveAsTable",
+      qe,
+      new OutputParams(source, Some(tableIdent.unquotedString), extraOptions.toMap)) {
+      qe.toRdd
+    }
   }
 
   /**
@@ -493,6 +553,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
    * indicates a timestamp format. Custom date formats follow the formats at
    * `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
    * </ul>
+   * Calls the callback methods of [[QueryExecutionListener]] after query execution, with an
+   * [[OutputParams]] whose datasourceType is set to the string constant "json" and whose
+   * destination is set to the path to which the data is written.
    *
    * @since 1.4.0
    */
@@ -514,6 +577,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
    * shorten names(none, `snappy`, `gzip`, and `lzo`). This will override
    * `spark.sql.parquet.compression.codec`.</li>
    * </ul>
+   * Calls the callback methods of [[QueryExecutionListener]] after query execution, with an
+   * [[OutputParams]] whose datasourceType is set to the string constant "parquet" and whose
+   * destination is set to the path to which the data is written.
    *
    * @since 1.4.0
    */
@@ -534,6 +600,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
    * one of the known case-insensitive shorten names(`none`, `snappy`, `zlib`, and `lzo`).
    * This will override `orc.compress`.</li>
    * </ul>
+   * Calls the callback methods of [[QueryExecutionListener]] after query execution, with an
+   * [[OutputParams]] whose datasourceType is set to the string constant "orc" and whose
+   * destination is set to the path to which the data is written.
    *
    * @since 1.5.0
    * @note Currently, this method can only be used after enabling Hive support
@@ -560,6 +629,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
    * one of the known case-insensitive shorten names (`none`, `bzip2`, `gzip`, `lz4`,
    * `snappy` and `deflate`). </li>
    * </ul>
+   * Calls the callback methods of [[QueryExecutionListener]] after query execution, with an
+   * [[OutputParams]] whose datasourceType is set to the string constant "text" and whose
+   * destination is set to the path to which the data is written.
    *
    * @since 1.6.0
    */
@@ -599,6 +671,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
    * indicates a timestamp format. Custom date formats follow the formats at
    * `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
    * </ul>
+   * Calls the callback methods of [[QueryExecutionListener]] after query execution, with an
+   * [[OutputParams]] whose datasourceType is set to the string constant "csv" and whose
+   * destination is set to the path to which the data is written.
    *
    * @since 2.0.0
    */
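
To illustrate the effect of the wrapper above, here is a rough usage sketch; the path and the listener class are illustrative and not part of the commit. The write below would now reach onSuccess with an OutputParams describing the destination.

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().appName("writer-listener-demo").getOrCreate()
    spark.listenerManager.register(new WriteAuditListener())  // listener sketched earlier

    val df = spark.range(0, 100).toDF("id")

    // save() now runs inside executeAndCallQEListener, so on success the listener receives
    // funcName = "save" and OutputParams("parquet", Some("/tmp/demo-output"), options).
    df.write.format("parquet").save("/tmp/demo-output")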

sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala

Lines changed: 11 additions & 2 deletions
@@ -40,12 +40,12 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Range}
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.execution.ui.SQLListener
-import org.apache.spark.sql.internal.{CatalogImpl, SessionState, SharedState}
+import org.apache.spark.sql.internal.{CatalogImpl, SessionState, SharedState, SQLConf}
 import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
 import org.apache.spark.sql.sources.BaseRelation
 import org.apache.spark.sql.streaming._
 import org.apache.spark.sql.types.{DataType, LongType, StructType}
-import org.apache.spark.sql.util.ExecutionListenerManager
+import org.apache.spark.sql.util.{ExecutionListenerManager, QueryExecutionListener}
 import org.apache.spark.util.Utils
 
 
@@ -876,6 +876,9 @@ object SparkSession {
         }
         session = new SparkSession(sparkContext)
         options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) }
+        for (qeListener <- createQueryExecutionListeners(session.sparkContext.getConf)) {
+          session.listenerManager.register(qeListener)
+        }
         defaultSession.set(session)
 
         // Register a successfully instantiated context to the singleton. This should be at the
@@ -893,6 +896,12 @@ object SparkSession {
       }
     }
 
+  private def createQueryExecutionListeners(conf: SparkConf): Seq[QueryExecutionListener] = {
+    conf.get(SQLConf.QUERY_EXECUTION_LISTENERS)
+      .map(Utils.classForName(_))
+      .map(_.newInstance().asInstanceOf[QueryExecutionListener])
+  }
+
   /**
    * Creates a [[SparkSession.Builder]] for constructing a [[SparkSession]].
    *

sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 7 additions & 0 deletions
@@ -655,6 +655,13 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
+  val QUERY_EXECUTION_LISTENERS =
+    ConfigBuilder("spark.sql.queryExecutionListeners")
+      .doc("QueryExecutionListeners to be attached to the SparkSession")
+      .stringConf
+      .toSequence
+      .createWithDefault(Nil)
+
   object Deprecated {
     val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
   }

sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala

Lines changed: 40 additions & 10 deletions
@@ -44,27 +44,49 @@ trait QueryExecutionListener {
    * @param qe the QueryExecution object that carries detail information like logical plan,
    *           physical plan, etc.
    * @param durationNs the execution time for this query in nanoseconds.
-   *
-   * @note This can be invoked by multiple different threads.
+   * @param outputParams the output parameters in case the method is invoked as a result of a
+   *                     write operation; in the case of a read it will be [[None]]
    */
   @DeveloperApi
-  def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit
-
+  def onSuccess(
+      funcName: String,
+      qe: QueryExecution,
+      durationNs: Long,
+      outputParams: Option[OutputParams]): Unit
   /**
    * A callback function that will be called when a query execution failed.
    *
    * @param funcName the name of the action that triggered this query.
    * @param qe the QueryExecution object that carries detail information like logical plan,
    *           physical plan, etc.
    * @param exception the exception that failed this query.
+   * @param outputParams the output parameters in case the method is invoked as a result of a
+   *                     write operation; in the case of a read it will be [[None]]
    *
    * @note This can be invoked by multiple different threads.
    */
   @DeveloperApi
-  def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit
+  def onFailure(
+      funcName: String,
+      qe: QueryExecution,
+      exception: Exception,
+      outputParams: Option[OutputParams]): Unit
 }
 
-
+/**
+ * Contains extra information useful for query analysis, passed on from the methods in
+ * [[org.apache.spark.sql.DataFrameWriter]] while writing to a datasource.
+ * @param datasourceType type of data source written to, like csv, parquet, json, hive, jdbc, etc.
+ * @param destination path or table name written to
+ * @param options the map containing the output options for the underlying datasource,
+ *                specified by using the [[org.apache.spark.sql.DataFrameWriter#option]] method
+ * @param writeParams will contain any extra information that the write method wants to provide
+ */
+case class OutputParams(
+    datasourceType: String,
+    destination: Option[String],
+    options: Map[String, String],
+    writeParams: Map[String, String] = Map.empty)
 /**
  * :: Experimental ::
  *
@@ -98,18 +120,26 @@ class ExecutionListenerManager private[sql] () extends Logging {
     listeners.clear()
   }
 
-  private[sql] def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = {
+  private[sql] def onSuccess(
+      funcName: String,
+      qe: QueryExecution,
+      duration: Long,
+      outputParams: Option[OutputParams] = None): Unit = {
     readLock {
       withErrorHandling { listener =>
-        listener.onSuccess(funcName, qe, duration)
+        listener.onSuccess(funcName, qe, duration, outputParams)
       }
     }
   }
 
-  private[sql] def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {
+  private[sql] def onFailure(
+      funcName: String,
+      qe: QueryExecution,
+      exception: Exception,
+      outputParams: Option[OutputParams] = None): Unit = {
     readLock {
       withErrorHandling { listener =>
-        listener.onFailure(funcName, qe, exception)
+        listener.onFailure(funcName, qe, exception, outputParams)
      }
    }
  }
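
As a rough illustration of how a listener might consume the new parameter, a small helper follows; it is hypothetical, but the field positions match the OutputParams case class above.

    import org.apache.spark.sql.util.OutputParams

    // Summarize where a query wrote its output, if it wrote anything at all.
    def describeOutput(outputParams: Option[OutputParams]): String = outputParams match {
      case Some(OutputParams("jdbc", Some(table), _, _)) => s"JDBC table $table"
      case Some(OutputParams(format, Some(path), _, _))  => s"$format output at $path"
      case Some(OutputParams(format, None, _, _))        => s"$format output, destination unknown"
      case None                                          => "read-only query"
    }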
