Skip to content

Commit 045865d

Browse files
committed
[SPARK-14994][SQL] Remove execution hive from HiveSessionState
1 parent 4450f61 commit 045865d

File tree

7 files changed

+44
-49
lines changed

7 files changed

+44
-49
lines changed

sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import org.apache.spark.sql.SQLContext
2222
import org.apache.spark.sql.catalyst.catalog.{ExternalCatalog, InMemoryCatalog}
2323
import org.apache.spark.sql.execution.CacheManager
2424
import org.apache.spark.sql.execution.ui.SQLListener
25+
import org.apache.spark.util.MutableURLClassLoader
2526

2627

2728
/**
@@ -44,4 +45,21 @@ private[sql] class SharedState(val sparkContext: SparkContext) {
4445
*/
4546
lazy val externalCatalog: ExternalCatalog = new InMemoryCatalog
4647

48+
/**
49+
 * A classloader used to load all user-added jars.
50+
*/
51+
val jarClassLoader = new NonClosableMutableURLClassLoader(
52+
org.apache.spark.util.Utils.getContextOrSparkClassLoader)
53+
54+
}
55+
56+
57+
/**
58+
* URL class loader that exposes the `addURL` and `getURLs` methods in URLClassLoader.
59+
* This class loader cannot be closed (its `close` method is a no-op).
60+
*/
61+
private[sql] class NonClosableMutableURLClassLoader(parent: ClassLoader)
62+
extends MutableURLClassLoader(Array.empty, parent) {
63+
64+
override def close(): Unit = {}
4765
}

sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,12 @@ object HiveThriftServer2 extends Logging {
5858
@DeveloperApi
5959
def startWithContext(sqlContext: SQLContext): Unit = {
6060
val server = new HiveThriftServer2(sqlContext)
61-
server.init(SparkSQLEnv.sqlContext.sharedState.asInstanceOf[HiveSharedState].executionHive.conf)
61+
62+
val executionHive = HiveUtils.newClientForExecution(
63+
sqlContext.sparkContext.conf,
64+
sqlContext.sessionState.newHadoopConf())
65+
66+
server.init(executionHive.conf)
6267
server.start()
6368
listener = new HiveThriftServer2Listener(server, sqlContext.conf)
6469
sqlContext.sparkContext.addSparkListener(listener)

sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ import org.apache.hive.service.cli.session.HiveSession
3535
import org.apache.spark.internal.Logging
3636
import org.apache.spark.sql.{DataFrame, Row => SparkRow, SQLContext}
3737
import org.apache.spark.sql.execution.command.SetCommand
38-
import org.apache.spark.sql.hive.{HiveSessionState, HiveUtils}
38+
import org.apache.spark.sql.hive.HiveUtils
3939
import org.apache.spark.sql.internal.SQLConf
4040
import org.apache.spark.sql.types._
4141
import org.apache.spark.util.{Utils => SparkUtils}
@@ -195,9 +195,8 @@ private[hive] class SparkExecuteStatementOperation(
195195
statementId = UUID.randomUUID().toString
196196
logInfo(s"Running query '$statement' with $statementId")
197197
setState(OperationState.RUNNING)
198-
val sessionState = sqlContext.sessionState.asInstanceOf[HiveSessionState]
199198
// Always use the latest class loader provided by executionHive's state.
200-
val executionHiveClassLoader = sessionState.executionHive.state.getConf.getClassLoader
199+
val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader
201200
Thread.currentThread().setContextClassLoader(executionHiveClassLoader)
202201

203202
HiveThriftServer2.listener.onStatementStart(

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,14 @@
1717

1818
package org.apache.spark.sql.hive
1919

20-
import java.util.regex.Pattern
21-
2220
import org.apache.hadoop.hive.conf.HiveConf
2321
import org.apache.hadoop.hive.conf.HiveConf.ConfVars
2422

2523
import org.apache.spark.sql._
2624
import org.apache.spark.sql.catalyst.analysis.Analyzer
2725
import org.apache.spark.sql.execution.SparkPlanner
2826
import org.apache.spark.sql.execution.datasources._
29-
import org.apache.spark.sql.hive.client.{HiveClient, HiveClientImpl}
27+
import org.apache.spark.sql.hive.client.HiveClient
3028
import org.apache.spark.sql.internal.SessionState
3129

3230

@@ -42,11 +40,6 @@ private[hive] class HiveSessionState(sparkSession: SparkSession)
4240
sparkSession.sharedState.asInstanceOf[HiveSharedState]
4341
}
4442

45-
/**
46-
* A Hive client used for execution.
47-
*/
48-
lazy val executionHive: HiveClientImpl = sharedState.executionHive.newSession()
49-
5043
/**
5144
* A Hive client used for interacting with the metastore.
5245
*/
@@ -61,9 +54,20 @@ private[hive] class HiveSessionState(sparkSession: SparkSession)
6154
* set in the SQLConf *as well as* in the HiveConf.
6255
*/
6356
lazy val hiveconf: HiveConf = {
64-
val c = executionHive.conf
65-
conf.setConf(c.getAllProperties)
66-
c
57+
val initialConf = new HiveConf(
58+
sparkSession.sparkContext.hadoopConfiguration,
59+
classOf[org.apache.hadoop.hive.ql.session.SessionState])
60+
61+
// HiveConf is a Hadoop Configuration, which has a field of classLoader and
62+
// the initial value will be the current thread's context class loader
63+
 // (i.e. the initial class loader here).
64+
 // We call initialConf.setClassLoader(sharedState.jarClassLoader) here to make
65+
// this action explicit.
66+
initialConf.setClassLoader(sparkSession.sharedState.jarClassLoader)
67+
sparkSession.sparkContext.conf.getAll.foreach { case (k, v) =>
68+
initialConf.set(k, v)
69+
}
70+
initialConf
6771
}
6872

6973
setDefaultOverrideConfs()
@@ -140,33 +144,20 @@ private[hive] class HiveSessionState(sparkSession: SparkSession)
140144

141145
override def setConf(key: String, value: String): Unit = {
142146
super.setConf(key, value)
143-
executionHive.runSqlHive(s"SET $key=$value")
144147
metadataHive.runSqlHive(s"SET $key=$value")
145148
hiveconf.set(key, value)
146149
}
147150

148151
override def addJar(path: String): Unit = {
149-
super.addJar(path)
150-
executionHive.addJar(path)
151152
metadataHive.addJar(path)
152-
Thread.currentThread().setContextClassLoader(executionHive.clientLoader.classLoader)
153+
super.addJar(path)
153154
}
154155

155156
/**
156157
* Execute a SQL statement by passing the query text directly to Hive.
157158
*/
158159
override def runNativeSql(sql: String): Seq[String] = {
159-
val command = sql.trim.toLowerCase
160-
val functionOrMacroDDLPattern = Pattern.compile(
161-
".*(create|drop)\\s+(temporary\\s+)?(function|macro).+", Pattern.DOTALL)
162-
if (functionOrMacroDDLPattern.matcher(command).matches()) {
163-
executionHive.runSqlHive(sql)
164-
} else if (command.startsWith("set")) {
165-
metadataHive.runSqlHive(sql)
166-
executionHive.runSqlHive(sql)
167-
} else {
168-
metadataHive.runSqlHive(sql)
169-
}
160+
metadataHive.runSqlHive(sql)
170161
}
171162

172163
/**

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,6 @@ private[hive] class HiveSharedState(override val sparkContext: SparkContext)
3131

3232
// TODO: just share the IsolatedClientLoader instead of the client instances themselves
3333

34-
/**
35-
* A Hive client used for execution.
36-
*/
37-
val executionHive: HiveClientImpl = {
38-
HiveUtils.newClientForExecution(sparkContext.conf, sparkContext.hadoopConfiguration)
39-
}
40-
4134
/**
4235
* A Hive client used to interact with the metastore.
4336
*/

sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import org.apache.spark.deploy.SparkSubmitUtils
3333
import org.apache.spark.internal.Logging
3434
import org.apache.spark.sql.catalyst.util.quietly
3535
import org.apache.spark.sql.hive.HiveUtils
36+
import org.apache.spark.sql.internal.NonClosableMutableURLClassLoader
3637
import org.apache.spark.util.{MutableURLClassLoader, Utils}
3738

3839
/** Factory for `IsolatedClientLoader` with specific versions of hive. */
@@ -278,14 +279,3 @@ private[hive] class IsolatedClientLoader(
278279
*/
279280
private[hive] var cachedHive: Any = null
280281
}
281-
282-
/**
283-
* URL class loader that exposes the `addURL` and `getURLs` methods in URLClassLoader.
284-
* This class loader cannot be closed (its `close` method is a no-op).
285-
*/
286-
private[sql] class NonClosableMutableURLClassLoader(
287-
parent: ClassLoader)
288-
extends MutableURLClassLoader(Array.empty, parent) {
289-
290-
override def close(): Unit = {}
291-
}

sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,6 @@ private[hive] class TestHiveSparkSession(
426426
sessionState.hiveconf.set("fs.default.name", new File(".").toURI.toString)
427427
// It is important that we RESET first as broken hooks that might have been set could break
428428
// other sql exec here.
429-
sessionState.executionHive.runSqlHive("RESET")
430429
sessionState.metadataHive.runSqlHive("RESET")
431430
// For some reason, RESET does not reset the following variables...
432431
// https://issues.apache.org/jira/browse/HIVE-9004

0 commit comments

Comments
 (0)